PyPI - quantcpp - Version diff - quantcpp-0.11.0.tar.gz → quantcpp-0.12.1.tar.gz - Mend

quantcpp 0.11.0 (tar.gz) → 0.12.1 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

{quantcpp-0.11.0/quantcpp.egg-info → quantcpp-0.12.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: quantcpp
-Version: 0.11.0
+Version: 0.12.1
 Summary: Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)
 Author-email: quantumaikr <noreply@quantumaikr.com>
 License: Apache-2.0

{quantcpp-0.11.0 → quantcpp-0.12.1}/pyproject.toml RENAMED Viewed

@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "quantcpp"
-version = "0.11.0"
+version = "0.12.1"
 description = "Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)"
 readme = "README.md"
 license = { text = "Apache-2.0" }

{quantcpp-0.11.0 → quantcpp-0.12.1}/quant.h RENAMED Viewed

@@ -202,8 +202,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
 // Section 1: Types and Specs (from tq_types.h, tq_spec.h)
 // ============================================================================
 /* Cross-language static assert: works in both C11 and C++11/17 */
 #ifdef __cplusplus
 #define TQ_STATIC_ASSERT(cond, msg) static_assert(cond, msg)
@@ -219,8 +217,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
 #define TQ_PI_2 1.5707963267948966f
 #endif
 /* ============================================================
  * Constants
  * ============================================================ */
@@ -398,8 +394,6 @@ typedef struct {
     int      enable_recompression;/* Tier 1 → Tier 2 re-compression   */
 } tq_progressive_config_t;
 /* TurboQuant KV cache block: RHT + Lloyd-Max codebook + QJL residual
  * 3-bit variant: 2-bit codebook (4 levels) + 1-bit QJL sign hash
  * Block covers TQ_BK elements (128).
@@ -469,12 +463,6 @@ TQ_CHECK_SIZE(block_tq_turbo_kv_4b, 8 + TQ_BK * 3 / 8 + TQ_BK / 8);
 TQ_CHECK_SIZE(block_tq_turbo_kv_1b, 8 + TQ_BK / 8);
 TQ_CHECK_SIZE(block_tq_turbo_kv_2b, 8 + TQ_BK / 8 + TQ_BK / 8);
 /* Format specification — version-aware, ONNX-inspired */
 #define TQ_SPEC_VERSION 1
@@ -500,18 +488,10 @@ typedef struct {
     uint8_t  flags;            /* TQ_FLAG_* bitmask                 */
 } tq_format_spec_t;
 // ============================================================================
 // Section 2: Engine Types (from tq_engine.h)
 // ============================================================================
 /* ============================================================
  * Model configuration
  * ============================================================ */
@@ -886,6 +866,7 @@ typedef struct {
     int n_threads;
     float rep_penalty;    /* repetition penalty (default: 1.1, 1.0 = disabled) */
     int rep_window;       /* how many recent tokens to penalize (default: 32) */
+    unsigned long long rng_seed; /* sampling seed (default: 42, 0 = use 42 for back-compat) */
     /* Callback for streaming output */
     void (*on_token)(const char* text, void* user_data);
     void* user_data;
@@ -1123,9 +1104,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
 /* Max threads supported by thread pool */
 #define TQ_TP_MAX 16
 // ============================================================================
 // Section 3: GGUF Types (from tq_gguf.h)
 // ============================================================================
@@ -1143,10 +1121,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
  * directly into TurboQuant inference engine.
  */
 /* ============================================================
  * GGUF format constants
  * ============================================================ */
@@ -1462,14 +1436,10 @@ int tq_metal_moe_forward(
     const int*      up_types,       /* per-expert up quant types, NULL = use weight_type */
     const int*      down_types);    /* per-expert down quant types, NULL = use weight_type */
 // ============================================================================
 // Section 4: Internal API (from turboquant.h)
 // ============================================================================
 /**
  * TurboQuant.cpp — Cross-platform KV cache compression library
  *
@@ -1477,9 +1447,6 @@ int tq_metal_moe_forward(
  * Zero external dependencies (libc/libm only).
  */
 /* ============================================================
  * Version
  * ============================================================ */
@@ -1753,15 +1720,10 @@ void      tq_progressive_free(tq_progressive_t* p);
 tq_progressive_config_t tq_progressive_default_config(void);
 // ============================================================================
 // Section 5: quant_ctx struct definition
 // ============================================================================
 struct quant_ctx {
     tq_model_t* model;
     tq_state_t* state;
@@ -1788,7 +1750,6 @@ struct quant_ctx {
  * - Random signs decorrelate channels across different blocks
  */
 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif
@@ -1902,7 +1863,6 @@ void tq_rht_inverse(float* data, int n, uint32_t seed) {
  */
 /* Generic reference — no compiler-specific pragmas */
 /* ---------- FP16 helpers ---------- */
 static uint16_t uni_fp32_to_fp16(float v) {
@@ -2285,7 +2245,6 @@ void tq_uniform_3b_attention_ref(const float* query, const void* kv,
 // Section 8: Type Traits (from tq_traits.c)
 // ============================================================================
 /* Stub implementations for excluded quantization types (polar, qjl, turbo, mixed) */
 static void tq_stub_quantize(const float* src, void* dst, int n) {
     (void)src; (void)dst; (void)n;
@@ -2583,7 +2542,6 @@ tq_type tq_type_from_name(const char* name) {
  * No external dependencies — libc/libm only.
  */
 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif
@@ -2617,7 +2575,6 @@ static struct {
 static int g_n_threads = 1;
 static void* tp_worker(void* arg) {
     int id = (int)(intptr_t)arg;
     int my_gen = 0;
@@ -4173,6 +4130,7 @@ tq_gen_config_t tq_default_gen_config(void) {
     config.n_threads = 1;
     config.rep_penalty = 1.1f;
     config.rep_window = 32;
+    config.rng_seed = 42ULL;
     config.on_token = NULL;
     config.user_data = NULL;
     return config;
@@ -4388,8 +4346,6 @@ void tq_matmul_1bit(float* out, const float* x,
  * SPDX-License-Identifier: MIT
  */
 #ifdef _WIN32
 #else
 #endif
@@ -5098,8 +5054,6 @@ const tq_gguf_tensor_t* tq_gguf_find_tensor(const tq_gguf_ctx_t* ctx, const char
  * Pure C11, no external dependencies.
  */
 #if defined(__ARM_NEON) || defined(__ARM_NEON__)
 #include <arm_neon.h>
 #define TQ_HAS_NEON 1
@@ -7174,7 +7128,6 @@ void tq_metal_batch_end_if_available(void) {
  * Also supports the legacy llama2.c binary tokenizer format as fallback.
  */
 /* Global for qsort comparator (vocab index sorting) */
 static char** g_vocab_for_sort;
 static int cmp_vocab_idx(const void* a, const void* b) {
@@ -8519,7 +8472,6 @@ const char* tq_decode(const tq_tokenizer_t* tok, int prev_token, int token) {
  * Supports hybrid architectures (e.g., Qwen3.5 DeltaNet + self_attn).
  */
 #ifdef _WIN32
 #else
 #endif
@@ -12179,8 +12131,13 @@ tq_model_t* tq_load_gguf(const char* path) {
         }
         const size_t MAX_FP32_BYTES = (size_t)16 * 1024 * 1024 * 1024ULL; /* 16 GB */
-        /* TQ_NO_Q4=1 disables Q4 recompression → use direct GGUF dequant for better quality */
+        /* TQ_NO_Q4=1 disables Q4 recompression → use direct GGUF dequant for better quality.
+         * Can be set via environment variable or compile-time define (useful for WASM). */
+#ifdef TQ_NO_Q4
+        if (1) {
+#else
         if (getenv("TQ_NO_Q4")) {
+#endif
             fprintf(stderr, "tq_load_gguf: TQ_NO_Q4 set — skipping Q4 conversion, using GGUF on-the-fly dequant\n");
             goto skip_q4_conversion;
         }
@@ -12929,7 +12886,6 @@ void tq_quantize_weights_1bit(tq_model_t* model) {
  *   -> residual add
  */
 /* Unified Q2/1-bit matmul dispatch.
  * When model->use_1bit_weights, Q2 fields contain sign bits + norms,
  * dispatched to tq_matmul_1bit (FP32 input required).
@@ -15189,7 +15145,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
         }
     }
     /* Increment profile token count if profiling is active */
     if (s->profile_kv) {
         s->profile_kv_count++;
@@ -15240,7 +15195,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
  *   - Full generation loop with streaming callback
  */
 /* ============================================================
  * Argmax sampling: return token with highest logit
  * ============================================================ */
@@ -15461,7 +15415,12 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
         fprintf(stderr, "\n");
     }
-    /* Prefill: process all prompt tokens */
+    /* Prefill: process all prompt tokens.
+     * NOTE: No emscripten_sleep() here — the call stack during tq_forward()
+     * is too deep for ASYNCIFY to unwind (matmul → SIMD kernels). Adding
+     * sleep here breaks ASYNCIFY for the entire generate call, including
+     * the token streaming callback. The browser shows "Thinking..." via
+     * requestAnimationFrame before entering this blocking prefill. */
     for (int i = 0; i < n_prompt; i++) {
         tq_forward(model, state, prompt_tokens[i], i);
     }
@@ -15496,9 +15455,11 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
         }
     }
-    /* Sample first generated token */
+    /* Sample first generated token. The seed is configurable via
+     * config->rng_seed (default 42); 0 falls back to 42 so existing
+     * callers that never set rng_seed get bit-identical behaviour. */
     int pos = n_prompt;
-    unsigned long long rng_state = 42;
+    unsigned long long rng_state = config->rng_seed ? config->rng_seed : 42ULL;
     int next_token = tq_sample_topp(state->logits, vocab_size,
                                      config->temperature, config->top_p,
                                      &rng_state);
@@ -15663,7 +15624,6 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
     return generated;
 }
 // ============================================================================
 // ============================================================================

{quantcpp-0.11.0 → quantcpp-0.12.1}/quantcpp/__init__.py RENAMED Viewed

@@ -15,7 +15,7 @@ try:
     from importlib.metadata import version as _pkg_version
     __version__ = _pkg_version("quantcpp")
 except Exception:
-    __version__ = "0.11.0"  # fallback for editable / source-tree imports
+    __version__ = "0.12.1"  # fallback for editable / source-tree imports
 import os
 import sys

{quantcpp-0.11.0 → quantcpp-0.12.1}/quantcpp/_quant.h RENAMED Viewed

@@ -202,8 +202,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
 // Section 1: Types and Specs (from tq_types.h, tq_spec.h)
 // ============================================================================
 /* Cross-language static assert: works in both C11 and C++11/17 */
 #ifdef __cplusplus
 #define TQ_STATIC_ASSERT(cond, msg) static_assert(cond, msg)
@@ -219,8 +217,6 @@ static inline int clock_gettime(int id, struct timespec* ts) {
 #define TQ_PI_2 1.5707963267948966f
 #endif
 /* ============================================================
  * Constants
  * ============================================================ */
@@ -398,8 +394,6 @@ typedef struct {
     int      enable_recompression;/* Tier 1 → Tier 2 re-compression   */
 } tq_progressive_config_t;
 /* TurboQuant KV cache block: RHT + Lloyd-Max codebook + QJL residual
  * 3-bit variant: 2-bit codebook (4 levels) + 1-bit QJL sign hash
  * Block covers TQ_BK elements (128).
@@ -469,12 +463,6 @@ TQ_CHECK_SIZE(block_tq_turbo_kv_4b, 8 + TQ_BK * 3 / 8 + TQ_BK / 8);
 TQ_CHECK_SIZE(block_tq_turbo_kv_1b, 8 + TQ_BK / 8);
 TQ_CHECK_SIZE(block_tq_turbo_kv_2b, 8 + TQ_BK / 8 + TQ_BK / 8);
 /* Format specification — version-aware, ONNX-inspired */
 #define TQ_SPEC_VERSION 1
@@ -500,18 +488,10 @@ typedef struct {
     uint8_t  flags;            /* TQ_FLAG_* bitmask                 */
 } tq_format_spec_t;
 // ============================================================================
 // Section 2: Engine Types (from tq_engine.h)
 // ============================================================================
 /* ============================================================
  * Model configuration
  * ============================================================ */
@@ -886,6 +866,7 @@ typedef struct {
     int n_threads;
     float rep_penalty;    /* repetition penalty (default: 1.1, 1.0 = disabled) */
     int rep_window;       /* how many recent tokens to penalize (default: 32) */
+    unsigned long long rng_seed; /* sampling seed (default: 42, 0 = use 42 for back-compat) */
     /* Callback for streaming output */
     void (*on_token)(const char* text, void* user_data);
     void* user_data;
@@ -1123,9 +1104,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
 /* Max threads supported by thread pool */
 #define TQ_TP_MAX 16
 // ============================================================================
 // Section 3: GGUF Types (from tq_gguf.h)
 // ============================================================================
@@ -1143,10 +1121,6 @@ void tq_tp_run(void* (*fn)(void*), void** args, int n_tasks);
  * directly into TurboQuant inference engine.
  */
 /* ============================================================
  * GGUF format constants
  * ============================================================ */
@@ -1462,14 +1436,10 @@ int tq_metal_moe_forward(
     const int*      up_types,       /* per-expert up quant types, NULL = use weight_type */
     const int*      down_types);    /* per-expert down quant types, NULL = use weight_type */
 // ============================================================================
 // Section 4: Internal API (from turboquant.h)
 // ============================================================================
 /**
  * TurboQuant.cpp — Cross-platform KV cache compression library
  *
@@ -1477,9 +1447,6 @@ int tq_metal_moe_forward(
  * Zero external dependencies (libc/libm only).
  */
 /* ============================================================
  * Version
  * ============================================================ */
@@ -1753,15 +1720,10 @@ void      tq_progressive_free(tq_progressive_t* p);
 tq_progressive_config_t tq_progressive_default_config(void);
 // ============================================================================
 // Section 5: quant_ctx struct definition
 // ============================================================================
 struct quant_ctx {
     tq_model_t* model;
     tq_state_t* state;
@@ -1788,7 +1750,6 @@ struct quant_ctx {
  * - Random signs decorrelate channels across different blocks
  */
 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif
@@ -1902,7 +1863,6 @@ void tq_rht_inverse(float* data, int n, uint32_t seed) {
  */
 /* Generic reference — no compiler-specific pragmas */
 /* ---------- FP16 helpers ---------- */
 static uint16_t uni_fp32_to_fp16(float v) {
@@ -2285,7 +2245,6 @@ void tq_uniform_3b_attention_ref(const float* query, const void* kv,
 // Section 8: Type Traits (from tq_traits.c)
 // ============================================================================
 /* Stub implementations for excluded quantization types (polar, qjl, turbo, mixed) */
 static void tq_stub_quantize(const float* src, void* dst, int n) {
     (void)src; (void)dst; (void)n;
@@ -2583,7 +2542,6 @@ tq_type tq_type_from_name(const char* name) {
  * No external dependencies — libc/libm only.
  */
 #ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif
@@ -2617,7 +2575,6 @@ static struct {
 static int g_n_threads = 1;
 static void* tp_worker(void* arg) {
     int id = (int)(intptr_t)arg;
     int my_gen = 0;
@@ -4173,6 +4130,7 @@ tq_gen_config_t tq_default_gen_config(void) {
     config.n_threads = 1;
     config.rep_penalty = 1.1f;
     config.rep_window = 32;
+    config.rng_seed = 42ULL;
     config.on_token = NULL;
     config.user_data = NULL;
     return config;
@@ -4388,8 +4346,6 @@ void tq_matmul_1bit(float* out, const float* x,
  * SPDX-License-Identifier: MIT
  */
 #ifdef _WIN32
 #else
 #endif
@@ -5098,8 +5054,6 @@ const tq_gguf_tensor_t* tq_gguf_find_tensor(const tq_gguf_ctx_t* ctx, const char
  * Pure C11, no external dependencies.
  */
 #if defined(__ARM_NEON) || defined(__ARM_NEON__)
 #include <arm_neon.h>
 #define TQ_HAS_NEON 1
@@ -7174,7 +7128,6 @@ void tq_metal_batch_end_if_available(void) {
  * Also supports the legacy llama2.c binary tokenizer format as fallback.
  */
 /* Global for qsort comparator (vocab index sorting) */
 static char** g_vocab_for_sort;
 static int cmp_vocab_idx(const void* a, const void* b) {
@@ -8519,7 +8472,6 @@ const char* tq_decode(const tq_tokenizer_t* tok, int prev_token, int token) {
  * Supports hybrid architectures (e.g., Qwen3.5 DeltaNet + self_attn).
  */
 #ifdef _WIN32
 #else
 #endif
@@ -12179,8 +12131,13 @@ tq_model_t* tq_load_gguf(const char* path) {
         }
         const size_t MAX_FP32_BYTES = (size_t)16 * 1024 * 1024 * 1024ULL; /* 16 GB */
-        /* TQ_NO_Q4=1 disables Q4 recompression → use direct GGUF dequant for better quality */
+        /* TQ_NO_Q4=1 disables Q4 recompression → use direct GGUF dequant for better quality.
+         * Can be set via environment variable or compile-time define (useful for WASM). */
+#ifdef TQ_NO_Q4
+        if (1) {
+#else
         if (getenv("TQ_NO_Q4")) {
+#endif
             fprintf(stderr, "tq_load_gguf: TQ_NO_Q4 set — skipping Q4 conversion, using GGUF on-the-fly dequant\n");
             goto skip_q4_conversion;
         }
@@ -12929,7 +12886,6 @@ void tq_quantize_weights_1bit(tq_model_t* model) {
  *   -> residual add
  */
 /* Unified Q2/1-bit matmul dispatch.
  * When model->use_1bit_weights, Q2 fields contain sign bits + norms,
  * dispatched to tq_matmul_1bit (FP32 input required).
@@ -15189,7 +15145,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
         }
     }
     /* Increment profile token count if profiling is active */
     if (s->profile_kv) {
         s->profile_kv_count++;
@@ -15240,7 +15195,6 @@ float* tq_forward(tq_model_t* model, tq_state_t* s, int token, int pos) {
  *   - Full generation loop with streaming callback
  */
 /* ============================================================
  * Argmax sampling: return token with highest logit
  * ============================================================ */
@@ -15461,7 +15415,12 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
         fprintf(stderr, "\n");
     }
-    /* Prefill: process all prompt tokens */
+    /* Prefill: process all prompt tokens.
+     * NOTE: No emscripten_sleep() here — the call stack during tq_forward()
+     * is too deep for ASYNCIFY to unwind (matmul → SIMD kernels). Adding
+     * sleep here breaks ASYNCIFY for the entire generate call, including
+     * the token streaming callback. The browser shows "Thinking..." via
+     * requestAnimationFrame before entering this blocking prefill. */
     for (int i = 0; i < n_prompt; i++) {
         tq_forward(model, state, prompt_tokens[i], i);
     }
@@ -15496,9 +15455,11 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
         }
     }
-    /* Sample first generated token */
+    /* Sample first generated token. The seed is configurable via
+     * config->rng_seed (default 42); 0 falls back to 42 so existing
+     * callers that never set rng_seed get bit-identical behaviour. */
     int pos = n_prompt;
-    unsigned long long rng_state = 42;
+    unsigned long long rng_state = config->rng_seed ? config->rng_seed : 42ULL;
     int next_token = tq_sample_topp(state->logits, vocab_size,
                                      config->temperature, config->top_p,
                                      &rng_state);
@@ -15663,7 +15624,6 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
     return generated;
 }
 // ============================================================================
 // ============================================================================

quantcpp-0.12.1/quantcpp/cli.py ADDED Viewed

@@ -0,0 +1,390 @@
+"""
+quantcpp CLI — chat with a local LLM in your terminal.
+Ollama-style commands:
+    quantcpp pull MODEL       Download a model from HuggingFace
+    quantcpp list             List cached and available models
+    quantcpp run MODEL [Q]    Chat with a model (auto-pulls if needed)
+    quantcpp serve MODEL      Start OpenAI-compatible HTTP server
+Backwards-compatible shortcut:
+    quantcpp                  Auto-downloads Llama-3.2-1B, starts chat
+    quantcpp "What is X?"     One-shot question with default model
+    quantcpp --model NAME     Use a specific model
+"""
+import sys
+import os
+import json
+# Ollama-style short aliases → canonical _MODEL_REGISTRY keys
+MODEL_ALIASES = {
+    "smollm2":      "SmolLM2-135M",
+    "smollm2:135m": "SmolLM2-135M",
+    "qwen3.5":      "Qwen3.5-0.8B",
+    "qwen3.5:0.8b": "Qwen3.5-0.8B",
+    "llama3.2":     "Llama-3.2-1B",
+    "llama3.2:1b":  "Llama-3.2-1B",
+}
+def _resolve_name(name):
+    """Resolve user input to canonical registry key or local path."""
+    if name is None:
+        return None
+    if os.path.exists(name) and name.endswith(".gguf"):
+        return name
+    return MODEL_ALIASES.get(name.lower(), name)
+def _registry():
+    from quantcpp import _MODEL_REGISTRY, _CACHE_DIR
+    return _MODEL_REGISTRY, _CACHE_DIR
+def cmd_pull(args):
+    """Download a model by alias or canonical name."""
+    import quantcpp
+    name = _resolve_name(args.model)
+    if os.path.exists(name) and name.endswith(".gguf"):
+        print(f"already local: {name}")
+        return 0
+    if name not in quantcpp._MODEL_REGISTRY:
+        avail = ", ".join(sorted(quantcpp._MODEL_REGISTRY.keys()))
+        aliases = ", ".join(sorted(MODEL_ALIASES.keys()))
+        print(f"unknown model: {args.model!r}", file=sys.stderr)
+        print(f"  registry:  {avail}", file=sys.stderr)
+        print(f"  aliases:   {aliases}", file=sys.stderr)
+        return 1
+    print(f"pulling {name}...", file=sys.stderr)
+    try:
+        path = quantcpp.download(name)
+        size_mb = os.path.getsize(path) / (1024 * 1024)
+        print(f"\u2713 {name} \u2192 {path} ({size_mb:.0f} MB)", file=sys.stderr)
+        return 0
+    except Exception as e:
+        print(f"download failed: {e}", file=sys.stderr)
+        return 1
+def cmd_list(args):
+    """List cached and available models."""
+    registry, cache_dir = _registry()
+    rows = []
+    for name, (repo, filename, approx_mb) in sorted(registry.items()):
+        path = cache_dir / filename
+        if path.exists():
+            size_mb = path.stat().st_size / (1024 * 1024)
+            status = "cached"
+            display_path = str(path)
+        else:
+            size_mb = approx_mb
+            status = "remote"
+            display_path = f"~{approx_mb} MB"
+        alias = next((a for a, n in MODEL_ALIASES.items() if n == name and ":" in a), "")
+        rows.append((status, name, alias, size_mb, display_path))
+    if args.json_output:
+        print(json.dumps([
+            {"status": s, "name": n, "alias": a, "size_mb": round(sz, 1), "path": p}
+            for (s, n, a, sz, p) in rows
+        ], indent=2))
+        return 0
+    print(f"\n  Models  cache: {cache_dir}\n")
+    print(f"  {'STATUS':<8} {'NAME':<16} {'ALIAS':<14} {'SIZE':>8}")
+    print(f"  {'-'*8} {'-'*16} {'-'*14} {'-'*8}")
+    for status, name, alias, size_mb, _ in rows:
+        size_str = f"{size_mb:.0f} MB"
+        print(f"  {status:<8} {name:<16} {alias:<14} {size_str:>8}")
+    print()
+    return 0
+def _resolve_to_path(name_or_path):
+    """Resolve alias/name to a local .gguf path, downloading if needed."""
+    import quantcpp
+    name = _resolve_name(name_or_path)
+    if os.path.exists(name) and name.endswith(".gguf"):
+        return name
+    if name not in quantcpp._MODEL_REGISTRY:
+        avail = ", ".join(sorted(quantcpp._MODEL_REGISTRY.keys()))
+        raise ValueError(
+            f"unknown model: {name_or_path!r}. Available: {avail}"
+        )
+    repo, filename, _ = quantcpp._MODEL_REGISTRY[name]
+    cached = quantcpp._CACHE_DIR / filename
+    if cached.exists():
+        return str(cached)
+    print(f"model not cached \u2014 pulling {name}...", file=sys.stderr)
+    return quantcpp.download(name)
+def cmd_run(args):
+    """Chat with a model (auto-pull if needed)."""
+    try:
+        model_path = _resolve_to_path(args.model)
+    except ValueError as e:
+        print(str(e), file=sys.stderr)
+        return 1
+    except Exception as e:
+        print(f"pull failed: {e}", file=sys.stderr)
+        return 1
+    from quantcpp import Model
+    print(f"loading {os.path.basename(model_path)}...", file=sys.stderr)
+    m = Model(model_path, max_tokens=args.max_tokens, temperature=args.temperature,
+              n_threads=args.threads)
+    if args.prompt:
+        question = " ".join(args.prompt) if isinstance(args.prompt, list) else args.prompt
+        for tok in m.generate(question):
+            print(tok, end="", flush=True)
+        print()
+    else:
+        print("quantcpp \u2014 type your message, Ctrl+C to exit", file=sys.stderr)
+        try:
+            while True:
+                question = input("\nYou: ")
+                if not question.strip():
+                    continue
+                print("AI: ", end="", flush=True)
+                for tok in m.generate(question):
+                    print(tok, end="", flush=True)
+                print()
+        except (KeyboardInterrupt, EOFError):
+            print("\nBye!", file=sys.stderr)
+    m.close()
+    return 0
+def cmd_serve(args):
+    """Start OpenAI-compatible HTTP server (requires quant-server binary)."""
+    import shutil
+    import subprocess
+    try:
+        model_path = _resolve_to_path(args.model)
+    except Exception as e:
+        print(f"error: {e}", file=sys.stderr)
+        return 1
+    binary = shutil.which("quant-server")
+    if not binary:
+        # Look in common build dirs relative to repo
+        for guess in ("./build/quant-server", "./build_metal/quant-server"):
+            if os.path.isfile(guess) and os.access(guess, os.X_OK):
+                binary = guess
+                break
+    if not binary:
+        print("quant-server binary not found.", file=sys.stderr)
+        print("  Build with: cmake -B build -DTQ_BUILD_SERVER=ON && cmake --build build",
+              file=sys.stderr)
+        print("  Or install via your package manager.", file=sys.stderr)
+        return 2
+    cmd = [binary, model_path, "-p", str(args.port), "-j", str(args.threads)]
+    print(f"quantcpp serve {os.path.basename(model_path)} on :{args.port}", file=sys.stderr)
+    print("", file=sys.stderr)
+    print("OpenAI-compatible endpoints:", file=sys.stderr)
+    print(f"  POST http://localhost:{args.port}/v1/chat/completions", file=sys.stderr)
+    print(f"  GET  http://localhost:{args.port}/v1/models", file=sys.stderr)
+    print(f"  GET  http://localhost:{args.port}/health", file=sys.stderr)
+    print("", file=sys.stderr)
+    print("Streaming (SSE — token-by-token):", file=sys.stderr)
+    print(f"  curl -N http://localhost:{args.port}/v1/chat/completions \\", file=sys.stderr)
+    print("    -H 'Content-Type: application/json' \\", file=sys.stderr)
+    print('    -d \'{"messages":[{"role":"user","content":"Hi"}],"stream":true}\'',
+          file=sys.stderr)
+    print("", file=sys.stderr)
+    print("Non-streaming (single JSON response):", file=sys.stderr)
+    print(f"  curl http://localhost:{args.port}/v1/chat/completions \\", file=sys.stderr)
+    print("    -H 'Content-Type: application/json' \\", file=sys.stderr)
+    print('    -d \'{"messages":[{"role":"user","content":"Hi"}]}\'',
+          file=sys.stderr)
+    print("", file=sys.stderr)
+    print("OpenAI Python SDK works as-is:", file=sys.stderr)
+    print(f"  client = OpenAI(base_url='http://localhost:{args.port}/v1', api_key='none')",
+          file=sys.stderr)
+    print("  client.chat.completions.create(model='quantcpp', messages=[...], stream=True)",
+          file=sys.stderr)
+    print("", file=sys.stderr)
+    os.execvp(cmd[0], cmd)
+def cmd_client(args):
+    """Send a chat request to a running quantcpp serve endpoint.
+    Default mode is streaming (SSE) — tokens print as they arrive.
+    Use --no-stream for a single JSON response.
+    """
+    import json as _json
+    import urllib.request
+    url = args.url.rstrip("/") + "/v1/chat/completions"
+    payload = {
+        "model": args.model_name,
+        "messages": [{"role": "user", "content": args.prompt}],
+        "max_tokens": args.max_tokens,
+        "temperature": args.temperature,
+        "stream": not args.no_stream,
+    }
+    body = _json.dumps(payload).encode()
+    req = urllib.request.Request(
+        url, data=body,
+        headers={
+            "Content-Type": "application/json",
+            "User-Agent": "quantcpp-client",
+        },
+    )
+    try:
+        with urllib.request.urlopen(req) as resp:
+            if args.no_stream:
+                data = _json.loads(resp.read())
+                print(data["choices"][0]["message"]["content"])
+                return 0
+            # SSE stream — parse `data: {...}\n\n` chunks
+            for line in resp:
+                line = line.decode("utf-8", errors="replace").rstrip()
+                if not line.startswith("data:"):
+                    continue
+                payload_str = line[5:].strip()
+                if payload_str == "[DONE]":
+                    break
+                try:
+                    chunk = _json.loads(payload_str)
+                    delta = chunk["choices"][0]["delta"].get("content", "")
+                    if delta:
+                        print(delta, end="", flush=True)
+                except Exception:
+                    pass
+            print()
+            return 0
+    except urllib.error.URLError as e:
+        print(f"connection failed: {e}", file=sys.stderr)
+        print(f"  Is the server running on {args.url}?", file=sys.stderr)
+        print(f"  Start it with: quantcpp serve llama3.2:1b -p {args.url.rsplit(':', 1)[-1].rstrip('/')}",
+              file=sys.stderr)
+        return 1
+def cmd_chat_default(args):
+    """Backwards-compatible default: auto-download Llama-3.2-1B and chat."""
+    args.model = args.model or "Llama-3.2-1B"
+    args.threads = getattr(args, "threads", 4)
+    args.max_tokens = getattr(args, "max_tokens", 256)
+    args.temperature = getattr(args, "temperature", 0.7)
+    args.prompt = args.prompt or None
+    return cmd_run(args)
+def main():
+    import argparse
+    parser = argparse.ArgumentParser(
+        prog="quantcpp",
+        description="Chat with a local LLM. No API key, no GPU, no server.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+commands:
+  pull MODEL            Download a model (e.g. llama3.2:1b)
+  list                  List cached and available models
+  run MODEL [PROMPT]    Chat with a model (auto-pulls if needed)
+  serve MODEL           Start OpenAI-compatible HTTP server
+  client PROMPT         Send a request to a running serve (default: SSE streaming)
+examples:
+  quantcpp pull llama3.2:1b
+  quantcpp list
+  quantcpp run llama3.2:1b
+  quantcpp run llama3.2:1b "What is gravity?"
+  quantcpp serve llama3.2:1b --port 8080
+  quantcpp client "What is gravity?"                  # streams from :8080
+  quantcpp client "Hi" --url http://localhost:8081
+  quantcpp client "Hi" --no-stream                    # single JSON response
+backwards-compat (no subcommand):
+  quantcpp                          # default chat with Llama-3.2-1B
+  quantcpp "What is gravity?"       # one-shot
+  quantcpp --model SmolLM2-135M     # different model
+""",
+    )
+    sub = parser.add_subparsers(dest="command")
+    # pull
+    p_pull = sub.add_parser("pull", help="Download a model from HuggingFace")
+    p_pull.add_argument("model", help="Model name or alias (e.g. llama3.2:1b)")
+    # list
+    p_list = sub.add_parser("list", help="List cached and available models")
+    p_list.add_argument("--json", dest="json_output", action="store_true")
+    # run
+    p_run = sub.add_parser("run", help="Chat with a model (auto-pulls if needed)")
+    p_run.add_argument("model", help="Model name, alias, or .gguf path")
+    p_run.add_argument("prompt", nargs="*", default=None, help="Optional prompt")
+    p_run.add_argument("-j", "--threads", type=int, default=4)
+    p_run.add_argument("-n", "--max-tokens", type=int, default=256)
+    p_run.add_argument("-t", "--temperature", type=float, default=0.7)
+    # serve
+    p_serve = sub.add_parser("serve", help="Start OpenAI-compatible HTTP server")
+    p_serve.add_argument("model", help="Model name, alias, or .gguf path")
+    p_serve.add_argument("-p", "--port", type=int, default=8080)
+    p_serve.add_argument("-j", "--threads", type=int, default=4)
+    # client
+    p_client = sub.add_parser("client",
+        help="Send a chat request to a running quantcpp serve endpoint")
+    p_client.add_argument("prompt", help="Question to send")
+    p_client.add_argument("--url", default="http://localhost:8080",
+                          help="Server URL (default: http://localhost:8080)")
+    p_client.add_argument("--model-name", "-m", default="quantcpp",
+                          help="Model name in the request body (server ignores)")
+    p_client.add_argument("-n", "--max-tokens", type=int, default=256)
+    p_client.add_argument("-t", "--temperature", type=float, default=0.7)
+    p_client.add_argument("--no-stream", action="store_true",
+                          help="Disable SSE streaming (single JSON response)")
+    # Backwards-compat: top-level args for direct chat
+    parser.add_argument("prompt", nargs="*", default=None,
+                        help="(default mode) question to ask")
+    parser.add_argument("--model", "-m", default=None,
+                        help="(default mode) model name or .gguf path")
+    parser.add_argument("--max-tokens", "-n", type=int, default=256)
+    parser.add_argument("--temperature", "-t", type=float, default=0.7)
+    parser.add_argument("--threads", "-j", type=int, default=4)
+    args = parser.parse_args()
+    if args.command == "pull":
+        return cmd_pull(args)
+    if args.command == "list":
+        return cmd_list(args)
+    if args.command == "run":
+        return cmd_run(args)
+    if args.command == "serve":
+        return cmd_serve(args)
+    if args.command == "client":
+        return cmd_client(args)
+    # No subcommand → backwards-compat default chat
+    return cmd_chat_default(args)
+if __name__ == "__main__":
+    sys.exit(main())

{quantcpp-0.11.0 → quantcpp-0.12.1/quantcpp.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: quantcpp
-Version: 0.11.0
+Version: 0.12.1
 Summary: Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)
 Author-email: quantumaikr <noreply@quantumaikr.com>
 License: Apache-2.0

quantcpp-0.11.0/quantcpp/cli.py DELETED Viewed

@@ -1,64 +0,0 @@
-"""
-quantcpp CLI — chat with a local LLM in your terminal.
-Usage:
-    quantcpp                          # auto-downloads Llama-3.2-1B, starts chat
-    quantcpp "What is gravity?"       # one-shot question
-    quantcpp --model SmolLM2-135M     # use a smaller model (faster download)
-    quantcpp --model path/to/file.gguf  # use your own GGUF file
-"""
-import sys
-import os
-def main():
-    import argparse
-    parser = argparse.ArgumentParser(
-        prog="quantcpp",
-        description="Chat with a local LLM. No API key, no GPU, no server.",
-    )
-    parser.add_argument("prompt", nargs="*", help="Question to ask (omit for interactive chat)")
-    parser.add_argument("--model", "-m", default="Llama-3.2-1B",
-                        help="Model name or path to .gguf file (default: Llama-3.2-1B)")
-    parser.add_argument("--max-tokens", "-n", type=int, default=256)
-    parser.add_argument("--temperature", "-t", type=float, default=0.7)
-    args = parser.parse_args()
-    from quantcpp import Model
-    # Load model
-    model_path = args.model
-    if os.path.isfile(model_path):
-        print(f"Loading {model_path}...", file=sys.stderr)
-        m = Model(model_path, max_tokens=args.max_tokens, temperature=args.temperature)
-    else:
-        print(f"Downloading {model_path}...", file=sys.stderr)
-        m = Model.from_pretrained(model_path, max_tokens=args.max_tokens,
-                                   temperature=args.temperature)
-    # One-shot or interactive
-    if args.prompt:
-        question = " ".join(args.prompt)
-        for tok in m.generate(question):
-            print(tok, end="", flush=True)
-        print()
-    else:
-        print("quantcpp — type your message, Ctrl+C to exit", file=sys.stderr)
-        try:
-            while True:
-                question = input("\nYou: ")
-                if not question.strip():
-                    continue
-                print("AI: ", end="", flush=True)
-                for tok in m.generate(question):
-                    print(tok, end="", flush=True)
-                print()
-        except (KeyboardInterrupt, EOFError):
-            print("\nBye!", file=sys.stderr)
-    m.close()
-if __name__ == "__main__":
-    main()