quantcpp 0.12.0__tar.gz → 0.12.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: quantcpp
3
- Version: 0.12.0
3
+ Version: 0.12.1
4
4
  Summary: Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)
5
5
  Author-email: quantumaikr <noreply@quantumaikr.com>
6
6
  License: Apache-2.0
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
7
7
 
8
8
  [project]
9
9
  name = "quantcpp"
10
- version = "0.12.0"
10
+ version = "0.12.1"
11
11
  description = "Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)"
12
12
  readme = "README.md"
13
13
  license = { text = "Apache-2.0" }
@@ -866,6 +866,7 @@ typedef struct {
866
866
  int n_threads;
867
867
  float rep_penalty; /* repetition penalty (default: 1.1, 1.0 = disabled) */
868
868
  int rep_window; /* how many recent tokens to penalize (default: 32) */
869
+ unsigned long long rng_seed; /* sampling seed (default: 42, 0 = use 42 for back-compat) */
869
870
  /* Callback for streaming output */
870
871
  void (*on_token)(const char* text, void* user_data);
871
872
  void* user_data;
@@ -4129,6 +4130,7 @@ tq_gen_config_t tq_default_gen_config(void) {
4129
4130
  config.n_threads = 1;
4130
4131
  config.rep_penalty = 1.1f;
4131
4132
  config.rep_window = 32;
4133
+ config.rng_seed = 42ULL;
4132
4134
  config.on_token = NULL;
4133
4135
  config.user_data = NULL;
4134
4136
  return config;
@@ -15453,9 +15455,11 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
15453
15455
  }
15454
15456
  }
15455
15457
 
15456
- /* Sample first generated token */
15458
+ /* Sample first generated token. The seed is configurable via
15459
+ * config->rng_seed (default 42); 0 falls back to 42 so existing
15460
+ * callers that never set rng_seed get bit-identical behaviour. */
15457
15461
  int pos = n_prompt;
15458
- unsigned long long rng_state = 42;
15462
+ unsigned long long rng_state = config->rng_seed ? config->rng_seed : 42ULL;
15459
15463
  int next_token = tq_sample_topp(state->logits, vocab_size,
15460
15464
  config->temperature, config->top_p,
15461
15465
  &rng_state);
@@ -15,7 +15,7 @@ try:
15
15
  from importlib.metadata import version as _pkg_version
16
16
  __version__ = _pkg_version("quantcpp")
17
17
  except Exception:
18
- __version__ = "0.12.0" # fallback for editable / source-tree imports
18
+ __version__ = "0.12.1" # fallback for editable / source-tree imports
19
19
 
20
20
  import os
21
21
  import sys
@@ -866,6 +866,7 @@ typedef struct {
866
866
  int n_threads;
867
867
  float rep_penalty; /* repetition penalty (default: 1.1, 1.0 = disabled) */
868
868
  int rep_window; /* how many recent tokens to penalize (default: 32) */
869
+ unsigned long long rng_seed; /* sampling seed (default: 42, 0 = use 42 for back-compat) */
869
870
  /* Callback for streaming output */
870
871
  void (*on_token)(const char* text, void* user_data);
871
872
  void* user_data;
@@ -4129,6 +4130,7 @@ tq_gen_config_t tq_default_gen_config(void) {
4129
4130
  config.n_threads = 1;
4130
4131
  config.rep_penalty = 1.1f;
4131
4132
  config.rep_window = 32;
4133
+ config.rng_seed = 42ULL;
4132
4134
  config.on_token = NULL;
4133
4135
  config.user_data = NULL;
4134
4136
  return config;
@@ -15453,9 +15455,11 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
15453
15455
  }
15454
15456
  }
15455
15457
 
15456
- /* Sample first generated token */
15458
+ /* Sample first generated token. The seed is configurable via
15459
+ * config->rng_seed (default 42); 0 falls back to 42 so existing
15460
+ * callers that never set rng_seed get bit-identical behaviour. */
15457
15461
  int pos = n_prompt;
15458
- unsigned long long rng_state = 42;
15462
+ unsigned long long rng_state = config->rng_seed ? config->rng_seed : 42ULL;
15459
15463
  int next_token = tq_sample_topp(state->logits, vocab_size,
15460
15464
  config->temperature, config->top_p,
15461
15465
  &rng_state);
@@ -195,10 +195,92 @@ def cmd_serve(args):
195
195
  return 2
196
196
 
197
197
  cmd = [binary, model_path, "-p", str(args.port), "-j", str(args.threads)]
198
- print(f"quant serve {os.path.basename(model_path)} on :{args.port}", file=sys.stderr)
198
+ print(f"quantcpp serve {os.path.basename(model_path)} on :{args.port}", file=sys.stderr)
199
+ print("", file=sys.stderr)
200
+ print("OpenAI-compatible endpoints:", file=sys.stderr)
201
+ print(f" POST http://localhost:{args.port}/v1/chat/completions", file=sys.stderr)
202
+ print(f" GET http://localhost:{args.port}/v1/models", file=sys.stderr)
203
+ print(f" GET http://localhost:{args.port}/health", file=sys.stderr)
204
+ print("", file=sys.stderr)
205
+ print("Streaming (SSE — token-by-token):", file=sys.stderr)
206
+ print(f" curl -N http://localhost:{args.port}/v1/chat/completions \\", file=sys.stderr)
207
+ print(" -H 'Content-Type: application/json' \\", file=sys.stderr)
208
+ print(' -d \'{"messages":[{"role":"user","content":"Hi"}],"stream":true}\'',
209
+ file=sys.stderr)
210
+ print("", file=sys.stderr)
211
+ print("Non-streaming (single JSON response):", file=sys.stderr)
212
+ print(f" curl http://localhost:{args.port}/v1/chat/completions \\", file=sys.stderr)
213
+ print(" -H 'Content-Type: application/json' \\", file=sys.stderr)
214
+ print(' -d \'{"messages":[{"role":"user","content":"Hi"}]}\'',
215
+ file=sys.stderr)
216
+ print("", file=sys.stderr)
217
+ print("OpenAI Python SDK works as-is:", file=sys.stderr)
218
+ print(f" client = OpenAI(base_url='http://localhost:{args.port}/v1', api_key='none')",
219
+ file=sys.stderr)
220
+ print(" client.chat.completions.create(model='quantcpp', messages=[...], stream=True)",
221
+ file=sys.stderr)
222
+ print("", file=sys.stderr)
199
223
  os.execvp(cmd[0], cmd)
200
224
 
201
225
 
226
+ def cmd_client(args):
227
+ """Send a chat request to a running quantcpp serve endpoint.
228
+
229
+ Default mode is streaming (SSE) — tokens print as they arrive.
230
+ Use --no-stream for a single JSON response.
231
+ """
232
+ import json as _json
233
+ import urllib.request
234
+
235
+ url = args.url.rstrip("/") + "/v1/chat/completions"
236
+ payload = {
237
+ "model": args.model_name,
238
+ "messages": [{"role": "user", "content": args.prompt}],
239
+ "max_tokens": args.max_tokens,
240
+ "temperature": args.temperature,
241
+ "stream": not args.no_stream,
242
+ }
243
+ body = _json.dumps(payload).encode()
244
+ req = urllib.request.Request(
245
+ url, data=body,
246
+ headers={
247
+ "Content-Type": "application/json",
248
+ "User-Agent": "quantcpp-client",
249
+ },
250
+ )
251
+
252
+ try:
253
+ with urllib.request.urlopen(req) as resp:
254
+ if args.no_stream:
255
+ data = _json.loads(resp.read())
256
+ print(data["choices"][0]["message"]["content"])
257
+ return 0
258
+
259
+ # SSE stream — parse `data: {...}\n\n` chunks
260
+ for line in resp:
261
+ line = line.decode("utf-8", errors="replace").rstrip()
262
+ if not line.startswith("data:"):
263
+ continue
264
+ payload_str = line[5:].strip()
265
+ if payload_str == "[DONE]":
266
+ break
267
+ try:
268
+ chunk = _json.loads(payload_str)
269
+ delta = chunk["choices"][0]["delta"].get("content", "")
270
+ if delta:
271
+ print(delta, end="", flush=True)
272
+ except Exception:
273
+ pass
274
+ print()
275
+ return 0
276
+ except urllib.error.URLError as e:
277
+ print(f"connection failed: {e}", file=sys.stderr)
278
+ print(f" Is the server running on {args.url}?", file=sys.stderr)
279
+ print(f" Start it with: quantcpp serve llama3.2:1b -p {args.url.rsplit(':', 1)[-1].rstrip('/')}",
280
+ file=sys.stderr)
281
+ return 1
282
+
283
+
202
284
  def cmd_chat_default(args):
203
285
  """Backwards-compatible default: auto-download Llama-3.2-1B and chat."""
204
286
  args.model = args.model or "Llama-3.2-1B"
@@ -222,6 +304,7 @@ commands:
222
304
  list List cached and available models
223
305
  run MODEL [PROMPT] Chat with a model (auto-pulls if needed)
224
306
  serve MODEL Start OpenAI-compatible HTTP server
307
+ client PROMPT Send a request to a running serve (default: SSE streaming)
225
308
 
226
309
  examples:
227
310
  quantcpp pull llama3.2:1b
@@ -229,6 +312,9 @@ examples:
229
312
  quantcpp run llama3.2:1b
230
313
  quantcpp run llama3.2:1b "What is gravity?"
231
314
  quantcpp serve llama3.2:1b --port 8080
315
+ quantcpp client "What is gravity?" # streams from :8080
316
+ quantcpp client "Hi" --url http://localhost:8081
317
+ quantcpp client "Hi" --no-stream # single JSON response
232
318
 
233
319
  backwards-compat (no subcommand):
234
320
  quantcpp # default chat with Llama-3.2-1B
@@ -261,6 +347,19 @@ backwards-compat (no subcommand):
261
347
  p_serve.add_argument("-p", "--port", type=int, default=8080)
262
348
  p_serve.add_argument("-j", "--threads", type=int, default=4)
263
349
 
350
+ # client
351
+ p_client = sub.add_parser("client",
352
+ help="Send a chat request to a running quantcpp serve endpoint")
353
+ p_client.add_argument("prompt", help="Question to send")
354
+ p_client.add_argument("--url", default="http://localhost:8080",
355
+ help="Server URL (default: http://localhost:8080)")
356
+ p_client.add_argument("--model-name", "-m", default="quantcpp",
357
+ help="Model name in the request body (server ignores)")
358
+ p_client.add_argument("-n", "--max-tokens", type=int, default=256)
359
+ p_client.add_argument("-t", "--temperature", type=float, default=0.7)
360
+ p_client.add_argument("--no-stream", action="store_true",
361
+ help="Disable SSE streaming (single JSON response)")
362
+
264
363
  # Backwards-compat: top-level args for direct chat
265
364
  parser.add_argument("prompt", nargs="*", default=None,
266
365
  help="(default mode) question to ask")
@@ -280,6 +379,8 @@ backwards-compat (no subcommand):
280
379
  return cmd_run(args)
281
380
  if args.command == "serve":
282
381
  return cmd_serve(args)
382
+ if args.command == "client":
383
+ return cmd_client(args)
283
384
 
284
385
  # No subcommand → backwards-compat default chat
285
386
  return cmd_chat_default(args)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: quantcpp
3
- Version: 0.12.0
3
+ Version: 0.12.1
4
4
  Summary: Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)
5
5
  Author-email: quantumaikr <noreply@quantumaikr.com>
6
6
  License: Apache-2.0
File without changes
File without changes
File without changes
File without changes
File without changes