quantcpp 0.12.0__tar.gz → 0.12.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {quantcpp-0.12.0/quantcpp.egg-info → quantcpp-0.12.1}/PKG-INFO +1 -1
- {quantcpp-0.12.0 → quantcpp-0.12.1}/pyproject.toml +1 -1
- {quantcpp-0.12.0 → quantcpp-0.12.1}/quant.h +6 -2
- {quantcpp-0.12.0 → quantcpp-0.12.1}/quantcpp/__init__.py +1 -1
- {quantcpp-0.12.0 → quantcpp-0.12.1}/quantcpp/_quant.h +6 -2
- {quantcpp-0.12.0 → quantcpp-0.12.1}/quantcpp/cli.py +102 -1
- {quantcpp-0.12.0 → quantcpp-0.12.1/quantcpp.egg-info}/PKG-INFO +1 -1
- {quantcpp-0.12.0 → quantcpp-0.12.1}/MANIFEST.in +0 -0
- {quantcpp-0.12.0 → quantcpp-0.12.1}/README.md +0 -0
- {quantcpp-0.12.0 → quantcpp-0.12.1}/quantcpp/_binding.py +0 -0
- {quantcpp-0.12.0 → quantcpp-0.12.1}/quantcpp.egg-info/SOURCES.txt +0 -0
- {quantcpp-0.12.0 → quantcpp-0.12.1}/quantcpp.egg-info/dependency_links.txt +0 -0
- {quantcpp-0.12.0 → quantcpp-0.12.1}/quantcpp.egg-info/entry_points.txt +0 -0
- {quantcpp-0.12.0 → quantcpp-0.12.1}/quantcpp.egg-info/requires.txt +0 -0
- {quantcpp-0.12.0 → quantcpp-0.12.1}/quantcpp.egg-info/top_level.txt +0 -0
- {quantcpp-0.12.0 → quantcpp-0.12.1}/setup.cfg +0 -0
- {quantcpp-0.12.0 → quantcpp-0.12.1}/setup.py +0 -0
- {quantcpp-0.12.0 → quantcpp-0.12.1}/tests/test_basic.py +0 -0
- {quantcpp-0.12.0 → quantcpp-0.12.1}/tests/test_python.py +0 -0
|
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
|
|
|
7
7
|
|
|
8
8
|
[project]
|
|
9
9
|
name = "quantcpp"
|
|
10
|
-
version = "0.12.0"
|
|
10
|
+
version = "0.12.1"
|
|
11
11
|
description = "Single-header LLM inference engine with KV cache compression (7× compression at fp32 parity)"
|
|
12
12
|
readme = "README.md"
|
|
13
13
|
license = { text = "Apache-2.0" }
|
|
@@ -866,6 +866,7 @@ typedef struct {
|
|
|
866
866
|
int n_threads;
|
|
867
867
|
float rep_penalty; /* repetition penalty (default: 1.1, 1.0 = disabled) */
|
|
868
868
|
int rep_window; /* how many recent tokens to penalize (default: 32) */
|
|
869
|
+
unsigned long long rng_seed; /* sampling seed (default: 42, 0 = use 42 for back-compat) */
|
|
869
870
|
/* Callback for streaming output */
|
|
870
871
|
void (*on_token)(const char* text, void* user_data);
|
|
871
872
|
void* user_data;
|
|
@@ -4129,6 +4130,7 @@ tq_gen_config_t tq_default_gen_config(void) {
|
|
|
4129
4130
|
config.n_threads = 1;
|
|
4130
4131
|
config.rep_penalty = 1.1f;
|
|
4131
4132
|
config.rep_window = 32;
|
|
4133
|
+
config.rng_seed = 42ULL;
|
|
4132
4134
|
config.on_token = NULL;
|
|
4133
4135
|
config.user_data = NULL;
|
|
4134
4136
|
return config;
|
|
@@ -15453,9 +15455,11 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
|
|
|
15453
15455
|
}
|
|
15454
15456
|
}
|
|
15455
15457
|
|
|
15456
|
-
/* Sample first generated token */
|
|
15458
|
+
/* Sample first generated token. The seed is configurable via
|
|
15459
|
+
* config->rng_seed (default 42); 0 falls back to 42 so existing
|
|
15460
|
+
* callers that never set rng_seed get bit-identical behaviour. */
|
|
15457
15461
|
int pos = n_prompt;
|
|
15458
|
-
unsigned long long rng_state = 42ULL;
|
|
15462
|
+
unsigned long long rng_state = config->rng_seed ? config->rng_seed : 42ULL;
|
|
15459
15463
|
int next_token = tq_sample_topp(state->logits, vocab_size,
|
|
15460
15464
|
config->temperature, config->top_p,
|
|
15461
15465
|
&rng_state);
|
|
@@ -15,7 +15,7 @@ try:
|
|
|
15
15
|
from importlib.metadata import version as _pkg_version
|
|
16
16
|
__version__ = _pkg_version("quantcpp")
|
|
17
17
|
except Exception:
|
|
18
|
-
__version__ = "0.12.0"
|
|
18
|
+
__version__ = "0.12.1" # fallback for editable / source-tree imports
|
|
19
19
|
|
|
20
20
|
import os
|
|
21
21
|
import sys
|
|
@@ -866,6 +866,7 @@ typedef struct {
|
|
|
866
866
|
int n_threads;
|
|
867
867
|
float rep_penalty; /* repetition penalty (default: 1.1, 1.0 = disabled) */
|
|
868
868
|
int rep_window; /* how many recent tokens to penalize (default: 32) */
|
|
869
|
+
unsigned long long rng_seed; /* sampling seed (default: 42, 0 = use 42 for back-compat) */
|
|
869
870
|
/* Callback for streaming output */
|
|
870
871
|
void (*on_token)(const char* text, void* user_data);
|
|
871
872
|
void* user_data;
|
|
@@ -4129,6 +4130,7 @@ tq_gen_config_t tq_default_gen_config(void) {
|
|
|
4129
4130
|
config.n_threads = 1;
|
|
4130
4131
|
config.rep_penalty = 1.1f;
|
|
4131
4132
|
config.rep_window = 32;
|
|
4133
|
+
config.rng_seed = 42ULL;
|
|
4132
4134
|
config.on_token = NULL;
|
|
4133
4135
|
config.user_data = NULL;
|
|
4134
4136
|
return config;
|
|
@@ -15453,9 +15455,11 @@ int tq_generate(tq_model_t* model, tq_tokenizer_t* tokenizer,
|
|
|
15453
15455
|
}
|
|
15454
15456
|
}
|
|
15455
15457
|
|
|
15456
|
-
/* Sample first generated token */
|
|
15458
|
+
/* Sample first generated token. The seed is configurable via
|
|
15459
|
+
* config->rng_seed (default 42); 0 falls back to 42 so existing
|
|
15460
|
+
* callers that never set rng_seed get bit-identical behaviour. */
|
|
15457
15461
|
int pos = n_prompt;
|
|
15458
|
-
unsigned long long rng_state = 42ULL;
|
|
15462
|
+
unsigned long long rng_state = config->rng_seed ? config->rng_seed : 42ULL;
|
|
15459
15463
|
int next_token = tq_sample_topp(state->logits, vocab_size,
|
|
15460
15464
|
config->temperature, config->top_p,
|
|
15461
15465
|
&rng_state);
|
|
@@ -195,10 +195,92 @@ def cmd_serve(args):
|
|
|
195
195
|
return 2
|
|
196
196
|
|
|
197
197
|
cmd = [binary, model_path, "-p", str(args.port), "-j", str(args.threads)]
|
|
198
|
-
print(f"
|
|
198
|
+
print(f"quantcpp serve {os.path.basename(model_path)} on :{args.port}", file=sys.stderr)
|
|
199
|
+
print("", file=sys.stderr)
|
|
200
|
+
print("OpenAI-compatible endpoints:", file=sys.stderr)
|
|
201
|
+
print(f" POST http://localhost:{args.port}/v1/chat/completions", file=sys.stderr)
|
|
202
|
+
print(f" GET http://localhost:{args.port}/v1/models", file=sys.stderr)
|
|
203
|
+
print(f" GET http://localhost:{args.port}/health", file=sys.stderr)
|
|
204
|
+
print("", file=sys.stderr)
|
|
205
|
+
print("Streaming (SSE — token-by-token):", file=sys.stderr)
|
|
206
|
+
print(f" curl -N http://localhost:{args.port}/v1/chat/completions \\", file=sys.stderr)
|
|
207
|
+
print(" -H 'Content-Type: application/json' \\", file=sys.stderr)
|
|
208
|
+
print(' -d \'{"messages":[{"role":"user","content":"Hi"}],"stream":true}\'',
|
|
209
|
+
file=sys.stderr)
|
|
210
|
+
print("", file=sys.stderr)
|
|
211
|
+
print("Non-streaming (single JSON response):", file=sys.stderr)
|
|
212
|
+
print(f" curl http://localhost:{args.port}/v1/chat/completions \\", file=sys.stderr)
|
|
213
|
+
print(" -H 'Content-Type: application/json' \\", file=sys.stderr)
|
|
214
|
+
print(' -d \'{"messages":[{"role":"user","content":"Hi"}]}\'',
|
|
215
|
+
file=sys.stderr)
|
|
216
|
+
print("", file=sys.stderr)
|
|
217
|
+
print("OpenAI Python SDK works as-is:", file=sys.stderr)
|
|
218
|
+
print(f" client = OpenAI(base_url='http://localhost:{args.port}/v1', api_key='none')",
|
|
219
|
+
file=sys.stderr)
|
|
220
|
+
print(" client.chat.completions.create(model='quantcpp', messages=[...], stream=True)",
|
|
221
|
+
file=sys.stderr)
|
|
222
|
+
print("", file=sys.stderr)
|
|
199
223
|
os.execvp(cmd[0], cmd)
|
|
200
224
|
|
|
201
225
|
|
|
226
|
+
def cmd_client(args):
|
|
227
|
+
"""Send a chat request to a running quantcpp serve endpoint.
|
|
228
|
+
|
|
229
|
+
Default mode is streaming (SSE) — tokens print as they arrive.
|
|
230
|
+
Use --no-stream for a single JSON response.
|
|
231
|
+
"""
|
|
232
|
+
import json as _json
|
|
233
|
+
import urllib.request
|
|
234
|
+
|
|
235
|
+
url = args.url.rstrip("/") + "/v1/chat/completions"
|
|
236
|
+
payload = {
|
|
237
|
+
"model": args.model_name,
|
|
238
|
+
"messages": [{"role": "user", "content": args.prompt}],
|
|
239
|
+
"max_tokens": args.max_tokens,
|
|
240
|
+
"temperature": args.temperature,
|
|
241
|
+
"stream": not args.no_stream,
|
|
242
|
+
}
|
|
243
|
+
body = _json.dumps(payload).encode()
|
|
244
|
+
req = urllib.request.Request(
|
|
245
|
+
url, data=body,
|
|
246
|
+
headers={
|
|
247
|
+
"Content-Type": "application/json",
|
|
248
|
+
"User-Agent": "quantcpp-client",
|
|
249
|
+
},
|
|
250
|
+
)
|
|
251
|
+
|
|
252
|
+
try:
|
|
253
|
+
with urllib.request.urlopen(req) as resp:
|
|
254
|
+
if args.no_stream:
|
|
255
|
+
data = _json.loads(resp.read())
|
|
256
|
+
print(data["choices"][0]["message"]["content"])
|
|
257
|
+
return 0
|
|
258
|
+
|
|
259
|
+
# SSE stream — parse `data: {...}\n\n` chunks
|
|
260
|
+
for line in resp:
|
|
261
|
+
line = line.decode("utf-8", errors="replace").rstrip()
|
|
262
|
+
if not line.startswith("data:"):
|
|
263
|
+
continue
|
|
264
|
+
payload_str = line[5:].strip()
|
|
265
|
+
if payload_str == "[DONE]":
|
|
266
|
+
break
|
|
267
|
+
try:
|
|
268
|
+
chunk = _json.loads(payload_str)
|
|
269
|
+
delta = chunk["choices"][0]["delta"].get("content", "")
|
|
270
|
+
if delta:
|
|
271
|
+
print(delta, end="", flush=True)
|
|
272
|
+
except Exception:
|
|
273
|
+
pass
|
|
274
|
+
print()
|
|
275
|
+
return 0
|
|
276
|
+
except urllib.error.URLError as e:
|
|
277
|
+
print(f"connection failed: {e}", file=sys.stderr)
|
|
278
|
+
print(f" Is the server running on {args.url}?", file=sys.stderr)
|
|
279
|
+
print(f" Start it with: quantcpp serve llama3.2:1b -p {args.url.rsplit(':', 1)[-1].rstrip('/')}",
|
|
280
|
+
file=sys.stderr)
|
|
281
|
+
return 1
|
|
282
|
+
|
|
283
|
+
|
|
202
284
|
def cmd_chat_default(args):
|
|
203
285
|
"""Backwards-compatible default: auto-download Llama-3.2-1B and chat."""
|
|
204
286
|
args.model = args.model or "Llama-3.2-1B"
|
|
@@ -222,6 +304,7 @@ commands:
|
|
|
222
304
|
list List cached and available models
|
|
223
305
|
run MODEL [PROMPT] Chat with a model (auto-pulls if needed)
|
|
224
306
|
serve MODEL Start OpenAI-compatible HTTP server
|
|
307
|
+
client PROMPT Send a request to a running serve (default: SSE streaming)
|
|
225
308
|
|
|
226
309
|
examples:
|
|
227
310
|
quantcpp pull llama3.2:1b
|
|
@@ -229,6 +312,9 @@ examples:
|
|
|
229
312
|
quantcpp run llama3.2:1b
|
|
230
313
|
quantcpp run llama3.2:1b "What is gravity?"
|
|
231
314
|
quantcpp serve llama3.2:1b --port 8080
|
|
315
|
+
quantcpp client "What is gravity?" # streams from :8080
|
|
316
|
+
quantcpp client "Hi" --url http://localhost:8081
|
|
317
|
+
quantcpp client "Hi" --no-stream # single JSON response
|
|
232
318
|
|
|
233
319
|
backwards-compat (no subcommand):
|
|
234
320
|
quantcpp # default chat with Llama-3.2-1B
|
|
@@ -261,6 +347,19 @@ backwards-compat (no subcommand):
|
|
|
261
347
|
p_serve.add_argument("-p", "--port", type=int, default=8080)
|
|
262
348
|
p_serve.add_argument("-j", "--threads", type=int, default=4)
|
|
263
349
|
|
|
350
|
+
# client
|
|
351
|
+
p_client = sub.add_parser("client",
|
|
352
|
+
help="Send a chat request to a running quantcpp serve endpoint")
|
|
353
|
+
p_client.add_argument("prompt", help="Question to send")
|
|
354
|
+
p_client.add_argument("--url", default="http://localhost:8080",
|
|
355
|
+
help="Server URL (default: http://localhost:8080)")
|
|
356
|
+
p_client.add_argument("--model-name", "-m", default="quantcpp",
|
|
357
|
+
help="Model name in the request body (server ignores)")
|
|
358
|
+
p_client.add_argument("-n", "--max-tokens", type=int, default=256)
|
|
359
|
+
p_client.add_argument("-t", "--temperature", type=float, default=0.7)
|
|
360
|
+
p_client.add_argument("--no-stream", action="store_true",
|
|
361
|
+
help="Disable SSE streaming (single JSON response)")
|
|
362
|
+
|
|
264
363
|
# Backwards-compat: top-level args for direct chat
|
|
265
364
|
parser.add_argument("prompt", nargs="*", default=None,
|
|
266
365
|
help="(default mode) question to ask")
|
|
@@ -280,6 +379,8 @@ backwards-compat (no subcommand):
|
|
|
280
379
|
return cmd_run(args)
|
|
281
380
|
if args.command == "serve":
|
|
282
381
|
return cmd_serve(args)
|
|
382
|
+
if args.command == "client":
|
|
383
|
+
return cmd_client(args)
|
|
283
384
|
|
|
284
385
|
# No subcommand → backwards-compat default chat
|
|
285
386
|
return cmd_chat_default(args)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|