vllm-i64 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/README.md +179 -0
  2. package/package.json +1 -1
package/README.md ADDED
@@ -0,0 +1,179 @@
1
+ # vllm-i64
2
+
3
+ TypeScript SDK for [vllm-i64](https://github.com/Complexity-ML/vllm-i64) — the integer-first inference engine for token-routed Mixture-of-Experts language models.
4
+
5
+ Zero dependencies. Requires Node.js >= 18.
6
+
7
+ ```bash
8
+ npm install vllm-i64
9
+ ```
10
+
11
+ ## Quick Start
12
+
13
+ ```ts
14
+ import { I64Client } from "vllm-i64";
15
+
16
+ const client = new I64Client("http://localhost:8000");
17
+
18
+ // Chat completion
19
+ const res = await client.chat.create([
20
+ { role: "user", content: "Write a fibonacci function in Python" }
21
+ ]);
22
+ console.log(res.choices[0].message.content);
23
+ ```
24
+
25
+ ## Streaming
26
+
27
+ ```ts
28
+ for await (const chunk of client.chat.stream([
29
+ { role: "user", content: "Explain transformers" }
30
+ ])) {
31
+ process.stdout.write(chunk);
32
+ }
33
+ ```
34
+
35
+ ## Tool Calls (OpenAI-compatible)
36
+
37
+ ```ts
38
+ const res = await client.chat.create(
39
+ [{ role: "user", content: "What's the weather in Paris?" }],
40
+ {
41
+ tools: [{
42
+ type: "function",
43
+ function: {
44
+ name: "get_weather",
45
+ description: "Get current weather",
46
+ parameters: {
47
+ type: "object",
48
+ properties: { city: { type: "string" } },
49
+ required: ["city"]
50
+ }
51
+ }
52
+ }]
53
+ }
54
+ );
55
+
56
+ if (res.choices[0].message.tool_calls) {
57
+ console.log(res.choices[0].message.tool_calls);
58
+ }
59
+ ```
60
+
61
+ ## Text Completions
62
+
63
+ ```ts
64
+ const res = await client.completions.create("def fibonacci(n):", {
65
+ max_tokens: 200,
66
+ temperature: 0.2,
67
+ });
68
+ console.log(res.choices[0].text);
69
+
70
+ // Batch — multiple prompts at once
71
+ const batch = await client.completions.batch(
72
+ ["Hello", "Bonjour", "Hola"],
73
+ { max_tokens: 50 }
74
+ );
75
+ ```
76
+
77
+ ## Monitoring
78
+
79
+ ```ts
80
+ // Live snapshot — batch size, KV cache, tok/s, GPU
81
+ const snap = await client.monitor.snapshot();
82
+ console.log(`${snap.engine.total_tokens_generated} tokens generated`);
83
+ console.log(`${snap.perf?.tok_per_s} tok/s`);
84
+ console.log(`KV cache: ${snap.kv_cache?.usage_pct}% used`);
85
+
86
+ // Health check
87
+ const health = await client.monitor.health();
88
+ console.log(health.status); // "ok" | "degraded"
89
+
90
+ // Expert routing distribution (MoE models)
91
+ const experts = await client.monitor.experts();
92
+ console.log(`${experts.num_experts} experts, imbalance: ${experts.imbalance}`);
93
+ ```
94
+
95
+ ## KV Cache Management
96
+
97
+ ```ts
98
+ // Cache statistics
99
+ const stats = await client.cache.stats();
100
+ console.log(`${stats.used_blocks}/${stats.num_blocks} blocks used`);
101
+ console.log(`${stats.prefix_cached_blocks} prefix blocks cached`);
102
+
103
+ // Purge prefix cache (admin)
104
+ await client.cache.purge();
105
+ ```
106
+
107
+ ## LoRA Hot-Swap
108
+
109
+ ```ts
110
+ // Load an adapter at runtime
111
+ await client.lora.load({
112
+ adapter_id: 1,
113
+ path: "/models/lora-python-v2",
114
+ name: "python-specialist",
115
+ scaling: 0.8,
116
+ });
117
+
118
+ // List loaded adapters
119
+ const { adapters } = await client.lora.list();
120
+ console.log(adapters); // [{ id: 1, name: "python-specialist" }]
121
+
122
+ // Swap to a different adapter
123
+ await client.lora.load({ adapter_id: 2, path: "/models/lora-chat-v3" });
124
+
125
+ // Unload when done
126
+ await client.lora.unload(1);
127
+ ```
128
+
129
+ ## RAG (Retrieval-Augmented Generation)
130
+
131
+ ```ts
132
+ // Index documents
133
+ await client.rag.index({ text: "Paris is the capital of France." });
134
+ await client.rag.index({ file: "/data/docs/handbook.pdf" });
135
+
136
+ // Search
137
+ const results = await client.rag.search("capital of France", 3);
138
+ console.log(results.results[0].text);
139
+
140
+ // Stats
141
+ const ragStats = await client.rag.stats();
142
+ console.log(`${ragStats.total_chunks} chunks indexed`);
143
+ ```
144
+
145
+ ## Authentication
146
+
147
+ ```ts
148
+ const client = new I64Client("http://localhost:8000", {
149
+ apiKey: "sk-your-api-key",
150
+ timeoutMs: 30_000,
151
+ });
152
+ ```
153
+
154
+ ## API Reference
155
+
156
+ | Namespace | Methods |
157
+ |---|---|
158
+ | `client.chat` | `create()`, `stream()`, `streamRaw()` |
159
+ | `client.completions` | `create()`, `stream()`, `batch()` |
160
+ | `client.cache` | `stats()`, `purge()` |
161
+ | `client.lora` | `load()`, `unload()`, `list()` |
162
+ | `client.monitor` | `health()`, `isReady()`, `models()`, `snapshot()`, `metrics()`, `experts()`, `cancel()` |
163
+ | `client.rag` | `index()`, `search()`, `stats()` |
164
+
165
+ ## What is vllm-i64?
166
+
167
+ An integer-first inference engine for token-routed Mixture-of-Experts models. Key features:
168
+
169
+ - **Token routing**: `expert_id = token_id % num_experts` — deterministic, no learned router
170
+ - **Continuous batching**: mixed prefill + decode in every step
171
+ - **Paged KV cache**: with prefix caching, LRU eviction, FP8 compression
172
+ - **LoRA hot-swap**: load/unload adapters at runtime without restart
173
+ - **OpenAI-compatible API**: drop-in replacement for any OpenAI client
174
+
175
+ Built by [Complexity-ML](https://github.com/Complexity-ML) / INL.
176
+
177
+ ## License
178
+
179
+ Apache-2.0
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "vllm-i64",
3
- "version": "0.1.0",
3
+ "version": "0.1.1",
4
4
  "description": "TypeScript SDK for vllm-i64 — integer-first inference engine",
5
5
  "main": "dist/index.js",
6
6
  "module": "dist/index.mjs",