moflo 4.10.1 → 4.10.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/healer/SKILL.md +3 -1
- package/bin/session-start-launcher.mjs +112 -5
- package/dist/src/cli/commands/doctor-checks-config.js +4 -4
- package/dist/src/cli/commands/doctor-checks-memory-access.js +27 -1
- package/dist/src/cli/commands/doctor-embedding-hygiene.js +48 -12
- package/dist/src/cli/commands/doctor-render.js +118 -74
- package/dist/src/cli/commands/doctor-version.js +1 -1
- package/dist/src/cli/commands/doctor.js +70 -25
- package/dist/src/cli/commands/index.js +0 -6
- package/dist/src/cli/init/executor.js +2 -2
- package/dist/src/cli/mcp-tools/swarm-tools.js +3 -4
- package/dist/src/cli/memory/bridge-core.js +36 -0
- package/dist/src/cli/services/moflo-paths.js +6 -5
- package/dist/src/cli/services/moflo-require.js +2 -2
- package/dist/src/cli/shared/core/config/loader.js +2 -2
- package/dist/src/cli/version.js +1 -1
- package/package.json +2 -2
- package/dist/src/cli/appliance/gguf-engine.js +0 -425
- package/dist/src/cli/appliance/ruvllm-bridge.js +0 -231
- package/dist/src/cli/appliance/rvfa-builder.js +0 -325
- package/dist/src/cli/appliance/rvfa-distribution.js +0 -370
- package/dist/src/cli/appliance/rvfa-format.js +0 -393
- package/dist/src/cli/appliance/rvfa-runner.js +0 -238
- package/dist/src/cli/appliance/rvfa-signing.js +0 -351
- package/dist/src/cli/commands/appliance-advanced.js +0 -213
- package/dist/src/cli/commands/appliance.js +0 -404
|
@@ -1,425 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* ruvLLM GGUF Inference Engine -- Pure Node.js GGUF Model Interface
|
|
3
|
-
*
|
|
4
|
-
* Provides:
|
|
5
|
-
* 1. GGUF binary header parsing (metadata without loading weights)
|
|
6
|
-
* 2. Model loading abstraction (node-llama-cpp when available, metadata-only fallback)
|
|
7
|
-
* 3. Token generation interface with async iterator streaming
|
|
8
|
-
* 4. KV-cache persistence to RVF-compatible binary format
|
|
9
|
-
*
|
|
10
|
-
* Zero external dependencies. node-llama-cpp is an optional peer.
|
|
11
|
-
*
|
|
12
|
-
* @module moflo/appliance/gguf-engine
|
|
13
|
-
*/
|
|
14
|
-
import { open, readFile, writeFile, stat as fsStat } from 'node:fs/promises';
|
|
15
|
-
import { createHash } from 'node:crypto';
|
|
16
|
-
import { basename } from 'node:path';
|
|
17
|
-
// ── GGUF Metadata Value Types ───────────────────────────────
|
|
18
|
-
/**
 * GGUF metadata value type tags as a two-way lookup:
 * name -> numeric tag, and numeric tag -> name.
 */
var GgufValueType;
(function (registry) {
    // The position of each name in this list is its numeric tag.
    const TAG_NAMES = [
        'UINT8', 'INT8', 'UINT16', 'INT16', 'UINT32', 'INT32', 'FLOAT32',
        'BOOL', 'STRING', 'ARRAY', 'UINT64', 'INT64', 'FLOAT64',
    ];
    TAG_NAMES.forEach((label, tag) => {
        registry[label] = tag;
        registry[tag] = label;
    });
})(GgufValueType || (GgufValueType = {}));
/** GGUF file magic: the ASCII bytes "GGUF" read as a little-endian u32. */
const GGUF_MAGIC = 0x46554747;
/** KV-cache file magic: the ASCII bytes "RVKV" read as a little-endian u32. */
const RVKV_MAGIC = 0x564B5652;
/** Format version stored in the KV-cache file header. */
const RVKV_VERSION = 1;
|
|
37
|
-
// ── Internal Buffer Reader ──────────────────────────────────
|
|
38
|
-
/**
 * Sequential little-endian reader over a Buffer.
 *
 * Every read starts at `offset` and moves `offset` past the bytes it consumed.
 * The cursor only moves after the underlying Buffer read has succeeded, so a
 * read that throws leaves `offset` where it was.
 */
class BufferReader {
    buf;
    offset = 0;
    constructor(buf) {
        this.buf = buf;
    }
    /** Number of bytes between the cursor and the end of the buffer. */
    get remaining() {
        return this.buf.length - this.offset;
    }
    /** Move the cursor forward by `width` bytes and hand back the value just read. */
    #consumed(value, width) {
        this.offset += width;
        return value;
    }
    readU8() {
        return this.#consumed(this.buf.readUInt8(this.offset), 1);
    }
    readI8() {
        return this.#consumed(this.buf.readInt8(this.offset), 1);
    }
    readU16() {
        return this.#consumed(this.buf.readUInt16LE(this.offset), 2);
    }
    readI16() {
        return this.#consumed(this.buf.readInt16LE(this.offset), 2);
    }
    readU32() {
        return this.#consumed(this.buf.readUInt32LE(this.offset), 4);
    }
    readI32() {
        return this.#consumed(this.buf.readInt32LE(this.offset), 4);
    }
    readF32() {
        return this.#consumed(this.buf.readFloatLE(this.offset), 4);
    }
    readF64() {
        return this.#consumed(this.buf.readDoubleLE(this.offset), 8);
    }
    /** @returns {bigint} unsigned 64-bit value */
    readU64() {
        return this.#consumed(this.buf.readBigUInt64LE(this.offset), 8);
    }
    /** @returns {bigint} signed 64-bit value */
    readI64() {
        return this.#consumed(this.buf.readBigInt64LE(this.offset), 8);
    }
    /**
     * Read a u64 and convert it to a Number. The conversion is not range-checked;
     * values above 2^53 lose precision.
     */
    readU64AsNumber() {
        return Number(this.readU64());
    }
    /** A single byte; any non-zero value is true. */
    readBool() {
        return this.readU8() !== 0;
    }
    /**
     * Read a GGUF string: a u64 little-endian byte length followed by that many
     * UTF-8 bytes.
     * @throws {Error} when the declared length runs past the end of the buffer
     */
    readString() {
        const byteLength = this.readU64AsNumber();
        if (byteLength === 0) {
            return '';
        }
        if (byteLength > this.remaining) {
            throw new Error(`String length ${byteLength} exceeds remaining buffer`);
        }
        const text = this.buf.toString('utf-8', this.offset, this.offset + byteLength);
        return this.#consumed(text, byteLength);
    }
}
|
|
71
|
-
// ── GGUF Value Reading ──────────────────────────────────────
|
|
72
|
-
/**
 * Read one non-array GGUF value of type `t` from `reader`.
 *
 * 64-bit integers are converted to Number. Returns `undefined` (and reads
 * nothing) for ARRAY or any tag that is not a known scalar type, so callers
 * can tell an unsupported tag from a real value.
 */
function readScalar(reader, t) {
    // Integers up to 32 bits
    if (t === GgufValueType.UINT8) return reader.readU8();
    if (t === GgufValueType.INT8) return reader.readI8();
    if (t === GgufValueType.UINT16) return reader.readU16();
    if (t === GgufValueType.INT16) return reader.readI16();
    if (t === GgufValueType.UINT32) return reader.readU32();
    if (t === GgufValueType.INT32) return reader.readI32();
    // Floats, booleans, strings
    if (t === GgufValueType.FLOAT32) return reader.readF32();
    if (t === GgufValueType.BOOL) return reader.readBool();
    if (t === GgufValueType.STRING) return reader.readString();
    // 64-bit values: integers arrive as BigInt and are narrowed to Number
    if (t === GgufValueType.UINT64) return Number(reader.readU64());
    if (t === GgufValueType.INT64) return Number(reader.readI64());
    if (t === GgufValueType.FLOAT64) return reader.readF64();
    return undefined;
}
|
|
90
|
-
/**
 * Read one tagged GGUF value: a u32 type tag followed by the payload.
 *
 * Arrays are encoded as [element type u32][count u64][elements...] and may
 * only hold scalar elements here; the result is a plain JS array.
 *
 * @throws {Error} on a type tag (or array element tag) that readScalar does not handle
 */
function readGgufValue(reader) {
    const tag = reader.readU32();
    if (tag !== GgufValueType.ARRAY) {
        const scalar = readScalar(reader, tag);
        if (scalar === undefined) {
            throw new Error(`Unknown GGUF value type: ${tag}`);
        }
        return scalar;
    }
    const elementTag = reader.readU32();
    const count = reader.readU64AsNumber();
    const items = [];
    // An unknown element tag is only detected once an element is actually read,
    // so an empty array with a bogus element tag still yields [].
    while (items.length < count) {
        const item = readScalar(reader, elementTag);
        if (item === undefined) {
            throw new Error(`Unknown GGUF array element type: ${elementTag}`);
        }
        items.push(item);
    }
    return items;
}
|
|
110
|
-
// ── GGUF Header Parsing ─────────────────────────────────────
|
|
111
|
-
/**
 * Parse the header and metadata from a GGUF file without loading tensors.
 *
 * Only the first 256 KB of the file are read; metadata entries that extend
 * past that window are not returned.
 *
 * @param {string} path - Path to the GGUF file.
 * @returns {Promise<object>} Parsed header fields plus the raw `metadata` map.
 * @throws if the file cannot be opened or read, or if the header is not valid GGUF.
 */
export async function parseGgufHeader(path) {
    const fh = await open(path, 'r');
    try {
        // Stat through the open handle so size and contents come from the same
        // file even if the path is replaced in between.
        const { size } = await fh.stat();
        const window = Buffer.alloc(Math.min(size, 256 * 1024));
        const { bytesRead } = await fh.read(window, 0, window.length, 0);
        // Parse only what was actually read; a short read must not be
        // interpreted as zero-filled header data.
        return parseGgufBuffer(window.subarray(0, bytesRead), size, path);
    }
    finally {
        await fh.close();
    }
}
|
|
128
|
-
/**
 * Decode the GGUF header and as many metadata key/value pairs as fit in `buf`.
 *
 * Layout read here: magic u32 | version u32 | tensor count u64 | kv count u64 |
 * kv entries. Metadata parsing is best-effort: it stops at the first entry
 * that cannot be decoded (normally because `buf` is only a prefix of the file),
 * and everything decoded up to that point is returned.
 *
 * @param {Buffer} buf - Leading bytes of the file.
 * @param {number} fileSize - Full size of the file on disk, reported back as-is.
 * @param {string} filePath - Used only to guess quantization from the file name.
 * @returns {object} Header summary plus the raw `metadata` object.
 * @throws {Error} on a truncated header, bad magic, or unsupported version.
 */
function parseGgufBuffer(buf, fileSize, filePath) {
    if (buf.length < 4) {
        throw new Error(`GGUF header truncated: ${buf.length} byte(s), too short to contain the magic`);
    }
    const reader = new BufferReader(buf);
    const magic = reader.readU32();
    if (magic !== GGUF_MAGIC) {
        throw new Error(`Invalid GGUF magic: 0x${magic.toString(16)} (expected 0x${GGUF_MAGIC.toString(16)})`);
    }
    // magic (4) + version (4) + tensor count (8) + kv count (8)
    if (buf.length < 24) {
        throw new Error(`GGUF header truncated: ${buf.length} byte(s), expected at least 24`);
    }
    const version = reader.readU32();
    if (version < 2 || version > 3) {
        throw new Error(`Unsupported GGUF version: ${version} (expected 2 or 3)`);
    }
    const tensorCount = reader.readU64AsNumber();
    const kvCount = reader.readU64AsNumber();
    const metadata = {};
    for (let i = 0; i < kvCount; i++) {
        // Smallest possible entry: key length (8) + value type tag (4).
        if (reader.remaining < 12)
            break;
        try {
            const key = reader.readString();
            const value = readGgufValue(reader);
            // Keys come from the file. Define an own property so that a key such
            // as "__proto__" is stored as data instead of replacing the prototype.
            Object.defineProperty(metadata, key, {
                value, writable: true, enumerable: true, configurable: true,
            });
        }
        catch {
            break; // reached end of read window
        }
    }
    const arch = asString(metadata['general.architecture']);
    const pfx = arch || 'llama'; // fallback prefix for well-known keys
    return {
        magic: 'GGUF', version, tensorCount, kvCount,
        architecture: arch,
        name: asString(metadata['general.name']),
        contextLength: asNumber(metadata[`${pfx}.context_length`]),
        embeddingLength: asNumber(metadata[`${pfx}.embedding_length`]),
        blockCount: asNumber(metadata[`${pfx}.block_count`]),
        vocabSize: inferVocabSize(metadata),
        quantization: inferQuantFromMetadata(metadata, filePath),
        fileSize, metadata,
    };
}
|
|
166
|
-
// ── Metadata Helpers ────────────────────────────────────────
|
|
167
|
-
/** Pass strings through unchanged; map every other value to undefined. */
function asString(v) {
    if (typeof v !== 'string') {
        return undefined;
    }
    return v;
}
|
|
168
|
-
/** Pass numbers (including NaN) through unchanged; map every other value to undefined. */
function asNumber(v) {
    if (typeof v !== 'number') {
        return undefined;
    }
    return v;
}
|
|
169
|
-
/** File-name patterns and the quantization label each one maps to; first match wins. */
const QUANT_RE = [
    [/q2_k/i, 'Q2_K'],
    [/q3_k_s/i, 'Q3_K_S'],
    [/q3_k_m/i, 'Q3_K_M'],
    [/q3_k_l/i, 'Q3_K_L'],
    [/q4_k_s/i, 'Q4_K_S'],
    [/q4_k_m/i, 'Q4_K_M'],
    [/q4_0/i, 'Q4_0'],
    [/q4_1/i, 'Q4_1'],
    [/q5_k_s/i, 'Q5_K_S'],
    [/q5_k_m/i, 'Q5_K_M'],
    [/q5_0/i, 'Q5_0'],
    [/q5_1/i, 'Q5_1'],
    [/q6_k/i, 'Q6_K'],
    [/q8_0/i, 'Q8_0'],
    [/f16/i, 'F16'],
    [/f32/i, 'F32'],
];
/**
 * Work out a quantization label for a model.
 *
 * A numeric `general.file_type` in the metadata takes priority and is reported
 * as `file_type_<n>`. Otherwise the file's base name (directories ignored) is
 * matched against QUANT_RE; with no match the result is 'unknown'.
 *
 * @param {object} meta - Parsed GGUF metadata.
 * @param {string} filePath - Path of the model file.
 * @returns {string}
 */
function inferQuantFromMetadata(meta, filePath) {
    const fileType = meta['general.file_type'];
    if (typeof fileType === 'number') {
        return `file_type_${fileType}`;
    }
    const fileName = basename(filePath);
    const hit = QUANT_RE.find(([pattern]) => pattern.test(fileName));
    return hit ? hit[1] : 'unknown';
}
|
|
185
|
-
/**
 * Vocabulary size for a model: the length of the `tokenizer.ggml.tokens` array
 * when that array is present, otherwise a numeric `tokenizer.ggml.vocab_size`,
 * otherwise undefined.
 */
function inferVocabSize(meta) {
    const tokenList = meta['tokenizer.ggml.tokens'];
    if (Array.isArray(tokenList)) {
        return tokenList.length;
    }
    const declared = meta['tokenizer.ggml.vocab_size'];
    return typeof declared === 'number' ? declared : undefined;
}
|
|
191
|
-
// ── GGUF Engine ─────────────────────────────────────────────
|
|
192
|
-
/**
 * Reads GGUF model metadata, optionally runs inference through node-llama-cpp,
 * and keeps an in-memory key/value cache that can be saved to and restored
 * from an RVKV file.
 *
 * When node-llama-cpp cannot be imported, or a model fails to load through it,
 * generate() and stream() return a "[metadata-only]" description of the model
 * instead of model output.
 */
export class GgufEngine {
    config;
    llamaCpp = null;
    llamaModel = null;
    llamaContext = null;
    loadedModels = new Map();
    activeModelPath = null;
    kvCache = new Map();
    /**
     * @param {object} [config]
     * @param {number} [config.contextSize=4096] - Context size requested when creating a llama context.
     * @param {number} [config.maxTokens=512] - Default generation limit.
     * @param {number} [config.temperature=0.7] - Default sampling temperature.
     * @param {string} [config.kvCachePath=''] - Default KV-cache file; empty disables persistence on shutdown.
     * @param {boolean} [config.verbose=false] - Log progress and swallowed errors to the console.
     */
    constructor(config = {}) {
        this.config = {
            contextSize: config.contextSize ?? 4096,
            maxTokens: config.maxTokens ?? 512,
            temperature: config.temperature ?? 0.7,
            kvCachePath: config.kvCachePath ?? '',
            verbose: config.verbose ?? false,
        };
    }
    /** Probe for node-llama-cpp availability. */
    async initialize() {
        this.llamaCpp = await this.tryLoadLlamaCpp();
        if (this.config.verbose) {
            console.log(`[gguf-engine] node-llama-cpp: ${this.llamaCpp ? 'available' : 'not found (metadata-only mode)'}`);
        }
    }
    /**
     * Parse the GGUF header at `path`, make it the active model, and, when
     * node-llama-cpp is available, load it for inference.
     *
     * A failure inside node-llama-cpp is not fatal: the engine stays in
     * metadata-only mode for this model and the parsed header is still returned.
     *
     * @param {string} path
     * @returns {Promise<object>} The parsed header.
     * @throws if the GGUF header cannot be read or parsed.
     */
    async loadModel(path) {
        const meta = await parseGgufHeader(path);
        this.loadedModels.set(path, meta);
        this.activeModelPath = path;
        if (this.llamaCpp) {
            // Free the handles of a previously loaded model before replacing them.
            await this.#release(this.llamaContext);
            await this.#release(this.llamaModel);
            this.llamaContext = null;
            this.llamaModel = null;
            try {
                const { getLlama } = this.llamaCpp;
                const llama = await getLlama();
                this.llamaModel = await llama.loadModel({ modelPath: path });
                this.llamaContext = await this.llamaModel.createContext({ contextSize: this.config.contextSize });
                if (this.config.verbose)
                    console.log(`[gguf-engine] Model loaded: ${basename(path)}`);
            }
            catch (err) {
                if (this.config.verbose)
                    console.warn('[gguf-engine] node-llama-cpp load failed:', err);
                // The model may have loaded before context creation failed.
                await this.#release(this.llamaModel);
                this.llamaModel = null;
                this.llamaContext = null;
            }
        }
        return meta;
    }
    /**
     * Generate text. Delegates to node-llama-cpp or returns a metadata-only stub.
     *
     * Never rejects because of an inference failure: the error is logged in
     * verbose mode and the metadata-only stub is returned instead.
     *
     * @param {object} request - `prompt`, plus optional `model`, `maxTokens`,
     *   `temperature`, `stopSequences`. `model` only affects the reported model
     *   name and which loaded metadata the stub describes.
     * @returns {Promise<{text: string, model: string, tokensUsed: number, latencyMs: number, metadataOnly: boolean}>}
     */
    async generate(request) {
        const start = performance.now();
        const modelPath = request.model ?? this.activeModelPath;
        const modelName = modelPath ? basename(modelPath) : 'none';
        if (this.llamaContext && this.llamaModel) {
            let sequence = null;
            try {
                sequence = this.llamaContext.getSequence();
                const session = new this.llamaCpp.LlamaChatSession({
                    contextSequence: sequence,
                });
                const text = await session.prompt(request.prompt, {
                    maxTokens: request.maxTokens ?? this.config.maxTokens,
                    temperature: request.temperature ?? this.config.temperature,
                    stopGenerationTrigger: request.stopSequences
                        ? request.stopSequences.map((s) => new this.llamaCpp.LlamaText([s]))
                        : undefined,
                });
                // Take the count from the sequence that actually ran the prompt;
                // fall back to the ~4 chars per token heuristic when it reports none.
                const tokensUsed = sequence.tokenCount ?? Math.ceil(text.length / 4);
                return {
                    text, model: modelName, tokensUsed,
                    latencyMs: performance.now() - start, metadataOnly: false,
                };
            }
            catch (err) {
                if (this.config.verbose)
                    console.warn('[gguf-engine] Generation failed:', err);
            }
            finally {
                // Hand the sequence back so later calls can acquire one.
                await this.#release(sequence);
            }
        }
        // Metadata-only fallback
        const meta = modelPath ? this.loadedModels.get(modelPath) : undefined;
        return {
            text: meta
                ? `[metadata-only] Model: ${meta.name ?? modelName}, arch: ${meta.architecture ?? 'unknown'}, ctx: ${meta.contextLength ?? 'unknown'}`
                : '[metadata-only] No model loaded',
            model: modelName, tokensUsed: 0,
            latencyMs: performance.now() - start, metadataOnly: true,
        };
    }
    /**
     * Stream generated text via async iterator.
     *
     * If the session's promptWithMeta() result is async-iterable, its chunks
     * are yielded as they arrive. If it is a promise instead, the settled
     * response is yielded as a single chunk. If neither produces text, the
     * full generate() response is yielded as one chunk.
     *
     * A failure after some text has already been yielded ends the stream
     * (logged in verbose mode) rather than emitting the whole response again.
     *
     * @param {object} request - Same shape as for generate().
     * @returns {AsyncGenerator<string>}
     */
    async *stream(request) {
        if (this.llamaContext && this.llamaModel) {
            let sequence = null;
            let emitted = false;
            try {
                sequence = this.llamaContext.getSequence();
                const session = new this.llamaCpp.LlamaChatSession({
                    contextSequence: sequence,
                });
                const result = session.promptWithMeta(request.prompt, {
                    maxTokens: request.maxTokens ?? this.config.maxTokens,
                    temperature: request.temperature ?? this.config.temperature,
                });
                if (result && typeof result[Symbol.asyncIterator] === 'function') {
                    for await (const chunk of result) {
                        if (typeof chunk === 'string') {
                            emitted = true;
                            yield chunk;
                        }
                        else if (chunk?.text) {
                            emitted = true;
                            yield chunk.text;
                        }
                    }
                    return;
                }
                // Not iterable: settle the result here so it cannot reject unobserved
                // or keep using the sequence after it has been released.
                // NOTE(review): assumes a promise result resolves to a string or to an
                // object with a string `responseText` -- confirm against the installed
                // node-llama-cpp version.
                const settled = await result;
                const whole = typeof settled === 'string' ? settled : settled?.responseText;
                if (typeof whole === 'string') {
                    emitted = true;
                    yield whole;
                    return;
                }
            }
            catch (err) {
                if (this.config.verbose)
                    console.warn('[gguf-engine] Streaming failed:', err);
                // Partial output is already with the consumer; re-running the prompt
                // would duplicate it.
                if (emitted)
                    return;
                /* otherwise fall through to single-chunk fallback */
            }
            finally {
                await this.#release(sequence);
            }
        }
        const response = await this.generate(request);
        yield response.text;
    }
    /**
     * Persist the KV cache to an RVF-compatible binary file.
     * Format: RVKV magic | version u32 | model SHA-256 (32B) | entry count u32
     *         entries: [key_len u32, val_len u32, key, val] | footer SHA-256 (32B)
     *
     * The "model SHA-256" is a hash of the active model's path string (or of
     * 'no-model'), not of the model file contents. The footer hashes the entry
     * section only.
     *
     * @param {string} [outputPath] - Defaults to `config.kvCachePath`.
     * @throws {Error} when neither `outputPath` nor `config.kvCachePath` is set.
     */
    async persistKvCache(outputPath) {
        const path = outputPath || this.config.kvCachePath;
        if (!path)
            throw new Error('No KV cache output path specified');
        const modelHash = createHash('sha256').update(this.activeModelPath ?? 'no-model').digest();
        const entryBufs = [];
        for (const [key, value] of this.kvCache) {
            const keyBuf = Buffer.from(key, 'utf-8');
            // setKvEntry() accepts any value; normalize so the recorded length is a
            // byte length and Buffer.concat() receives binary data.
            const valBuf = Buffer.isBuffer(value) ? value : Buffer.from(value);
            const hdr = Buffer.alloc(8);
            hdr.writeUInt32LE(keyBuf.length, 0);
            hdr.writeUInt32LE(valBuf.length, 4);
            entryBufs.push(hdr, keyBuf, valBuf);
        }
        const entryData = Buffer.concat(entryBufs);
        const footer = createHash('sha256').update(entryData).digest();
        const header = Buffer.alloc(44);
        header.writeUInt32LE(RVKV_MAGIC, 0);
        header.writeUInt32LE(RVKV_VERSION, 4);
        modelHash.copy(header, 8);
        header.writeUInt32LE(this.kvCache.size, 40);
        await writeFile(path, Buffer.concat([header, entryData, footer]));
        if (this.config.verbose)
            console.log(`[gguf-engine] KV cache persisted: ${this.kvCache.size} entries`);
    }
    /**
     * Restore KV cache from an RVF-compatible binary file, replacing the
     * current in-memory cache. Nothing is replaced unless the whole file
     * parses and its footer hash matches.
     *
     * The model hash stored in the header is not compared with the active model.
     *
     * @param {string} inputPath
     * @throws {Error} on a short/truncated file, wrong magic or version, or hash mismatch.
     */
    async loadKvCache(inputPath) {
        const data = await readFile(inputPath);
        if (data.length < 44)
            throw new Error('KV cache file too small');
        const magic = data.readUInt32LE(0);
        if (magic !== RVKV_MAGIC)
            throw new Error(`Invalid KV cache magic: 0x${magic.toString(16)}`);
        const version = data.readUInt32LE(4);
        if (version !== RVKV_VERSION)
            throw new Error(`Unsupported KV cache version: ${version}`);
        const entryCount = data.readUInt32LE(40);
        let offset = 44;
        const entries = new Map();
        for (let i = 0; i < entryCount; i++) {
            if (offset + 8 > data.length)
                throw new Error('KV cache file truncated');
            const keyLen = data.readUInt32LE(offset);
            const valLen = data.readUInt32LE(offset + 4);
            offset += 8;
            if (offset + keyLen + valLen > data.length)
                throw new Error('KV cache file truncated');
            // Copy the value out so it does not keep a view into the file buffer.
            entries.set(data.toString('utf-8', offset, offset + keyLen), Buffer.from(data.subarray(offset + keyLen, offset + keyLen + valLen)));
            offset += keyLen + valLen;
        }
        // Verify footer hash (mandatory)
        if (offset + 32 > data.length) {
            throw new Error('KV cache file missing SHA256 footer');
        }
        const stored = data.subarray(offset, offset + 32);
        const computed = createHash('sha256').update(data.subarray(44, offset)).digest();
        if (!stored.equals(computed))
            throw new Error('KV cache integrity check failed: hash mismatch');
        this.kvCache = entries;
        if (this.config.verbose)
            console.log(`[gguf-engine] KV cache loaded: ${entries.size} entries`);
    }
    /** Return metadata for all loaded models. */
    getLoadedModels() { return Array.from(this.loadedModels.values()); }
    /** Store a key-value pair in the in-memory KV cache. */
    setKvEntry(key, value) { this.kvCache.set(key, value); }
    /** Retrieve a key-value pair from the in-memory KV cache. */
    getKvEntry(key) { return this.kvCache.get(key); }
    /**
     * Release resources, unload models, and optionally persist the KV cache.
     * The cache is written first (only when `config.kvCachePath` is set and the
     * cache is non-empty); a failed write is logged in verbose mode and does not
     * stop the rest of the shutdown.
     */
    async shutdown() {
        if (this.config.kvCachePath && this.kvCache.size > 0) {
            try {
                await this.persistKvCache(this.config.kvCachePath);
            }
            catch (err) {
                if (this.config.verbose)
                    console.warn('[gguf-engine] KV persist failed:', err);
            }
        }
        await this.#release(this.llamaContext);
        await this.#release(this.llamaModel);
        this.llamaContext = null;
        this.llamaModel = null;
        this.activeModelPath = null;
        this.loadedModels.clear();
        this.kvCache.clear();
        if (this.config.verbose)
            console.log('[gguf-engine] Shutdown complete');
    }
    // ── Private ───────────────────────────────────────────────
    /** Import the optional node-llama-cpp peer; resolves to null when it is not installed. */
    async tryLoadLlamaCpp() {
        // @ts-ignore -- optional peer dependency, may not be installed
        try {
            return await import('node-llama-cpp');
        }
        catch {
            return null;
        }
    }
    /** Call `dispose()` on a handle if it has one; cleanup errors are ignored. */
    async #release(resource) {
        if (typeof resource?.dispose !== 'function')
            return;
        try {
            await resource.dispose();
        }
        catch { /* best-effort cleanup */ }
    }
}
|
|
425
|
-
//# sourceMappingURL=gguf-engine.js.map
|
|
@@ -1,231 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* ruvLLM Bridge -- Local Language Model Inference
|
|
3
|
-
*
|
|
4
|
-
* Provides 3-tier routing for on-device GGUF model inference:
|
|
5
|
-
* Tier 1: Agent Booster (WASM, <1ms) -- simple transforms
|
|
6
|
-
* Tier 2: Local model via GGUF engine (~200ms) -- routing, classification
|
|
7
|
-
* Tier 3: Cloud API (2-5s) -- complex reasoning
|
|
8
|
-
*
|
|
9
|
-
* The bridge degrades gracefully when no local models are available.
|
|
10
|
-
*
|
|
11
|
-
* @module moflo/appliance/ruvllm-bridge
|
|
12
|
-
*/
|
|
13
|
-
import { readdir, stat } from 'node:fs/promises';
|
|
14
|
-
import { join, extname, basename } from 'node:path';
|
|
15
|
-
/** Bridge settings used for any key the caller's config does not supply. */
const DEFAULT_CONFIG = {
    modelsDir: './models',
    defaultModel: '',
    maxTokens: 512,
    temperature: 0.7,
    contextSize: 4096,
    kvCachePath: '',
    verbose: false,
};
|
|
19
|
-
// ── Quantization / parameter heuristics ─────────────────────
|
|
20
|
-
/** File-name patterns and the lowercase quantization label each one maps to; first match wins. */
const QUANT_PATTERNS = [
    [/q4_k_m/i, 'q4_k_m'],
    [/q4_k_s/i, 'q4_k_s'],
    [/q4_0/i, 'q4_0'],
    [/q5_k_m/i, 'q5_k_m'],
    [/q5_0/i, 'q5_0'],
    [/q8_0/i, 'q8_0'],
    [/f16/i, 'f16'],
    [/f32/i, 'f32'],
];
/**
 * Guess a model's quantization from its file name.
 * @param {string} filename
 * @returns {string} A label from QUANT_PATTERNS, or 'unknown' when none matches.
 */
function inferQuantization(filename) {
    const hit = QUANT_PATTERNS.find(([pattern]) => pattern.test(filename));
    return hit ? hit[1] : 'unknown';
}
|
|
31
|
-
/**
 * Guess a model's parameter count from its file name.
 *
 * Looks for the first number that is followed by "b" (optionally separated by
 * ".", "_" or "-"), e.g. "llama-7b" -> "7B", "qwen-1.5b" -> "1.5B",
 * "mixtral-70-B" -> "70B". The fractional part of a decimal size is kept.
 *
 * @param {string} filename
 * @returns {string} The size in billions with a "B" suffix, or 'unknown'.
 */
function inferParameters(filename) {
    const m = filename.match(/(\d+(?:\.\d+)?)[._-]?b/i);
    return m ? `${m[1]}B` : 'unknown';
}
|
|
35
|
-
// ── Complexity heuristic ────────────────────────────────────
|
|
36
|
-
/** Words that push a task description towards "complex". */
const HIGH = new Set([
    'architect',
    'design',
    'refactor',
    'security',
    'audit',
    'complex',
    'analyze',
    'distributed',
    'concurrent',
    'algorithm',
    'investigate',
    'optimize',
    'debug',
    'system',
    'integration',
]);
/** Words that push a task description towards "trivial". */
const LOW = new Set([
    'rename',
    'typo',
    'format',
    'comment',
    'version',
    'bump',
    'move',
    'copy',
    'delete',
    'simple',
    'config',
]);
/**
 * Score a task description between 0 (trivial) and 1 (complex).
 *
 * Starts at 0.3, adds 0.15 for every HIGH word and subtracts 0.1 for every LOW
 * word (whitespace-separated, case-insensitive, exact matches only), then adds
 * a length bonus of one point per 200 words capped at 0.2, and clamps to [0, 1].
 *
 * @param {string} desc
 * @returns {number}
 */
function estimateComplexity(desc) {
    const tokens = desc.toLowerCase().split(/\s+/);
    const keywordScore = tokens.reduce((running, token) => {
        let next = running;
        if (HIGH.has(token)) {
            next += 0.15;
        }
        if (LOW.has(token)) {
            next -= 0.1;
        }
        return next;
    }, 0.3);
    const lengthBonus = Math.min(0.2, tokens.length / 200);
    const raw = keywordScore + lengthBonus;
    return Math.max(0, Math.min(1, raw));
}
|
|
56
|
-
// ── Bridge ──────────────────────────────────────────────────
|
|
57
|
-
/**
 * Routes prompts across three tiers: canned "agent booster" answers for a few
 * trivial requests (tier 1), a local GGUF model (tier 2), and an empty
 * "cloud-fallback" response that the caller is expected to handle (tier 3).
 */
export class RuvllmBridge {
    config;
    models = new Map();
    activeModel = null;
    kvCacheEntries = 0;
    ggufEngine = null;
    /**
     * @param {object} config - Merged over DEFAULT_CONFIG; `modelsDir` is required.
     * @throws {Error} when `config.modelsDir` is missing or empty.
     */
    constructor(config) {
        if (!config.modelsDir)
            throw new Error('RuvllmConfig.modelsDir is required');
        this.config = { ...DEFAULT_CONFIG, ...config };
    }
    /**
     * Initialize GGUF engine and scan modelsDir.
     * Neither step is fatal: without the engine the bridge still serves
     * tiers 1 and 3, and a missing models directory just means no models.
     */
    async initialize() {
        // Initialize GGUF engine for local model inference
        try {
            const { GgufEngine } = await import('./gguf-engine.js');
            this.ggufEngine = new GgufEngine({
                contextSize: this.config.contextSize,
                maxTokens: this.config.maxTokens,
                temperature: this.config.temperature,
                kvCachePath: this.config.kvCachePath,
                verbose: this.config.verbose,
            });
            await this.ggufEngine.initialize();
        }
        catch {
            // GGUF engine is optional
        }
        await this.scanModelsDir();
        if (this.config.verbose) {
            const pkgs = [
                this.ggufEngine && 'gguf-engine',
            ].filter(Boolean);
            if (pkgs.length)
                console.log(`[ruvLLM] Loaded: ${pkgs.join(', ')}`);
            console.log(`[ruvLLM] ${this.models.size} model(s) in ${this.config.modelsDir}`);
        }
    }
    /** Return all discovered GGUF models. */
    async listModels() {
        return Array.from(this.models.values());
    }
    /**
     * Load a model into memory (delegates to GGUF engine) and make it the
     * active model.
     * @param {string} name - Model name as registered by the directory scan.
     * @throws {Error} when no model with that name was discovered.
     */
    async loadModel(name) {
        const info = this.models.get(name);
        if (!info)
            throw new Error(`Model "${name}" not found. Available: ${[...this.models.keys()].join(', ')}`);
        if (this.ggufEngine) {
            const meta = await this.ggufEngine.loadModel(info.path);
            // NOTE(review): this replaces the size guessed from the file name
            // (e.g. "7B") with the architecture string -- confirm that is intended.
            if (meta.architecture)
                info.parameters = meta.architecture;
            if (meta.quantization)
                info.quantization = meta.quantization;
        }
        info.loaded = true;
        this.activeModel = name;
    }
    /**
     * Generate text from a prompt. Routes through tiers:
     *   1. Agent Booster (trivial transforms, no LLM).
     *   2. Local GGUF model.
     *   3. Cloud fallback (empty response -- caller handles upstream).
     *
     * The model is chosen in this order: `request.model`, then
     * `config.defaultModel`, then the active model. Empty values are skipped.
     *
     * @param {object} request - `prompt`, plus optional `model`, `maxTokens`,
     *   `temperature`, `stopSequences`.
     * @returns {Promise<{text: string, model: string, tokensUsed: number, latencyMs: number, tier: number, cached: boolean}>}
     */
    async generate(request) {
        const start = performance.now();
        // `||` rather than `??`: defaultModel defaults to '' and an empty name must
        // not block the fallback to the active model.
        const modelName = request.model || this.config.defaultModel || this.activeModel || '';
        // Tier 1: Agent Booster
        const booster = this.tryAgentBooster(request.prompt);
        if (booster !== null) {
            return { text: booster, model: 'agent-booster', tokensUsed: 0, latencyMs: performance.now() - start, tier: 1, cached: false };
        }
        // Tier 2: Local model via GGUF engine
        const info = this.models.get(modelName);
        if (info?.loaded && this.ggufEngine) {
            try {
                const r = await this.ggufEngine.generate({
                    prompt: request.prompt,
                    maxTokens: request.maxTokens ?? this.config.maxTokens,
                    temperature: request.temperature ?? this.config.temperature,
                    stopSequences: request.stopSequences,
                });
                // NOTE(review): the engine's `metadataOnly` flag is not inspected, so its
                // "[metadata-only]" stub is returned as a tier-2 answer -- confirm.
                return { text: r.text, model: modelName, tokensUsed: r.tokensUsed, latencyMs: performance.now() - start, tier: 2, cached: false };
            }
            catch (err) {
                if (this.config.verbose)
                    console.warn('[ruvLLM] Local generation failed, tier 3 fallback:', err);
            }
        }
        // Tier 3: Cloud fallback
        return { text: '', model: 'cloud-fallback', tokensUsed: 0, latencyMs: performance.now() - start, tier: 3, cached: false };
    }
    /**
     * Route a task description to the optimal tier using complexity heuristics.
     * Short, low-complexity tasks go to tier 1; moderate ones go to the active
     * local model when there is one; everything else goes to tier 3.
     * @param {string} description
     * @returns {Promise<{tier: number, model: string, confidence: number}>}
     */
    async routeTask(description) {
        const complexity = estimateComplexity(description);
        const words = description.split(/\s+/).length;
        if (words < 15 && complexity < 0.25)
            return { tier: 1, model: 'agent-booster', confidence: 0.9 };
        if (complexity < 0.55 && this.activeModel)
            return { tier: 2, model: this.activeModel, confidence: 0.7 };
        return { tier: 3, model: 'cloud', confidence: 0.6 };
    }
    /** Return current bridge status. */
    async getStatus() {
        return {
            available: this.models.size > 0,
            modelsLoaded: [...this.models.values()].filter((m) => m.loaded).map((m) => m.name),
            kvCacheSize: this.kvCacheEntries,
        };
    }
    /** Unload models and clean up. Discovered models stay listed but are marked not loaded. */
    async shutdown() {
        if (this.ggufEngine) {
            await this.ggufEngine.shutdown();
            this.ggufEngine = null;
        }
        for (const info of this.models.values())
            info.loaded = false;
        this.activeModel = null;
        this.kvCacheEntries = 0;
    }
    // ── Private ───────────────────────────────────────────────
    /**
     * Register every regular file in modelsDir whose extension is ".gguf"
     * (any letter case) under its file name without that extension.
     */
    async scanModelsDir() {
        let entries;
        try {
            entries = await readdir(this.config.modelsDir);
        }
        catch {
            // modelsDir may not exist -- tier 1 and tier 3 still work
            return;
        }
        for (const entry of entries) {
            const ext = extname(entry);
            if (ext.toLowerCase() !== '.gguf')
                continue;
            const fullPath = join(this.config.modelsDir, entry);
            let s;
            try {
                s = await stat(fullPath);
            }
            catch {
                // One unreadable entry (e.g. a dangling symlink) must not hide the rest.
                continue;
            }
            if (!s.isFile())
                continue;
            // Strip the extension exactly as it appears so ".GGUF" is removed too.
            const name = basename(entry, ext);
            this.models.set(name, {
                name, path: fullPath, format: 'gguf',
                quantization: inferQuantization(entry), size: s.size,
                parameters: inferParameters(entry), loaded: false,
            });
        }
    }
    /**
     * Tier-1 Agent Booster: handle trivial transforms without any LLM.
     * @param {string} prompt
     * @returns {string|null} A canned instruction, or null when the prompt is
     *   longer than 200 characters or matches none of the known requests.
     */
    tryAgentBooster(prompt) {
        const t = prompt.trim();
        if (t.length > 200)
            return null;
        if (/^(convert|change)\s+(var|let)\s+to\s+const$/i.test(t)) {
            return 'Use the Edit tool to replace `var`/`let` declarations with `const`.';
        }
        if (/^remove\s+console\.(log|warn|error|debug|info)$/i.test(t)) {
            const m = t.toLowerCase().match(/console\.(\w+)/)?.[1] ?? 'log';
            return `Use the Edit tool to remove all \`console.${m}\` calls.`;
        }
        return null;
    }
}
|
|
215
|
-
// ── Singleton accessor ──────────────────────────────────────
|
|
216
|
-
let instance = null;
|
|
217
|
-
/**
 * Return the shared RuvllmBridge, creating it from `config` on first use.
 * Once the instance exists, `config` is ignored on later calls.
 * @throws {Error} when no instance exists yet and no config is given.
 */
export function getRuvllmBridge(config) {
    if (instance) {
        return instance;
    }
    if (!config) {
        throw new Error('ruvLLM bridge not initialized. Call with config first.');
    }
    instance = new RuvllmBridge(config);
    return instance;
}
|
|
225
|
-
/**
 * Forget the shared bridge instance so the next getRuvllmBridge() call builds
 * a new one (useful for tests). The old instance is not shut down.
 */
export function resetRuvllmBridge() {
    instance = null;
}
|
|
227
|
-
/**
 * Report whether the ruvLLM bridge can be used.
 * @returns {Promise<boolean>} Always resolves to true; no probing is done.
 */
export async function isRuvllmAvailable() {
    // Pure TS implementation — always available
    const available = true;
    return available;
}
|
|
231
|
-
//# sourceMappingURL=ruvllm-bridge.js.map
|