ruvnet-kb-first 6.4.0 → 6.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,472 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * KB Export to WASM-Compatible Binary Format
5
+ *
6
+ * Exports the ask_ruvnet knowledge base to a compact binary format
7
+ * that can be loaded by RvLite WASM at runtime.
8
+ *
9
+ * Features:
10
+ * - Binary quantization (32x compression)
11
+ * - Version hashing for auto-update detection
12
+ * - Cross-platform compatible (Windows, Mac, Linux)
13
+ * - Chunked database reads (rows are fetched page by page)
14
+ *
15
+ * Usage:
16
+ * node scripts/kb-export-wasm.js --schema ask_ruvnet --output kb-data/
17
+ */
18
+
19
+ import pg from 'pg';
20
+ import fs from 'fs';
21
+ import path from 'path';
22
+ import crypto from 'crypto';
23
+
24
+ // ============================================================================
25
+ // Configuration
26
+ // ============================================================================
27
+
28
/**
 * Script-wide settings.
 *
 * `db` is passed straight to `new pg.Client(...)`; host, port and password can
 * be overridden with the KB_HOST, KB_PORT and KB_PASSWORD environment variables.
 * `export` controls paging and the on-disk vector format.
 */
const CONFIG = {
  db: {
    host: process.env.KB_HOST || 'localhost',
    // Explicit radix; a non-numeric KB_PORT falls back to the default port
    // instead of producing NaN.
    port: Number.parseInt(process.env.KB_PORT || '5435', 10) || 5435,
    database: 'postgres',
    user: 'postgres',
    // NOTE(review): hard-coded fallback credential kept for backward
    // compatibility; prefer always supplying KB_PASSWORD.
    password: process.env.KB_PASSWORD || 'guruKB2025',
  },
  export: {
    chunkSize: 1000, // Rows per chunk
    embeddingDim: 384, // all-MiniLM-L6-v2 dimensions
    quantization: 'binary', // 'binary' (32x), 'scalar' (4x), or 'none'
  },
};
42
+
43
+ // ============================================================================
44
+ // Binary Quantization
45
+ // ============================================================================
46
+
47
/**
 * Pack the sign pattern of an embedding into bits (32x smaller than float32).
 *
 * Component `i` sets one bit when it is strictly greater than zero; every
 * other value (zero, negative, NaN) leaves the bit clear. Bits are stored
 * most-significant first, so component 0 is the top bit of byte 0.
 *
 * @param {ArrayLike<number>} embedding - Vector components.
 * @returns {Uint8Array} `ceil(embedding.length / 8)` packed bytes.
 */
function binaryQuantize(embedding) {
  const dims = embedding.length;
  const packed = new Uint8Array(Math.ceil(dims / 8));
  for (let idx = 0; idx < dims; idx += 1) {
    if (!(embedding[idx] > 0)) {
      continue;
    }
    // idx >> 3 selects the byte, 0x80 >> (idx & 7) the MSB-first bit within it.
    packed[idx >> 3] |= 0x80 >> (idx & 7);
  }
  return packed;
}
60
+
61
/**
 * Compute the Hamming distance (number of differing bits) between two
 * equal-length integer vectors, typically the packed bytes produced by
 * binaryQuantize().
 *
 * @param {ArrayLike<number>} a - First vector.
 * @param {ArrayLike<number>} b - Second vector; must have the same length as `a`.
 * @returns {number} Count of bit positions in which `a` and `b` differ.
 * @throws {RangeError} If the two vectors have different lengths.
 */
function hammingDistance(a, b) {
  // A missing element would silently XOR as 0 and extra elements would be
  // ignored, so a length mismatch is reported instead of returning a wrong count.
  if (a.length !== b.length) {
    throw new RangeError(`Vector length mismatch: ${a.length} vs ${b.length}`);
  }

  let distance = 0;
  for (let i = 0; i < a.length; i++) {
    let diff = a[i] ^ b[i];
    // Clear the lowest set bit each round. Unlike an arithmetic `>>= 1`
    // loop, this also terminates when bit 31 is set (negative 32-bit value).
    while (diff !== 0) {
      diff &= diff - 1;
      distance++;
    }
  }
  return distance;
}
75
+
76
+ // ============================================================================
77
+ // Export Functions
78
+ // ============================================================================
79
+
80
/**
 * Normalize an embedding column value to an array-like of numbers.
 *
 * node-postgres hands back values of types it has no parser for as text, so a
 * pgvector column may arrive as the string "[0.1,0.2,...]" rather than an array.
 *
 * @param {string|ArrayLike<number>} raw - Value of the `embedding` column.
 * @returns {ArrayLike<number>} The embedding components.
 * @throws {TypeError} If the value is neither an array-like nor a JSON array string.
 */
function parseEmbedding(raw) {
  if (typeof raw === 'string') {
    const parsed = JSON.parse(raw);
    if (!Array.isArray(parsed)) {
      throw new TypeError('Embedding text did not parse to an array');
    }
    return parsed;
  }
  if (Array.isArray(raw) || ArrayBuffer.isView(raw)) {
    return raw;
  }
  throw new TypeError(`Unsupported embedding value of type ${typeof raw}`);
}

/**
 * Export one knowledge-base schema to the on-disk bundle read by the
 * generated loader.
 *
 * Reads every non-duplicate row of `<schema>.architecture_docs` that has an
 * embedding (paged by CONFIG.export.chunkSize, ordered by id) and writes into
 * `outputDir`:
 *   - kb-metadata.json   counts, categories, format info and content hash
 *   - kb-entries.json    id, title, category, quality, first 500 chars of content
 *   - kb-embeddings.bin  fixed-stride vectors (bit-packed or raw float32)
 *   - kb-loader.js       written by generateLoader()
 *
 * The content hash is the first 16 hex chars of MD5 over the rows'
 * `id:title:category` joined with `|`, i.e. the same value the loader's
 * checkForUpdates() query computes in PostgreSQL, so the two are comparable.
 *
 * @param {string} schema - PostgreSQL schema name; must be a plain identifier.
 * @param {string} outputDir - Directory to write into (created if missing).
 * @returns {Promise<object>} The metadata object that was written.
 * @throws {Error} If the schema name is not a plain identifier, an embedding
 *   has an unexpected shape, or a database/filesystem call fails.
 */
async function exportKB(schema, outputDir) {
  // The schema is spliced into SQL as an identifier, which cannot be sent as
  // a bind parameter, so only plain unquoted identifiers are accepted.
  if (typeof schema !== 'string' || !/^[A-Za-z_][A-Za-z0-9_$]*$/.test(schema)) {
    throw new Error(`Invalid schema name: ${String(schema)}`);
  }

  const { chunkSize, embeddingDim, quantization } = CONFIG.export;
  const isBinary = quantization === 'binary';
  const client = new pg.Client(CONFIG.db);

  try {
    await client.connect();
    console.log(`\n📦 KB Export to WASM Format`);
    console.log(` Schema: ${schema}`);
    console.log(` Output: ${outputDir}`);
    console.log(` Quantization: ${quantization}`);

    // Create output directory
    fs.mkdirSync(outputDir, { recursive: true });

    // Get total count (used for paging and progress display)
    const countResult = await client.query(`
      SELECT COUNT(*) as total FROM ${schema}.architecture_docs
      WHERE embedding IS NOT NULL AND is_duplicate = false
    `);
    const totalEntries = Number.parseInt(countResult.rows[0].total, 10);
    console.log(`\n Total entries: ${totalEntries.toLocaleString()}`);

    // Get category stats
    const categoryResult = await client.query(`
      SELECT category, COUNT(*) as count
      FROM ${schema}.architecture_docs
      WHERE embedding IS NOT NULL AND is_duplicate = false
      GROUP BY category ORDER BY count DESC
    `);

    const metadata = {
      version: '1.0.0',
      schema,
      exportedAt: new Date().toISOString(),
      totalEntries,
      embeddingDim,
      quantization,
      categories: categoryResult.rows.map((r) => ({
        name: r.category,
        count: Number.parseInt(r.count, 10),
      })),
      contentHash: null, // Computed after export
    };

    // Export in chunks
    const entries = [];
    const embeddings = [];
    const hash = crypto.createHash('md5');
    let hashedRows = 0;
    let offset = 0;

    console.log(`\n Exporting chunks...`);

    while (offset < totalEntries) {
      const result = await client.query(`
        SELECT id, title, content, category, quality_score, embedding
        FROM ${schema}.architecture_docs
        WHERE embedding IS NOT NULL AND is_duplicate = false
        ORDER BY id
        LIMIT $1 OFFSET $2
      `, [chunkSize, offset]);

      // Rows can disappear after the COUNT(*); stop rather than page on.
      if (result.rows.length === 0) {
        break;
      }

      for (const row of result.rows) {
        const embeddingArray = parseEmbedding(row.embedding);
        // The .bin file is fixed-stride, so one odd-sized vector would
        // misalign every vector after it.
        if (embeddingArray.length !== embeddingDim) {
          throw new Error(
            `Embedding for id ${row.id} has ${embeddingArray.length} dimensions, expected ${embeddingDim}`,
          );
        }

        // Entry metadata (without embedding); content is truncated for size.
        entries.push({
          id: row.id,
          title: row.title,
          content: String(row.content ?? '').substring(0, 500),
          category: row.category,
          quality: row.quality_score,
        });

        embeddings.push(isBinary ? binaryQuantize(embeddingArray) : new Float32Array(embeddingArray));

        // Mirror the SQL used by the loader: a NULL title/category makes the
        // concatenation NULL there, and STRING_AGG skips NULL inputs.
        if (row.title != null && row.category != null) {
          if (hashedRows > 0) {
            hash.update('|');
          }
          hash.update(`${row.id}:${row.title}:${row.category}`);
          hashedRows++;
        }
      }

      offset += chunkSize;
      const progress = Math.min(100, Math.round((offset / totalEntries) * 100));
      process.stdout.write(`\r Progress: ${progress}% (${Math.min(offset, totalEntries).toLocaleString()}/${totalEntries.toLocaleString()})`);
    }

    console.log('\n');

    // Record what was actually exported, in case rows changed mid-export.
    metadata.totalEntries = entries.length;

    // With no hashable rows the loader's query yields NULL, which it reports
    // as 'unknown'; use the same sentinel so an empty KB compares equal.
    metadata.contentHash = hashedRows > 0
      ? hash.digest('hex').substring(0, 16)
      : 'unknown';
    console.log(` Content Hash: ${metadata.contentHash}`);

    // Write metadata
    const metadataPath = path.join(outputDir, 'kb-metadata.json');
    fs.writeFileSync(metadataPath, JSON.stringify(metadata, null, 2));
    console.log(` ✓ Wrote ${metadataPath}`);

    // Write entries (JSON, gzipped in production)
    const entriesPath = path.join(outputDir, 'kb-entries.json');
    fs.writeFileSync(entriesPath, JSON.stringify(entries));
    const entriesSize = fs.statSync(entriesPath).size;
    console.log(` ✓ Wrote ${entriesPath} (${(entriesSize / 1024 / 1024).toFixed(2)} MB)`);

    // Write embeddings (binary format)
    const embeddingsPath = path.join(outputDir, 'kb-embeddings.bin');
    const bytesPerVector = isBinary
      ? Math.ceil(embeddingDim / 8) // 48 bytes for binary
      : embeddingDim * 4; // 1536 bytes for float32

    const embeddingsBuffer = Buffer.alloc(embeddings.length * bytesPerVector);
    embeddings.forEach((vec, i) => {
      // View over the typed array's own bytes, for Uint8Array and Float32Array alike.
      Buffer.from(vec.buffer, vec.byteOffset, vec.byteLength).copy(embeddingsBuffer, i * bytesPerVector);
    });
    fs.writeFileSync(embeddingsPath, embeddingsBuffer);
    const embeddingsSize = fs.statSync(embeddingsPath).size;
    console.log(` ✓ Wrote ${embeddingsPath} (${(embeddingsSize / 1024 / 1024).toFixed(2)} MB)`);

    // Calculate total size
    const totalSize = entriesSize + embeddingsSize + fs.statSync(metadataPath).size;
    console.log(`\n 📊 Total Export Size: ${(totalSize / 1024 / 1024).toFixed(2)} MB`);

    // Compression ratio relative to raw float32 storage of the exported vectors
    const originalSize = entries.length * embeddingDim * 4;
    const compressionRatio = embeddingsSize > 0 ? originalSize / embeddingsSize : 0;
    console.log(` 📉 Compression Ratio: ${compressionRatio.toFixed(1)}x`);

    // Generate loader module
    await generateLoader(outputDir, metadata);

    return metadata;
  } finally {
    await client.end();
  }
}
222
+
223
+ // ============================================================================
224
+ // Loader Generator
225
+ // ============================================================================
226
+
227
/**
 * Write `kb-loader.js` into `outputDir`: a self-contained ES module that loads
 * kb-metadata.json, kb-entries.json and kb-embeddings.bin from its own
 * directory into RvLite and exposes loadKB / search / checkForUpdates / getStats.
 *
 * Values from `metadata` are baked into the generated source: the content
 * hash, export timestamp, entry count, embedding dimension, quantization mode,
 * the schema queried by checkForUpdates() and the number of categories.
 *
 * @param {string} outputDir - Existing directory to write the loader into.
 * @param {object} metadata - Export metadata as produced by exportKB().
 * @param {string} [metadata.schema='ask_ruvnet'] - Schema used by the update check.
 * @returns {Promise<void>}
 * @throws {Error} If `metadata.schema` is not a plain SQL identifier or the
 *   file cannot be written.
 */
async function generateLoader(outputDir, metadata) {
  // The schema ends up inside SQL text in the generated module, so it must be
  // a plain identifier.
  const schema = metadata.schema ?? 'ask_ruvnet';
  if (typeof schema !== 'string' || !/^[A-Za-z_][A-Za-z0-9_$]*$/.test(schema)) {
    throw new Error(`Invalid schema name: ${String(schema)}`);
  }
  const categoryCount = Array.isArray(metadata.categories) ? metadata.categories.length : 0;

  const loaderCode = `/**
 * WASM KB Loader - Auto-generated
 *
 * Loads the embedded knowledge base into RvLite WASM.
 * Provides semantic search with ~5ms latency.
 *
 * Content Hash: ${metadata.contentHash}
 * Generated: ${metadata.exportedAt}
 * Entries: ${metadata.totalEntries.toLocaleString()}
 */

import { RvLite, RvLiteConfig, Embedder } from '@ruvector/edge-full/rvlite';
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';

const __dirname = path.dirname(fileURLToPath(import.meta.url));

// KB Version Info
export const KB_VERSION = {
  hash: ${JSON.stringify(metadata.contentHash)},
  exportedAt: ${JSON.stringify(metadata.exportedAt)},
  totalEntries: ${metadata.totalEntries},
  embeddingDim: ${metadata.embeddingDim},
  quantization: ${JSON.stringify(metadata.quantization)},
};

// Number of distinct categories at export time
const CATEGORY_COUNT = ${categoryCount};

// Singleton state, populated by loadKB()
let db = null;
let embedder = null;
let entries = null;
let entryById = null;
let isLoaded = false;

/**
 * Check if a newer KB version is available from PostgreSQL
 */
export async function checkForUpdates(pgConfig) {
  let client = null;
  let connected = false;
  try {
    const pg = await import('pg');
    client = new pg.default.Client(pgConfig || {
      host: process.env.KB_HOST || 'localhost',
      port: Number.parseInt(process.env.KB_PORT || '5435', 10) || 5435,
      database: 'postgres',
      user: 'postgres',
      password: process.env.KB_PASSWORD || 'guruKB2025',
    });

    await client.connect();
    connected = true;

    // Get current hash from PostgreSQL
    const result = await client.query(\`
      SELECT MD5(STRING_AGG(id::text || ':' || title || ':' || category, '|' ORDER BY id))::text as hash
      FROM ${schema}.architecture_docs
      WHERE embedding IS NOT NULL AND is_duplicate = false
    \`);

    const currentHash = result.rows[0]?.hash?.substring(0, 16) || 'unknown';
    const needsUpdate = currentHash !== KB_VERSION.hash;

    return {
      embeddedHash: KB_VERSION.hash,
      currentHash: currentHash,
      needsUpdate: needsUpdate,
      message: needsUpdate
        ? 'KB has been updated. Run: npm update ruvnet-kb-first'
        : 'KB is up to date',
    };
  } catch (e) {
    return {
      embeddedHash: KB_VERSION.hash,
      currentHash: 'unavailable',
      needsUpdate: false,
      message: 'Could not check for updates (PostgreSQL not available)',
      error: e?.message ?? String(e),
    };
  } finally {
    // Always release the connection; a failure to close must not replace
    // the result computed above.
    if (connected) {
      await client.end().catch(() => {});
    }
  }
}

/**
 * Load the embedded KB into RvLite WASM
 */
export async function loadKB() {
  if (isLoaded) return { db, embedder, entries };

  console.log('Loading embedded KB...');
  const startTime = Date.now();

  // Load metadata
  const metadataPath = path.join(__dirname, 'kb-metadata.json');
  const metadata = JSON.parse(fs.readFileSync(metadataPath, 'utf-8'));

  // Load entries
  const entriesPath = path.join(__dirname, 'kb-entries.json');
  const loadedEntries = JSON.parse(fs.readFileSync(entriesPath, 'utf-8'));

  // Load embeddings
  const embeddingsPath = path.join(__dirname, 'kb-embeddings.bin');
  const embeddingsBuffer = fs.readFileSync(embeddingsPath);

  const bytesPerVector = metadata.quantization === 'binary'
    ? Math.ceil(metadata.embeddingDim / 8)
    : metadata.embeddingDim * 4;

  const expectedBytes = loadedEntries.length * bytesPerVector;
  if (embeddingsBuffer.length < expectedBytes) {
    throw new Error(\`kb-embeddings.bin holds \${embeddingsBuffer.length} bytes but \${loadedEntries.length} entries need \${expectedBytes}\`);
  }

  // Initialize RvLite with the exported dimensions
  const config = new RvLiteConfig();
  config.dimensions = metadata.embeddingDim;
  config.distance_metric = 'cosine';
  const loadedDb = new RvLite(config);

  // Initialize embedder for query embedding
  const loadedEmbedder = new Embedder();

  // Load vectors into RvLite
  const lookup = new Map();
  for (let i = 0; i < loadedEntries.length; i++) {
    const entry = loadedEntries[i];
    const offset = i * bytesPerVector;
    const bytes = embeddingsBuffer.subarray(offset, offset + bytesPerVector);

    let vector;
    if (metadata.quantization === 'binary') {
      // Expand each stored bit back to +1.0 / -1.0 for RvLite
      vector = new Float32Array(metadata.embeddingDim);
      for (let j = 0; j < metadata.embeddingDim; j++) {
        const byteIdx = Math.floor(j / 8);
        const bitIdx = 7 - (j % 8);
        vector[j] = (bytes[byteIdx] & (1 << bitIdx)) ? 1.0 : -1.0;
      }
    } else {
      // Copy into a fresh ArrayBuffer: the Buffer may be a view at a non-zero
      // (possibly unaligned) byteOffset of a larger allocation.
      vector = new Float32Array(
        bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength)
      );
    }

    const key = entry.id.toString();

    // Insert with metadata
    loadedDb.insert_with_id(key, vector, {
      title: entry.title,
      category: entry.category,
      quality: entry.quality,
    });

    // First entry wins on duplicate ids
    if (!lookup.has(key)) {
      lookup.set(key, entry);
    }
  }

  // Publish state only after everything loaded successfully
  db = loadedDb;
  embedder = loadedEmbedder;
  entries = loadedEntries;
  entryById = lookup;
  isLoaded = true;

  const loadTime = Date.now() - startTime;
  console.log(\`KB loaded: \${entries.length.toLocaleString()} entries in \${loadTime}ms\`);

  return { db, embedder, entries };
}

/**
 * Semantic search using embedded KB
 */
export async function search(query, limit = 10, filter = null) {
  if (!isLoaded) await loadKB();

  const startTime = Date.now();

  // Generate query embedding
  const queryVector = embedder.embed(query);

  // Search
  let results;
  if (filter) {
    results = db.search_with_filter(queryVector, limit, filter);
  } else {
    results = db.search(queryVector, limit);
  }

  // Enrich results with entry data
  const enrichedResults = results.map(r => {
    const entry = entryById.get(r.id);
    return {
      id: r.id,
      distance: r.distance,
      title: entry?.title || 'Unknown',
      content: entry?.content || '',
      category: entry?.category || 'general',
      quality: entry?.quality || 0,
    };
  });

  const searchTime = Date.now() - startTime;

  return {
    results: enrichedResults,
    searchTimeMs: searchTime,
    query: query,
    totalEntries: entries.length,
  };
}

/**
 * Get KB statistics
 */
export function getStats() {
  return {
    version: KB_VERSION,
    loaded: isLoaded,
    entries: entries?.length || 0,
    categories: CATEGORY_COUNT,
  };
}

// Export for CommonJS compatibility
export default {
  loadKB,
  search,
  checkForUpdates,
  getStats,
  KB_VERSION,
};
`;

  const loaderPath = path.join(outputDir, 'kb-loader.js');
  fs.writeFileSync(loaderPath, loaderCode);
  console.log(` ✓ Wrote ${loaderPath}`);
}
444
+
445
+ // ============================================================================
446
+ // Main
447
+ // ============================================================================
448
+
449
/**
 * CLI entry point.
 *
 * Recognized options (each takes one value):
 *   --schema <name>   schema to export (default 'ask_ruvnet')
 *   --output <dir>    output directory (default 'kb-data')
 *
 * Runs exportKB() and prints usage hints on success. On any failure,
 * including an option given without a value, prints the error message and
 * exits with status 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = process.argv.slice(2);

  // Returns the token after `flag`, or `fallback` when the flag is absent.
  // A flag given as the last argument has no value; report it rather than
  // passing `undefined` on to the export.
  const readOption = (flag, fallback) => {
    const idx = args.indexOf(flag);
    if (idx < 0) {
      return fallback;
    }
    const value = args[idx + 1];
    if (value === undefined) {
      throw new Error(`Missing value for ${flag}`);
    }
    return value;
  };

  try {
    const schema = readOption('--schema', 'ask_ruvnet');
    const outputDir = readOption('--output', 'kb-data');

    await exportKB(schema, outputDir);

    console.log(`\n✅ Export complete!`);
    console.log(`\nTo use the embedded KB:`);
    console.log(` import { loadKB, search } from './${outputDir}/kb-loader.js';`);
    console.log(` await loadKB();`);
    console.log(` const results = await search('how to create agents', 5);`);
  } catch (error) {
    console.error(`\n❌ Export failed: ${error.message}`);
    process.exit(1);
  }
}
471
+
472
+ main();