ruvnet-kb-first 6.2.0 → 6.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,548 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * KB Ingest Template - Optimized Knowledge Base Ingestion
5
+ *
6
+ * Usage:
7
+ * node scripts/kb-ingest-template.js --source ./docs --schema my_project
8
+ * node scripts/kb-ingest-template.js --source ./content --schema ask_ruvnet --update
9
+ *
10
+ * What it does:
11
+ * 1. Hash - SHA-256 of normalized content
12
+ * 2. Dedupe - Skip if hash exists (or update if higher quality)
13
+ * 3. Categorize - Auto-detect from 15 category rules
14
+ * 4. Score - Calculate quality 0-100
15
+ * 5. Embed - PostgreSQL ruvector_embed() (Rust, not WASM)
16
+ * 6. View - Auto-creates {schema}.kb optimized view
17
+ */
18
+
19
+ import { createHash } from 'crypto';
20
+ import { readFileSync, readdirSync, statSync, existsSync } from 'fs';
21
+ import { join, extname, basename, relative } from 'path';
22
+ import pg from 'pg';
23
+
24
+ // ============================================================================
25
+ // Configuration
26
+ // ============================================================================
27
+
28
/**
 * Runtime configuration for the ingest script.
 *
 * - db: connection settings handed to `pg.Client`. Host, port and password
 *   can be overridden through KB_HOST, KB_PORT and KB_PASSWORD.
 * - supportedExtensions: lower-case file extensions (with dot) that are ingested.
 * - minContentLength: files with fewer characters than this are skipped.
 * - maxContentLength: longer file content is truncated to this many characters.
 */
const CONFIG = {
  db: {
    host: process.env.KB_HOST || 'localhost',
    // Parse with an explicit radix and fall back to the default when KB_PORT
    // is not a usable TCP port, instead of handing NaN to the driver.
    port: (() => {
      const parsed = Number.parseInt(process.env.KB_PORT || '5435', 10);
      return Number.isInteger(parsed) && parsed > 0 && parsed <= 65535 ? parsed : 5435;
    })(),
    database: 'postgres',
    user: 'postgres',
    // NOTE(security): the fallback password is a hard-coded credential kept
    // only for backward compatibility; set KB_PASSWORD in real deployments.
    password: process.env.KB_PASSWORD || 'guruKB2025',
  },
  supportedExtensions: ['.md', '.txt', '.json', '.yaml', '.yml', '.ts', '.js', '.py', '.sql'],
  minContentLength: 100,
  maxContentLength: 50000,
};
40
+
41
+ // ============================================================================
42
+ // 15 Category Rules
43
+ // ============================================================================
44
+
45
/**
 * Ordered category rules: each rule pairs a category name with the regular
 * expressions that select it. Order matters because the first rule with a
 * matching pattern wins; the final catch-all rule matches any text.
 */
const CATEGORY_RULES = [
  // Agents & Orchestration
  ['agents', [/agent/i, /swarm/i, /orchestrat/i, /coordinator/i, /spawn/i]],
  ['workflows', [/workflow/i, /pipeline/i, /dag\b/i, /task.?flow/i]],

  // AI/ML
  ['embeddings', [/embed/i, /vector/i, /semantic/i, /onnx/i, /transformer/i]],
  ['neural', [/neural/i, /gnn\b/i, /attention/i, /sona\b/i, /lora\b/i]],
  ['inference', [/inference/i, /predict/i, /model\b/i, /llm\b/i, /prompt/i]],

  // Data & Storage
  ['database', [/database/i, /postgres/i, /sql\b/i, /schema/i, /migration/i]],
  ['storage', [/storage/i, /persist/i, /cache/i, /memory/i, /store/i]],

  // Security & Auth
  ['security', [/security/i, /auth/i, /encrypt/i, /token/i, /credential/i]],
  ['compliance', [/compliance/i, /audit/i, /gdpr/i, /pci/i, /hipaa/i]],

  // Infrastructure
  ['deployment', [/deploy/i, /docker/i, /kubernetes/i, /k8s/i, /ci.?cd/i]],
  ['config', [/config/i, /setting/i, /environment/i, /\.env/i]],

  // Development
  ['api', [/api\b/i, /endpoint/i, /rest\b/i, /graphql/i, /grpc/i]],
  ['testing', [/test/i, /spec\b/i, /mock/i, /fixture/i, /assert/i]],
  ['patterns', [/pattern/i, /best.?practice/i, /architecture/i, /design/i]],

  // Fallback
  ['general', [/.*/]],
].map(([category, patterns]) => ({ category, patterns }));
75
+
76
+ // ============================================================================
77
+ // Content Hashing
78
+ // ============================================================================
79
+
80
/**
 * Canonicalise text before hashing so that cosmetic differences (case,
 * punctuation, spacing, Unicode composition) do not produce different hashes.
 *
 * Letters, combining marks and digits of every script are kept. An ASCII-only
 * filter would erase all non-English text and make unrelated documents
 * normalise to the same (empty) string, i.e. collide on their hash.
 *
 * @param {string} content - Raw text.
 * @returns {string} Lower-cased text containing only letters, marks, digits,
 *   underscores and single spaces, without leading/trailing whitespace.
 */
function normalizeContent(content) {
  return content
    .normalize('NFC')
    .toLowerCase()
    // Strip punctuation first: removing it can leave adjacent spaces behind,
    // which the whitespace collapse below then folds into one.
    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, '')
    .replace(/\s+/g, ' ')
    .trim();
}
87
+
88
/**
 * Compute the deduplication key for a piece of content.
 *
 * @param {string} content - Raw text; it is passed through normalizeContent first.
 * @returns {string} Hex-encoded SHA-256 digest of the normalized text.
 */
function hashContent(content) {
  const hasher = createHash('sha256');
  hasher.update(normalizeContent(content));
  return hasher.digest('hex');
}
92
+
93
+ // ============================================================================
94
+ // Auto-Categorization
95
+ // ============================================================================
96
+
97
/**
 * Pick a category for an entry by testing the CATEGORY_RULES patterns against
 * the title, content and file path joined into one string.
 *
 * @param {string} title - Entry title.
 * @param {string} content - Entry body.
 * @param {string} filePath - Path of the source file.
 * @returns {string} Category of the first rule with a matching pattern, or
 *   'general' when no rule matches.
 */
function categorize(title, content, filePath) {
  const haystack = `${title} ${content} ${filePath}`;

  // Rules are evaluated in declaration order; the first hit decides.
  const winner = CATEGORY_RULES.find(({ patterns }) =>
    patterns.some((candidate) => candidate.test(haystack)),
  );

  return winner ? winner.category : 'general';
}
110
+
111
+ // ============================================================================
112
+ // Quality Scoring (0-100)
113
+ // ============================================================================
114
+
115
/**
 * Heuristic quality score for a knowledge-base entry.
 *
 * Starts from a base of 50 and adds or subtracts points for length,
 * structure, attribution, title quality and wording; the result is clamped
 * to the 0-100 range.
 *
 * @param {object} entry - Entry to score. Reads `content`, `title`,
 *   `source_expert` and `source_url`; missing fields (or a missing entry)
 *   are treated as empty.
 * @returns {number} Integer score between 0 and 100 inclusive.
 */
function scoreQuality(entry) {
  const source = entry || {};
  const content = source.content || '';
  const title = source.title || '';
  let score = 50; // Base score

  // Length scoring (+15 ideal, +10 acceptable, -20 very short)
  const length = content.length;
  if (length >= 500 && length <= 5000) score += 15;
  else if (length >= 200 && length <= 10000) score += 10;
  else if (length < 100) score -= 20;

  // Structure scoring (up to 15 points)
  if (content.includes('##')) score += 5; // '##' also covers '###' and deeper headings
  if (content.includes('```')) score += 5; // Code blocks
  if (content.includes('|') && content.includes('-')) score += 5; // Tables

  // Source attribution (up to 10 points)
  if (source.source_expert) score += 5;
  if (source.source_url) score += 5;

  // Title quality (up to 10 points)
  if (title.length >= 10 && title.length <= 100) score += 5;
  // Compare case-insensitively so "Untitled" / "UNDEFINED" are caught too.
  const lowerTitle = title.toLowerCase();
  if (!lowerTitle.includes('untitled') && !lowerTitle.includes('undefined')) score += 5;

  // Content quality indicators (up to 15 points)
  if (/example|implementation|usage/i.test(content)) score += 5;
  if (/best practice|recommended|should/i.test(content)) score += 5;
  if (/warning|caution|avoid|don't/i.test(content)) score += 5;

  // Penalties
  if (/todo|fixme|hack|temporary/i.test(content)) score -= 10;
  if (content.split('\n').length < 3) score -= 10;

  return Math.max(0, Math.min(100, score));
}
151
+
152
+ // ============================================================================
153
+ // File Processing
154
+ // ============================================================================
155
+
156
/**
 * Derive a human-readable title for a document.
 *
 * Resolution order:
 *   1. the first markdown level-1 heading ("# Title") found on a single line;
 *   2. the first line that still has text once leading '#' markers are
 *      removed, provided that line is shorter than 100 characters;
 *   3. the file name without extension, with '-' / '_' turned into spaces
 *      and each word capitalised.
 *
 * @param {string} content - Document text.
 * @param {string} filePath - Path of the file the text came from.
 * @returns {string} The derived title.
 */
function extractTitleFromContent(content, filePath) {
  // [ \t]+ keeps the heading match on one line; \s+ would let an empty "#"
  // line swallow the line break and capture whatever follows it.
  const h1Match = content.match(/^#[ \t]+(.+)$/m);
  if (h1Match) {
    const heading = h1Match[1].trim();
    if (heading.length > 0) return heading;
  }

  // First line that carries real text. Lines made only of whitespace or
  // heading markers ("###") are skipped so the title is never empty.
  for (const rawLine of content.split('\n')) {
    const line = rawLine.trim();
    const candidate = line.replace(/^#+\s*/, '').trim();
    if (candidate.length === 0) continue;
    if (line.length < 100) return candidate;
    break; // First real line is too long to be a title; use the file name.
  }

  // Fall back to filename
  return basename(filePath, extname(filePath))
    .replace(/[-_]/g, ' ')
    .replace(/\b\w/g, (c) => c.toUpperCase());
}
172
+
173
/**
 * Turn one file into a knowledge-base entry.
 *
 * @param {string} filePath - Path of the file to read.
 * @param {string} baseDir - Directory that `source_path` is made relative to.
 * @returns {object|null} Entry with title, content (truncated to
 *   CONFIG.maxContentLength), source path/file, content hash, category and
 *   quality score; or null when the extension is unsupported or the file is
 *   shorter than CONFIG.minContentLength.
 */
function processFile(filePath, baseDir) {
  const extension = extname(filePath).toLowerCase();
  const isSupported = CONFIG.supportedExtensions.includes(extension);
  if (!isSupported) return null;

  const raw = readFileSync(filePath, 'utf-8');
  if (raw.length < CONFIG.minContentLength) return null;

  const body = raw.slice(0, CONFIG.maxContentLength);
  const title = extractTitleFromContent(body, filePath);
  const sourcePath = relative(baseDir, filePath);

  const record = {
    title,
    content: body,
    source_path: sourcePath,
    source_file: basename(filePath),
    content_hash: hashContent(body),
    category: null,
    quality_score: null,
    source_expert: null,
    source_url: null,
  };

  // Category is assigned before scoring, so the scorer sees the final category.
  record.category = categorize(title, body, sourcePath);
  record.quality_score = scoreQuality(record);

  return record;
}
206
+
207
/**
 * Recursively collect knowledge-base entries from a directory tree.
 *
 * Hidden items (names starting with '.') and `node_modules` are skipped
 * without touching the filesystem. Dangling or cyclic symlinks are reported
 * and skipped instead of aborting the whole run. Only regular files are
 * handed to processFile.
 *
 * @param {string} dir - Directory to scan.
 * @param {string} [baseDir=dir] - Root used for relative source paths.
 * @returns {object[]} Entries produced by processFile, in traversal order.
 * @throws Any filesystem error other than ENOENT/ELOOP on an individual item.
 */
function walkDirectory(dir, baseDir = dir) {
  const entries = [];

  for (const item of readdirSync(dir)) {
    // Skip hidden files and node_modules before stat'ing them.
    if (item.startsWith('.') || item === 'node_modules') continue;

    const fullPath = join(dir, item);

    let stat;
    try {
      stat = statSync(fullPath);
    } catch (error) {
      // statSync follows symlinks, so a dangling or looping link throws here.
      if (error.code === 'ENOENT' || error.code === 'ELOOP') {
        console.warn(`  ! Skipping unreadable path: ${fullPath} (${error.code})`);
        continue;
      }
      throw error;
    }

    if (stat.isDirectory()) {
      // Push one by one: spreading a very large result into push() can
      // exceed the engine's argument limit.
      for (const nested of walkDirectory(fullPath, baseDir)) {
        entries.push(nested);
      }
    } else if (stat.isFile()) {
      const entry = processFile(fullPath, baseDir);
      if (entry) {
        entries.push(entry);
      }
    }
  }

  return entries;
}
230
+
231
+ // ============================================================================
232
+ // Database Operations
233
+ // ============================================================================
234
+
235
/**
 * Create the schema, the `kb_entries` table and its supporting indexes if
 * they do not exist yet.
 *
 * The schema name is spliced into DDL (identifiers cannot be bound as query
 * parameters), so it is validated as a plain identifier first.
 *
 * @param {object} client - Connected client exposing `query(sql)`.
 * @param {string} schema - Schema name: letters, digits and underscores, not
 *   starting with a digit.
 * @returns {Promise<void>}
 * @throws {Error} When the schema name is not a plain identifier, or when a
 *   statement fails.
 */
async function ensureSchema(client, schema) {
  if (typeof schema !== 'string' || !/^[A-Za-z_][A-Za-z0-9_]*$/.test(schema)) {
    throw new Error(
      `Invalid schema name: ${JSON.stringify(schema)} (use letters, digits and underscores only)`,
    );
  }

  await client.query(`CREATE SCHEMA IF NOT EXISTS ${schema}`);

  // Create main table
  await client.query(`
    CREATE TABLE IF NOT EXISTS ${schema}.kb_entries (
      id SERIAL PRIMARY KEY,
      title TEXT NOT NULL,
      content TEXT NOT NULL,
      content_hash VARCHAR(64) UNIQUE NOT NULL,
      category VARCHAR(50) DEFAULT 'general',
      quality_score INTEGER DEFAULT 50,
      source_path TEXT,
      source_file TEXT,
      source_expert TEXT,
      source_url TEXT,
      embedding real[],
      created_at TIMESTAMP DEFAULT NOW(),
      updated_at TIMESTAMP DEFAULT NOW()
    )
  `);

  // Create indexes. content_hash needs none of its own: the UNIQUE
  // constraint above already creates a unique index on that column, so a
  // second one would only add write overhead.
  await client.query(`
    CREATE INDEX IF NOT EXISTS idx_${schema}_kb_category
    ON ${schema}.kb_entries(category)
  `);

  await client.query(`
    CREATE INDEX IF NOT EXISTS idx_${schema}_kb_quality
    ON ${schema}.kb_entries(quality_score DESC)
  `);
}
273
+
274
/**
 * (Re)create the `{schema}.kb` view over `kb_entries`, exposing entries with
 * quality_score >= 40 ordered by score and recency.
 *
 * The drop and the create run inside one transaction, so a failing CREATE
 * cannot leave the view missing. The function issues its own BEGIN/COMMIT
 * and is therefore meant to be called outside of an open transaction.
 *
 * @param {object} client - Connected client exposing `query(sql)`.
 * @param {string} schema - Schema name: letters, digits and underscores, not
 *   starting with a digit.
 * @returns {Promise<void>}
 * @throws {Error} When the schema name is not a plain identifier, or when a
 *   statement fails (after rolling back).
 */
async function createOptimizedView(client, schema) {
  // Identifiers cannot be bound as parameters, so validate before splicing.
  if (typeof schema !== 'string' || !/^[A-Za-z_][A-Za-z0-9_]*$/.test(schema)) {
    throw new Error(
      `Invalid schema name: ${JSON.stringify(schema)} (use letters, digits and underscores only)`,
    );
  }

  await client.query('BEGIN');
  try {
    // Drop and recreate the optimized view
    await client.query(`DROP VIEW IF EXISTS ${schema}.kb`);

    await client.query(`
      CREATE VIEW ${schema}.kb AS
      SELECT
        id,
        title,
        content,
        category,
        quality_score,
        source_expert,
        source_url,
        embedding,
        created_at
      FROM ${schema}.kb_entries
      WHERE quality_score >= 40
      ORDER BY quality_score DESC, created_at DESC
    `);

    await client.query('COMMIT');
  } catch (error) {
    try {
      await client.query('ROLLBACK');
    } catch (rollbackError) {
      // Report it, but surface the original failure to the caller.
      console.error(` ✗ Rollback failed: ${rollbackError.message}`);
    }
    throw error;
  }

  console.log(` ✓ Created optimized view: ${schema}.kb`);
}
297
+
298
/**
 * Look up an entry by its content hash.
 *
 * @param {object} client - Connected client exposing `query(sql, params)`.
 * @param {string} schema - Schema that holds `kb_entries`.
 * @param {string} contentHash - Hash to search for.
 * @returns {Promise<object|null>} The first matching row (`id`,
 *   `quality_score`), or null when there is none.
 */
async function checkExistingHash(client, schema, contentHash) {
  const lookupSql = `SELECT id, quality_score FROM ${schema}.kb_entries WHERE content_hash = $1`;
  const { rows } = await client.query(lookupSql, [contentHash]);
  const firstRow = rows[0];
  return firstRow || null;
}
305
+
306
/**
 * Insert a new entry into `{schema}.kb_entries`. The embedding column is
 * filled in by the database via `ruvector_embed` applied to the content
 * parameter.
 *
 * @param {object} client - Connected client exposing `query(sql, params)`.
 * @param {string} schema - Schema that holds `kb_entries`.
 * @param {object} entry - Entry as produced by processFile.
 * @returns {Promise<number>} The `id` returned for the inserted row.
 */
async function insertEntry(client, schema, entry) {
  const {
    title,
    content,
    content_hash: contentHash,
    category,
    quality_score: qualityScore,
    source_path: sourcePath,
    source_file: sourceFile,
    source_expert: sourceExpert,
    source_url: sourceUrl,
  } = entry;

  const insertSql = `
    INSERT INTO ${schema}.kb_entries
    (title, content, content_hash, category, quality_score, source_path, source_file, source_expert, source_url, embedding)
    VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, ruvector_embed($2))
    RETURNING id
  `;

  // Positional parameters, in the order of the column list above.
  const params = [
    title,
    content,
    contentHash,
    category,
    qualityScore,
    sourcePath,
    sourceFile,
    sourceExpert,
    sourceUrl,
  ];

  const { rows } = await client.query(insertSql, params);
  return rows[0].id;
}
325
+
326
/**
 * Overwrite an existing row in `{schema}.kb_entries` with a newer version of
 * the entry and refresh its embedding and `updated_at`.
 *
 * Source attribution is written as well, matching the columns insertEntry
 * persists. COALESCE keeps the stored attribution when the incoming entry
 * has none, so an update never erases it.
 *
 * @param {object} client - Connected client exposing `query(sql, params)`.
 * @param {string} schema - Schema that holds `kb_entries`.
 * @param {number} id - Primary key of the row to update.
 * @param {object} entry - Replacement entry as produced by processFile.
 * @returns {Promise<void>}
 */
async function updateEntry(client, schema, id, entry) {
  await client.query(`
    UPDATE ${schema}.kb_entries
    SET title = $1, content = $2, category = $3, quality_score = $4,
        source_path = $5, source_file = $6,
        source_expert = COALESCE($7, source_expert),
        source_url = COALESCE($8, source_url),
        embedding = ruvector_embed($2),
        updated_at = NOW()
    WHERE id = $9
  `, [
    entry.title,
    entry.content,
    entry.category,
    entry.quality_score,
    entry.source_path,
    entry.source_file,
    entry.source_expert,
    entry.source_url,
    id,
  ]);
}
343
+
344
+ // ============================================================================
345
+ // Main Ingestion Process
346
+ // ============================================================================
347
+
348
/**
 * Run a full ingestion: scan `sourceDir`, insert new entries (or update
 * existing ones in update mode), rebuild the `{schema}.kb` view and print a
 * summary.
 *
 * Failures are reported on stderr and signalled through `process.exitCode`
 * rather than `process.exit()`, so the `finally` block always gets to close
 * the database connection before the process ends.
 *
 * @param {string} sourceDir - Directory to ingest.
 * @param {string} schema - Target schema: letters, digits and underscores,
 *   not starting with a digit (it is spliced into SQL as an identifier).
 * @param {object} [options]
 * @param {boolean} [options.update=false] - Replace an existing entry when
 *   the new version scores higher.
 * @param {number} [options.minQuality=30] - Entries scoring below this are skipped.
 * @returns {Promise<void>}
 */
async function ingest(sourceDir, schema, options = {}) {
  const { update = false, minQuality = 30 } = options;

  console.log('\n╔════════════════════════════════════════════════════════════════╗');
  console.log('║ RuvNet KB Ingest Template v1.0.0 ║');
  console.log('╚════════════════════════════════════════════════════════════════╝\n');

  console.log(` Source: ${sourceDir}`);
  console.log(` Schema: ${schema}`);
  console.log(` Mode: ${update ? 'Update (replace higher quality)' : 'Insert (skip duplicates)'}`);
  console.log(` Min Score: ${minQuality}\n`);

  // Validate source directory (it must exist and actually be a directory)
  if (!existsSync(sourceDir) || !statSync(sourceDir).isDirectory()) {
    console.error(` ✗ Source directory not found: ${sourceDir}`);
    process.exitCode = 1;
    return;
  }

  // The schema name ends up inside SQL text; accept plain identifiers only.
  if (typeof schema !== 'string' || !/^[A-Za-z_][A-Za-z0-9_]*$/.test(schema)) {
    console.error(` ✗ Invalid schema name: ${schema} (use letters, digits and underscores only)`);
    process.exitCode = 1;
    return;
  }

  // Connect to database
  const client = new pg.Client(CONFIG.db);

  try {
    await client.connect();
    console.log(' ✓ Connected to ruvector-postgres');

    // Ensure schema and table exist
    await ensureSchema(client, schema);
    console.log(` ✓ Schema ${schema} ready`);

    // Process files
    console.log('\n Processing files...\n');
    const entries = walkDirectory(sourceDir);

    const stats = {
      total: entries.length,
      inserted: 0,
      updated: 0,
      lowQuality: 0,
      duplicates: 0,
    };

    for (const entry of entries) {
      // Skip low quality
      if (entry.quality_score < minQuality) {
        stats.lowQuality++;
        continue;
      }

      // Check for existing hash
      const existing = await checkExistingHash(client, schema, entry.content_hash);

      if (existing) {
        if (update && entry.quality_score > existing.quality_score) {
          // Update with higher quality version
          await updateEntry(client, schema, existing.id, entry);
          stats.updated++;
          console.log(` ↻ Updated: ${entry.title} (${entry.quality_score} > ${existing.quality_score})`);
        } else {
          stats.duplicates++;
        }
      } else {
        // Insert new entry
        await insertEntry(client, schema, entry);
        stats.inserted++;
        console.log(` + Inserted: ${entry.title} [${entry.category}] (${entry.quality_score}/100)`);
      }
    }

    // Create optimized view
    await createOptimizedView(client, schema);

    // Get final count
    const countResult = await client.query(`SELECT COUNT(*) FROM ${schema}.kb_entries`);
    const totalEntries = Number.parseInt(countResult.rows[0].count, 10);

    // Print summary
    console.log('\n ══════════════════════════════════════════════════════════════');
    console.log(' INGESTION COMPLETE');
    console.log(' ══════════════════════════════════════════════════════════════');
    console.log(` Files Processed: ${stats.total}`);
    console.log(` Inserted: ${stats.inserted}`);
    console.log(` Updated: ${stats.updated}`);
    console.log(` Skipped (dup): ${stats.duplicates}`);
    console.log(` Skipped (quality): ${stats.lowQuality}`);
    console.log(` Total in KB: ${totalEntries}`);
    console.log(' ══════════════════════════════════════════════════════════════\n');

    // Category breakdown
    const categoryResult = await client.query(`
      SELECT category, COUNT(*) as count, AVG(quality_score)::int as avg_score
      FROM ${schema}.kb_entries
      GROUP BY category
      ORDER BY count DESC
    `);

    console.log(' Category Breakdown:');
    for (const row of categoryResult.rows) {
      console.log(` ${row.category.padEnd(15)} ${row.count.toString().padStart(5)} entries (avg score: ${row.avg_score})`);
    }

    console.log('\n Query examples:');
    console.log(` SELECT * FROM ${schema}.kb WHERE category = 'agents' LIMIT 5;`);
    console.log(` SELECT * FROM ${schema}.kb WHERE quality_score >= 80;`);
    console.log(` SELECT title, embedding <=> ruvector_embed('your query') AS distance`);
    console.log(` FROM ${schema}.kb_entries ORDER BY distance LIMIT 10;\n`);

  } catch (error) {
    console.error(` ✗ Error: ${error.message}`);
    // process.exit() here would terminate before `finally` runs and leave
    // the connection open; record the failure and let the function finish.
    process.exitCode = 1;
  } finally {
    await client.end();
  }
}
462
+
463
+ // ============================================================================
464
+ // CLI
465
+ // ============================================================================
466
+
467
/**
 * Parse the command-line flags from `process.argv`.
 *
 * `--help` / `-h` prints the help text and exits with status 0. A
 * `--min-quality` value that is not an integer is rejected with exit status
 * 1: a NaN threshold would make every `score < threshold` comparison false
 * and silently disable quality filtering.
 *
 * @returns {{source: (string|null), schema: (string|null), update: boolean, minQuality: number}}
 *   Parsed options; `source` and `schema` stay null when not supplied.
 */
function parseArgs() {
  const args = process.argv.slice(2);
  const options = {
    source: null,
    schema: null,
    update: false,
    minQuality: 30,
  };

  for (let i = 0; i < args.length; i++) {
    switch (args[i]) {
      case '--source':
      case '-s':
        options.source = args[++i];
        break;
      case '--schema':
        options.schema = args[++i];
        break;
      case '--update':
      case '-u':
        options.update = true;
        break;
      case '--min-quality': {
        const rawValue = args[++i];
        const parsed = Number.parseInt(rawValue, 10);
        if (Number.isNaN(parsed)) {
          console.error(`Error: --min-quality expects an integer, got: ${rawValue}`);
          process.exit(1);
        }
        options.minQuality = parsed;
        break;
      }
      case '--help':
      case '-h':
        printHelp();
        process.exit(0);
        break;
    }
  }

  return options;
}
501
+
502
/**
 * Write the command-line usage text (flags, examples and the supported
 * environment variables) to stdout in a single console.log call.
 */
function printHelp() {
  const helpText = `
KB Ingest Template - Optimized Knowledge Base Ingestion

Usage:
node scripts/kb-ingest-template.js --source <dir> --schema <name> [options]

Options:
--source, -s <dir> Source directory to ingest
--schema <name> PostgreSQL schema name
--update, -u Update existing entries if quality is higher
--min-quality <n> Minimum quality score (default: 30)
--help, -h Show this help

Examples:
# Ingest documentation
node scripts/kb-ingest-template.js --source ./docs --schema my_project

# Update mode (replace if better quality)
node scripts/kb-ingest-template.js --source ./content --schema ask_ruvnet --update

# High quality only
node scripts/kb-ingest-template.js --source ./docs --schema my_kb --min-quality 60

Environment Variables:
KB_HOST Database host (default: localhost)
KB_PORT Database port (default: 5435)
KB_PASSWORD Database password (default: guruKB2025)
`;

  console.log(helpText);
}
532
+
533
+ // ============================================================================
534
+ // Entry Point
535
+ // ============================================================================
536
+
537
// Parse the CLI flags, insist on the two mandatory ones, then run the
// ingestion with the remaining flags passed through as options.
const options = parseArgs();

if (!options.source || !options.schema) {
  console.error('Error: --source and --schema are required');
  printHelp();
  process.exit(1);
}

ingest(options.source, options.schema, {
  update: options.update,
  minQuality: options.minQuality,
}).catch((error) => {
  // An unexpected rejection must not look like success to the calling
  // shell: report it and mark the exit status as failed.
  console.error(error);
  process.exitCode = 1;
});