ruvnet-kb-first 6.3.0 → 6.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,956 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * KB Quality Audit & Optimization
5
+ *
6
+ * Scores ANY existing knowledge base and provides specific optimization steps.
7
+ *
8
+ * Usage:
9
+ * node scripts/kb-quality-audit.js --schema ask_ruvnet
10
+ * node scripts/kb-quality-audit.js --schema ask_ruvnet --fix
11
+ * node scripts/kb-quality-audit.js --schema ask_ruvnet --detailed
12
+ *
13
+ * Scores 7 Dimensions:
14
+ * 1. Embedding Quality - Are embeddings complete and well-distributed?
15
+ * 2. Deduplication - Hash-based duplicate detection
16
+ * 3. Category Coverage - Balanced distribution across domains
17
+ * 4. Structural Integrity - Hierarchy, relationships, navigation paths
18
+ * 5. Content Quality - Length, formatting, source attribution
19
+ * 6. Recall Performance - Search returns relevant results
20
+ * 7. Index Optimization - Supporting indexes (embedding filter, category, hash, quality score), quantization readiness
21
+ */
22
+
23
+ import pg from 'pg';
24
+ import chalk from 'chalk';
25
+
26
+ // ============================================================================
27
+ // Configuration
28
+ // ============================================================================
29
+
30
/**
 * Database connection settings for the audit, passed to `pg.Client`.
 *
 * Host, port and password can be overridden with the KB_HOST, KB_PORT and
 * KB_PASSWORD environment variables; database and user are fixed.
 */
const CONFIG = {
  db: {
    host: process.env.KB_HOST || 'localhost',
    // Explicit radix; a non-numeric KB_PORT falls back to the default instead of NaN.
    port: Number.parseInt(process.env.KB_PORT || '5435', 10) || 5435,
    database: 'postgres',
    user: 'postgres',
    // NOTE(review): a hard-coded fallback credential ships with the package.
    // Prefer always supplying KB_PASSWORD; kept here for backward compatibility.
    password: process.env.KB_PASSWORD || 'guruKB2025',
  },
};
39
+
40
+ // ============================================================================
41
+ // Audit Dimensions
42
+ // ============================================================================
43
+
44
/**
 * Audits a single knowledge-base table across seven quality dimensions.
 *
 * Every `audit*` method runs queries through `client`, stores a
 * `{ score, dimension, details }` record in `this.scores`, and appends
 * findings to `this.issues` and suggested SQL to `this.fixes`.
 * `detectTable()` has to run first so that `fullTable`, `columns` and
 * `colMap` are populated; `runFullAudit()` takes care of that ordering.
 */
class KBAudit {
  /**
   * @param {object} client - Connected client exposing `query(sql, params)`.
   * @param {string} schema - Schema that contains the knowledge-base table.
   * @param {?string} [tableName=null] - Table to audit; auto-detected when omitted.
   */
  constructor(client, schema, tableName = null) {
    this.client = client;
    this.schema = schema;
    this.tableName = tableName;
    this.fullTable = null; // Set during detection
    this.columns = []; // Set during detection
    this.colMap = {}; // Set during detection
    this.scores = {};
    this.issues = [];
    this.fixes = [];
  }

  /**
   * Percentage of `part` in `total`, returning 0 instead of NaN for an empty total.
   *
   * @param {number} part
   * @param {number} total
   * @returns {number} Value in percent.
   */
  static percent(part, total) {
    return total > 0 ? (part / total) * 100 : 0;
  }

  /**
   * Clamps a dimension score at 0, stores it under `this.scores[key]` and returns it.
   *
   * @param {string} key - Property name in `this.scores`.
   * @param {string} dimension - Human-readable dimension name.
   * @param {number} score - Raw score after penalties.
   * @param {string[]} details - Informational lines for the report.
   * @param {object} [extra={}] - Additional fields to store with the score.
   * @returns {number} The clamped score.
   */
  recordScore(key, dimension, score, details, extra = {}) {
    const clamped = Math.max(0, score);
    this.scores[key] = { score: clamped, dimension, details, ...extra };
    return clamped;
  }

  /**
   * Counts rows whose `column` is neither NULL nor the empty string.
   *
   * @param {?string} column - Column name, or null when the table has no such column.
   * @returns {Promise<number>} Row count; 0 when `column` is null.
   */
  async countNonEmpty(column) {
    if (!column) {
      return 0;
    }
    const result = await this.client.query(`
      SELECT COUNT(*) as count FROM ${this.fullTable}
      WHERE ${column} IS NOT NULL AND ${column} != ''
    `);
    return parseInt(result.rows[0].count, 10);
  }

  /**
   * Picks the table to audit (unless one was given) and maps the logical
   * columns used by the audits onto the columns the table actually has.
   *
   * @returns {Promise<string>} The table name being audited.
   * @throws {Error} When the schema contains no base tables.
   */
  async detectTable() {
    // Auto-detect the main KB table
    const tables = await this.client.query(`
      SELECT table_name FROM information_schema.tables
      WHERE table_schema = $1
      AND table_type = 'BASE TABLE'
      ORDER BY
        CASE
          WHEN table_name = 'kb_entries' THEN 1
          WHEN table_name = 'architecture_docs' THEN 2
          WHEN table_name LIKE '%kb%' THEN 3
          WHEN table_name LIKE '%entries%' THEN 4
          ELSE 5
        END
    `, [this.schema]);

    if (tables.rows.length === 0) {
      throw new Error(`No tables found in schema ${this.schema}`);
    }

    this.tableName = this.tableName || tables.rows[0].table_name;
    this.fullTable = `${this.schema}.${this.tableName}`;
    console.log(chalk.gray(` Table: ${this.fullTable}\n`));

    // Detect column mappings
    const columns = await this.client.query(`
      SELECT column_name FROM information_schema.columns
      WHERE table_schema = $1 AND table_name = $2
    `, [this.schema, this.tableName]);

    this.columns = columns.rows.map(r => r.column_name);

    // Map standard columns to actual columns (null when neither variant exists)
    this.colMap = {
      hash: this.columns.includes('content_hash') ? 'content_hash' :
        this.columns.includes('file_hash') ? 'file_hash' : null,
      path: this.columns.includes('source_path') ? 'source_path' :
        this.columns.includes('file_path') ? 'file_path' : null,
      expert: this.columns.includes('source_expert') ? 'source_expert' :
        this.columns.includes('package_name') ? 'package_name' : null,
      url: this.columns.includes('source_url') ? 'source_url' : null,
      isDuplicate: this.columns.includes('is_duplicate') ? 'is_duplicate' : null,
    };

    return this.tableName;
  }

  // ---------------------------------------------------------------------------
  // 1. Embedding Quality (0-100)
  // ---------------------------------------------------------------------------
  /**
   * Penalizes missing embeddings, mixed embedding dimensions and low variance
   * in the first embedding component.
   *
   * @returns {Promise<number>} Dimension score, clamped at 0.
   */
  async auditEmbeddings() {
    const dimension = 'Embedding Quality';
    let score = 100;
    const details = [];

    // Check for null embeddings
    const nullEmbeddings = await this.client.query(`
      SELECT COUNT(*) as count FROM ${this.fullTable}
      WHERE embedding IS NULL OR array_length(embedding, 1) IS NULL
    `);
    const nullCount = parseInt(nullEmbeddings.rows[0].count, 10);

    const totalResult = await this.client.query(`
      SELECT COUNT(*) as count FROM ${this.fullTable}
    `);
    const total = parseInt(totalResult.rows[0].count, 10);

    if (nullCount > 0) {
      const nullPct = KBAudit.percent(nullCount, total);
      score -= Math.min(50, nullPct);
      this.issues.push({
        dimension,
        severity: 'HIGH',
        message: `${nullCount} entries (${nullPct.toFixed(1)}%) have NULL embeddings`,
      });
      this.fixes.push({
        dimension,
        sql: `UPDATE ${this.fullTable} SET embedding = ruvector_embed(content) WHERE embedding IS NULL`,
        description: 'Generate embeddings for entries missing them',
      });
    }

    // Check embedding dimensions consistency
    const dimCheck = await this.client.query(`
      SELECT array_length(embedding, 1) as dims, COUNT(*) as count
      FROM ${this.fullTable}
      WHERE embedding IS NOT NULL
      GROUP BY array_length(embedding, 1)
    `);

    if (dimCheck.rows.length > 1) {
      score -= 20;
      this.issues.push({
        dimension,
        severity: 'HIGH',
        message: `Inconsistent embedding dimensions: ${dimCheck.rows.map(r => `${r.dims}D (${r.count})`).join(', ')}`,
      });
      details.push(`Multiple dimensions found - re-embed with consistent model`);
    } else if (dimCheck.rows.length === 1) {
      details.push(`Consistent ${dimCheck.rows[0].dims}D embeddings`);
    }

    // Check embedding distribution (variance)
    const varianceCheck = await this.client.query(`
      SELECT
        AVG(embedding[1]) as mean_dim1,
        STDDEV(embedding[1]) as std_dim1
      FROM ${this.fullTable}
      WHERE embedding IS NOT NULL
      LIMIT 10000
    `);

    if (varianceCheck.rows[0].std_dim1 !== null) {
      const std = parseFloat(varianceCheck.rows[0].std_dim1);
      if (std < 0.1) {
        score -= 15;
        this.issues.push({
          dimension,
          severity: 'MEDIUM',
          message: `Low embedding variance (${std.toFixed(3)}) - may indicate poor model or data quality`,
        });
      } else {
        details.push(`Healthy embedding variance: ${std.toFixed(3)}`);
      }
    }

    return this.recordScore('embeddings', dimension, score, details);
  }

  // ---------------------------------------------------------------------------
  // 2. Deduplication (0-100)
  // ---------------------------------------------------------------------------
  /**
   * Penalizes repeated content hashes, rows without a hash, and repeated titles.
   * When the table has no hash column at all, hash checks are skipped and
   * reported as an issue instead of failing the query.
   *
   * @returns {Promise<number>} Dimension score, clamped at 0.
   */
  async auditDeduplication() {
    const dimension = 'Deduplication';
    let score = 100;
    const details = [];

    const hashCol = this.colMap.hash;

    // Check if we have built-in duplicate tracking
    if (this.colMap.isDuplicate) {
      const dupCount = await this.client.query(`
        SELECT COUNT(*) as count FROM ${this.fullTable}
        WHERE ${this.colMap.isDuplicate} = true
      `);
      const dups = parseInt(dupCount.rows[0].count, 10);
      if (dups > 0) {
        details.push(`${dups} entries marked as duplicates (already handled)`);
      }
    }

    if (!hashCol) {
      // No hash column was detected; querying a guessed name would abort the whole audit.
      score -= 20;
      this.issues.push({
        dimension,
        severity: 'HIGH',
        message: 'No content_hash/file_hash column found (cannot deduplicate)',
      });
    } else {
      // Check for duplicate content hashes
      const hashCheck = await this.client.query(`
        SELECT ${hashCol}, COUNT(*) as count
        FROM ${this.fullTable}
        WHERE ${hashCol} IS NOT NULL
        GROUP BY ${hashCol}
        HAVING COUNT(*) > 1
        LIMIT 50
      `);

      if (hashCheck.rows.length > 0) {
        const totalDups = hashCheck.rows.reduce((sum, r) => sum + parseInt(r.count, 10) - 1, 0);
        score -= Math.min(40, totalDups / 10);
        this.issues.push({
          dimension,
          severity: 'MEDIUM',
          message: `${totalDups} duplicate entries found (${hashCheck.rows.length} unique hashes with duplicates)`,
        });
        this.fixes.push({
          dimension,
          sql: `
            DELETE FROM ${this.fullTable} a
            USING ${this.fullTable} b
            WHERE a.${hashCol} = b.${hashCol}
            AND a.id > b.id
            AND a.quality_score <= b.quality_score`,
          description: 'Remove duplicate entries (keeps higher quality)',
        });
      } else {
        details.push('No duplicate content detected');
      }

      // Check for missing content hashes
      const missingHash = await this.client.query(`
        SELECT COUNT(*) as count FROM ${this.fullTable}
        WHERE ${hashCol} IS NULL
      `);
      const missingCount = parseInt(missingHash.rows[0].count, 10);

      if (missingCount > 0) {
        score -= 20;
        this.issues.push({
          dimension,
          severity: 'HIGH',
          message: `${missingCount} entries missing ${hashCol} (cannot deduplicate)`,
        });
        this.fixes.push({
          dimension,
          sql: `
            UPDATE ${this.fullTable}
            SET ${hashCol} = encode(sha256(lower(regexp_replace(content, '\\s+', ' ', 'g'))::bytea), 'hex')
            WHERE ${hashCol} IS NULL`,
          description: 'Generate content hashes for deduplication',
        });
      }
    }

    // Check for near-duplicates (similar titles)
    const titleDups = await this.client.query(`
      SELECT title, COUNT(*) as count
      FROM ${this.fullTable}
      GROUP BY title
      HAVING COUNT(*) > 1
      LIMIT 20
    `);

    if (titleDups.rows.length > 0) {
      score -= 10;
      this.issues.push({
        dimension,
        severity: 'LOW',
        message: `${titleDups.rows.length} titles appear multiple times (may be near-duplicates)`,
      });
      details.push(`Example: "${titleDups.rows[0].title}" (${titleDups.rows[0].count}x)`);
    }

    return this.recordScore('deduplication', dimension, score, details);
  }

  // ---------------------------------------------------------------------------
  // 3. Category Coverage (0-100)
  // ---------------------------------------------------------------------------
  /**
   * Penalizes a large uncategorized share, an imbalanced category
   * distribution (Gini coefficient above 0.5) and fewer than five categories.
   *
   * @returns {Promise<number>} Dimension score, clamped at 0.
   */
  async auditCategories() {
    const dimension = 'Category Coverage';
    let score = 100;
    const details = [];

    // Get category distribution
    const catDist = await this.client.query(`
      SELECT
        COALESCE(category, 'uncategorized') as category,
        COUNT(*) as count
      FROM ${this.fullTable}
      GROUP BY category
      ORDER BY count DESC
    `);

    const total = catDist.rows.reduce((sum, r) => sum + parseInt(r.count, 10), 0);
    const categories = catDist.rows.map(r => ({
      name: r.category,
      count: parseInt(r.count, 10),
      pct: KBAudit.percent(parseInt(r.count, 10), total).toFixed(1),
    }));

    // Check for uncategorized
    const uncategorized = categories.find(c => c.name === 'uncategorized' || c.name === 'general');
    if (uncategorized && parseFloat(uncategorized.pct) > 30) {
      score -= 20;
      this.issues.push({
        dimension,
        severity: 'MEDIUM',
        message: `${uncategorized.pct}% entries are uncategorized/general`,
      });
      this.fixes.push({
        dimension,
        sql: `-- Re-categorize based on content patterns (requires custom logic)`,
        description: 'Run categorization algorithm on uncategorized entries',
      });
    }

    // Check category balance (Gini coefficient)
    const sortedCounts = categories.map(c => c.count).sort((a, b) => a - b);
    const n = sortedCounts.length;
    let gini = 0;
    if (n > 1 && total > 0) {
      // Running totals of the ascending counts: each entry adds the PREVIOUS
      // running total (index i - 1), so the last entry equals `total`.
      const cumSum = sortedCounts.reduce((acc, v, i) => {
        acc.push((i > 0 ? acc[i - 1] : 0) + v);
        return acc;
      }, []);
      const sumOfSums = cumSum.reduce((a, b) => a + b, 0);
      gini = (n + 1 - 2 * sumOfSums / cumSum[n - 1]) / n;
    }

    if (gini > 0.5) {
      score -= Math.min(25, gini * 30);
      this.issues.push({
        dimension,
        severity: 'MEDIUM',
        message: `Imbalanced category distribution (Gini: ${gini.toFixed(2)})`,
      });
      details.push(`Top category "${categories[0].name}" has ${categories[0].pct}% of entries`);
    } else {
      details.push(`Category balance is good (Gini: ${gini.toFixed(2)})`);
    }

    // Check minimum categories
    if (categories.length < 5) {
      score -= 15;
      this.issues.push({
        dimension,
        severity: 'LOW',
        message: `Only ${categories.length} categories - consider more granular categorization`,
      });
    }

    details.push(`${categories.length} categories: ${categories.slice(0, 5).map(c => `${c.name}(${c.count})`).join(', ')}`);

    return this.recordScore('categories', dimension, score, details, { breakdown: categories });
  }

  // ---------------------------------------------------------------------------
  // 4. Structural Integrity (0-100)
  // ---------------------------------------------------------------------------
  /**
   * Penalizes low coverage of navigation paths, source attribution and
   * source URLs, and a high share of orphan entries. Columns that were not
   * detected on the table count as having zero populated rows.
   *
   * @returns {Promise<number>} Dimension score, clamped at 0.
   */
  async auditStructure() {
    const dimension = 'Structural Integrity';
    let score = 100;
    const details = [];

    const pathCol = this.colMap.path;
    const expertCol = this.colMap.expert;
    const urlCol = this.colMap.url;

    const totalResult = await this.client.query(`SELECT COUNT(*) as count FROM ${this.fullTable}`);
    const total = parseInt(totalResult.rows[0].count, 10);

    // Check for hierarchical structure (path field)
    const pathCount = await this.countNonEmpty(pathCol);
    const pathPct = KBAudit.percent(pathCount, total).toFixed(0);

    if (pathCount < total * 0.5) {
      score -= 15;
      this.issues.push({
        dimension,
        severity: 'MEDIUM',
        message: `Only ${pathPct}% entries have navigation paths`,
      });
    } else {
      details.push(`${pathPct}% entries have paths`);
    }

    // Check for source attribution
    const expertCount = await this.countNonEmpty(expertCol);
    const expertPct = KBAudit.percent(expertCount, total).toFixed(0);

    if (expertCount < total * 0.3) {
      score -= 15;
      this.issues.push({
        dimension,
        severity: 'MEDIUM',
        message: `Only ${expertPct}% entries have source attribution`,
      });
      this.fixes.push({
        dimension,
        description: 'Add source attribution field to entries during ingestion',
        sql: `-- Requires manual attribution or extraction from content`,
      });
    } else {
      details.push(`${expertPct}% entries have source attribution`);
    }

    // Check for URLs (if column exists)
    if (urlCol) {
      const urlCount = await this.countNonEmpty(urlCol);

      if (urlCount < total * 0.2) {
        score -= 10;
        this.issues.push({
          dimension,
          severity: 'LOW',
          message: `Only ${KBAudit.percent(urlCount, total).toFixed(0)}% entries have source URLs`,
        });
      }
    }

    // Check for orphan entries (no category, no path).
    // Without a path column every row counts as "no path".
    const noPathClause = pathCol ? `(${pathCol} IS NULL OR ${pathCol} = '')` : 'TRUE';
    const orphans = await this.client.query(`
      SELECT COUNT(*) as count FROM ${this.fullTable}
      WHERE (category IS NULL OR category = 'general')
      AND ${noPathClause}
    `);
    const orphanCount = parseInt(orphans.rows[0].count, 10);

    if (orphanCount > total * 0.1) {
      score -= 15;
      this.issues.push({
        dimension,
        severity: 'MEDIUM',
        message: `${orphanCount} orphan entries (no category or path)`,
      });
    }

    return this.recordScore('structure', dimension, score, details);
  }

  // ---------------------------------------------------------------------------
  // 5. Content Quality (0-100)
  // ---------------------------------------------------------------------------
  /**
   * Penalizes very short or very long content and low `quality_score` values.
   *
   * @returns {Promise<number>} Dimension score, clamped at 0.
   */
  async auditContent() {
    const dimension = 'Content Quality';
    let score = 100;
    const details = [];

    // Length distribution
    const lengthStats = await this.client.query(`
      SELECT
        AVG(length(content)) as avg_len,
        MIN(length(content)) as min_len,
        MAX(length(content)) as max_len,
        STDDEV(length(content)) as std_len,
        COUNT(*) FILTER (WHERE length(content) < 100) as too_short,
        COUNT(*) FILTER (WHERE length(content) > 50000) as too_long,
        COUNT(*) as total
      FROM ${this.fullTable}
    `);

    const stats = lengthStats.rows[0];
    const contentTotal = parseInt(stats.total, 10);
    const tooShortPct = KBAudit.percent(parseInt(stats.too_short, 10), contentTotal);
    const tooLongPct = KBAudit.percent(parseInt(stats.too_long, 10), contentTotal);

    if (tooShortPct > 10) {
      score -= Math.min(20, tooShortPct);
      this.issues.push({
        dimension,
        severity: 'MEDIUM',
        message: `${tooShortPct.toFixed(1)}% entries are too short (<100 chars)`,
      });
      this.fixes.push({
        dimension,
        sql: `DELETE FROM ${this.fullTable} WHERE length(content) < 100`,
        description: 'Remove entries too short to be useful',
      });
    }

    if (tooLongPct > 5) {
      score -= Math.min(10, tooLongPct);
      this.issues.push({
        dimension,
        severity: 'LOW',
        message: `${tooLongPct.toFixed(1)}% entries are very long (>50K chars) - consider chunking`,
      });
    }

    // AVG() is NULL on an empty table; report 0 rather than NaN.
    const avgLen = Math.trunc(parseFloat(stats.avg_len)) || 0;
    details.push(`Avg length: ${avgLen.toLocaleString()} chars`);

    // Quality score distribution
    const qualityStats = await this.client.query(`
      SELECT
        AVG(quality_score) as avg_score,
        COUNT(*) FILTER (WHERE quality_score < 40) as low_quality,
        COUNT(*) FILTER (WHERE quality_score >= 80) as high_quality,
        COUNT(*) as total
      FROM ${this.fullTable}
      WHERE quality_score IS NOT NULL
    `);

    const qStats = qualityStats.rows[0];
    if (qStats.avg_score !== null) {
      const avgQuality = parseFloat(qStats.avg_score);
      const lowQualityPct = KBAudit.percent(parseInt(qStats.low_quality, 10), parseInt(qStats.total, 10));

      if (avgQuality < 50) {
        score -= 20;
        this.issues.push({
          dimension,
          severity: 'HIGH',
          message: `Average quality score is only ${avgQuality.toFixed(0)}/100`,
        });
      }

      if (lowQualityPct > 20) {
        score -= 15;
        this.issues.push({
          dimension,
          severity: 'MEDIUM',
          message: `${lowQualityPct.toFixed(0)}% entries have low quality scores (<40)`,
        });
      }

      details.push(`Avg quality: ${avgQuality.toFixed(0)}/100, High quality: ${parseInt(qStats.high_quality, 10)}`);
    } else {
      score -= 30;
      this.issues.push({
        dimension,
        severity: 'HIGH',
        message: 'No quality scores - run quality scoring on entries',
      });
    }

    return this.recordScore('content', dimension, score, details);
  }

  // ---------------------------------------------------------------------------
  // 6. Recall Performance (0-100)
  // ---------------------------------------------------------------------------
  /**
   * Runs a fixed set of probe queries and checks that the expected category
   * appears in the top five results; also checks for embedding-related indexes.
   * If a probe query throws (e.g. `ruvector_embed()` is unavailable), recall
   * is reported as untestable with a single fixed penalty.
   *
   * @returns {Promise<number>} Dimension score, clamped at 0.
   */
  async auditRecall() {
    const dimension = 'Recall Performance';
    let score = 100;
    const details = [];

    // Test queries with expected results
    const testQueries = [
      { query: 'agent orchestration swarm', expected_category: 'agents' },
      { query: 'vector embedding semantic search', expected_category: 'embeddings' },
      { query: 'authentication security token', expected_category: 'security' },
      { query: 'deployment docker kubernetes', expected_category: 'deployment' },
      { query: 'api endpoint rest graphql', expected_category: 'api' },
    ];

    let passedTests = 0;
    let recallTestable = true;

    for (const test of testQueries) {
      try {
        // Use cosine_distance for ruvector-postgres (real[] embeddings)
        const result = await this.client.query(`
          SELECT category, title,
            cosine_distance(embedding, ruvector_embed($1)::real[]) as distance
          FROM ${this.fullTable}
          WHERE embedding IS NOT NULL
          ORDER BY cosine_distance(embedding, ruvector_embed($1)::real[])
          LIMIT 5
        `, [test.query]);

        if (result.rows.length > 0) {
          const topCategory = result.rows[0].category;
          const topCategories = result.rows.map(r => r.category);

          if (topCategories.includes(test.expected_category)) {
            passedTests++;
          } else {
            details.push(`Query "${test.query}" returned ${topCategory} instead of ${test.expected_category}`);
          }
        }
      } catch (e) {
        // ruvector_embed may not exist
        recallTestable = false;
        score -= 20;
        this.issues.push({
          dimension,
          severity: 'HIGH',
          message: `Cannot test recall - ruvector_embed() function missing or error: ${e.message}`,
        });
        break;
      }
    }

    // Only grade the recall rate when the probes could actually run; otherwise
    // the "cannot test" penalty above would be stacked with a bogus 0% recall.
    if (recallTestable) {
      const recallRate = (passedTests / testQueries.length) * 100;
      if (recallRate < 80) {
        score -= (80 - recallRate);
        this.issues.push({
          dimension,
          severity: recallRate < 50 ? 'HIGH' : 'MEDIUM',
          message: `Recall rate: ${recallRate.toFixed(0)}% (${passedTests}/${testQueries.length} tests passed)`,
        });
      } else {
        details.push(`Recall rate: ${recallRate.toFixed(0)}% - good performance`);
      }
    }

    // Check for supporting indexes (ruvector uses real[] without HNSW)
    // Note: ruvector-postgres uses brute-force cosine search which is fast for <500K entries
    const indexCheck = await this.client.query(`
      SELECT indexname FROM pg_indexes
      WHERE tablename = $1
      AND schemaname = $2
      AND (indexdef LIKE '%embedding%' OR indexdef LIKE '%is_duplicate%')
    `, [this.tableName, this.schema]);

    if (indexCheck.rows.length === 0) {
      score -= 10;
      this.issues.push({
        dimension,
        severity: 'LOW',
        message: 'No embedding filter indexes - consider adding partial indexes',
      });
      this.fixes.push({
        dimension,
        // IF NOT EXISTS: the index audit may suggest the same index.
        sql: `CREATE INDEX IF NOT EXISTS idx_${this.schema}_has_embedding ON ${this.fullTable}(id) WHERE embedding IS NOT NULL`,
        description: 'Create partial index for entries with embeddings',
      });
    } else {
      details.push(`${indexCheck.rows.length} embedding-related indexes present`);
    }

    // Note about ruvector architecture
    details.push('Using ruvector cosine_distance() for semantic search (real[] arrays)');

    return this.recordScore('recall', dimension, score, details);
  }

  // ---------------------------------------------------------------------------
  // 7. Index Optimization (0-100)
  // ---------------------------------------------------------------------------
  /**
   * Penalizes missing supporting indexes (embedding filter, category, hash,
   * quality score) and reports table/index sizes and embedding dimensionality.
   *
   * @returns {Promise<number>} Dimension score, clamped at 0.
   */
  async auditIndexes() {
    const dimension = 'Index Optimization';
    let score = 100;
    const details = [];

    // Check all indexes
    const indexes = await this.client.query(`
      SELECT indexname, indexdef
      FROM pg_indexes
      WHERE schemaname = $1
      AND tablename = $2
    `, [this.schema, this.tableName]);

    // ruvector-postgres uses real[] arrays with cosine_distance() function
    // HNSW requires pgvector extension which is not available in ruvector
    const hashCol = this.colMap.hash || 'content_hash';
    const definitions = indexes.rows.map(r => r.indexdef);
    const hasEmbeddingIdx = definitions.some(def => def.includes('embedding'));
    const hasCategory = definitions.some(def => def.includes('category'));
    const hasHash = definitions.some(def => def.includes(hashCol) || def.includes('file_hash'));
    const hasQuality = definitions.some(def => def.includes('quality_score'));
    const hasNonDuplicate = definitions.some(def => def.includes('is_duplicate'));
    // Reported for information only; detected from the index access method.
    const hasHnsw = definitions.some(def => /using\s+hnsw/i.test(def));

    if (!hasEmbeddingIdx && !hasNonDuplicate) {
      score -= 15;
      this.issues.push({ dimension, severity: 'MEDIUM', message: 'Missing embedding filter indexes' });
      this.fixes.push({
        dimension,
        sql: `CREATE INDEX IF NOT EXISTS idx_${this.schema}_has_embedding ON ${this.fullTable}(id) WHERE embedding IS NOT NULL`,
        description: 'Create partial index for embedding existence filter',
      });
    }

    if (!hasCategory) {
      score -= 10;
      this.fixes.push({
        dimension,
        sql: `CREATE INDEX IF NOT EXISTS idx_${this.schema}_kb_category ON ${this.fullTable}(category)`,
        description: 'Create category index for filtered queries',
      });
    }

    if (!hasHash) {
      score -= 10;
      this.fixes.push({
        dimension,
        sql: `CREATE UNIQUE INDEX IF NOT EXISTS idx_${this.schema}_kb_hash ON ${this.fullTable}(${hashCol})`,
        description: 'Create hash index for deduplication',
      });
    }

    if (!hasQuality) {
      score -= 5;
      this.fixes.push({
        dimension,
        sql: `CREATE INDEX IF NOT EXISTS idx_${this.schema}_kb_quality ON ${this.fullTable}(quality_score DESC)`,
        description: 'Create quality score index for filtering',
      });
    }

    // Check table statistics
    const tableStats = await this.client.query(`
      SELECT
        pg_size_pretty(pg_total_relation_size('${this.fullTable}')) as total_size,
        pg_size_pretty(pg_table_size('${this.fullTable}')) as table_size,
        pg_size_pretty(pg_indexes_size('${this.fullTable}')) as index_size
    `);

    const stats = tableStats.rows[0];
    details.push(`Table size: ${stats.table_size}, Index size: ${stats.index_size}`);
    details.push(`Indexes: ${indexes.rows.length} (HNSW: ${hasHnsw ? 'yes' : 'no'})`);

    // Check for quantization readiness
    const embeddingDims = await this.client.query(`
      SELECT array_length(embedding, 1) as dims
      FROM ${this.fullTable}
      WHERE embedding IS NOT NULL
      LIMIT 1
    `);

    if (embeddingDims.rows.length > 0) {
      const dims = embeddingDims.rows[0].dims;
      details.push(`Embedding dims: ${dims} (${dims === 384 ? 'optimal for quantization' : dims > 768 ? 'consider dimensionality reduction' : 'OK'})`);
    }

    return this.recordScore('indexes', dimension, score, details);
  }

  // ---------------------------------------------------------------------------
  // Run Full Audit
  // ---------------------------------------------------------------------------
  /**
   * Detects the table, runs all seven audits in order and prints the report.
   *
   * @returns {Promise<{overallScore: number, grade: string, scores: object, issues: object[], fixes: object[]}>}
   *   The rounded mean of the dimension scores, its letter grade, and the collected findings.
   */
  async runFullAudit() {
    console.log(chalk.cyan('\n╔════════════════════════════════════════════════════════════════╗'));
    console.log(chalk.cyan('║ KB Quality Audit & Optimization Report ║'));
    console.log(chalk.cyan('╚════════════════════════════════════════════════════════════════╝\n'));

    console.log(chalk.gray(` Schema: ${this.schema}`));
    console.log(chalk.gray(` Time: ${new Date().toISOString()}`));

    // Auto-detect table structure
    await this.detectTable();

    // Run all audits
    await this.auditEmbeddings();
    await this.auditDeduplication();
    await this.auditCategories();
    await this.auditStructure();
    await this.auditContent();
    await this.auditRecall();
    await this.auditIndexes();

    // Calculate overall score
    const dimensions = Object.values(this.scores);
    const overallScore = Math.round(
      dimensions.reduce((sum, d) => sum + d.score, 0) / dimensions.length
    );

    // Print dimension scores
    console.log(chalk.white.bold(' DIMENSION SCORES'));
    console.log(chalk.gray(' ──────────────────────────────────────────────────────────────\n'));

    for (const dim of dimensions) {
      const bar = this.renderBar(dim.score);
      const color = dim.score >= 80 ? chalk.green : dim.score >= 60 ? chalk.yellow : chalk.red;
      // Penalties can be fractional; round for display so the column stays aligned.
      console.log(` ${dim.dimension.padEnd(22)} ${bar} ${color(String(Math.round(dim.score)).padStart(3))}/100`);

      if (dim.details && dim.details.length > 0) {
        for (const detail of dim.details) {
          console.log(chalk.gray(` └─ ${detail}`));
        }
      }
    }

    // Overall score
    console.log(chalk.gray('\n ──────────────────────────────────────────────────────────────'));
    const overallBar = this.renderBar(overallScore);
    const overallColor = overallScore >= 80 ? chalk.green : overallScore >= 60 ? chalk.yellow : chalk.red;
    console.log(` ${chalk.bold('OVERALL SCORE'.padEnd(22))} ${overallBar} ${overallColor.bold(overallScore.toString().padStart(3))}/100`);
    console.log(chalk.gray(' ──────────────────────────────────────────────────────────────\n'));

    // Print issues
    if (this.issues.length > 0) {
      console.log(chalk.white.bold(' ISSUES FOUND'));
      console.log(chalk.gray(' ──────────────────────────────────────────────────────────────\n'));

      const highIssues = this.issues.filter(i => i.severity === 'HIGH');
      const mediumIssues = this.issues.filter(i => i.severity === 'MEDIUM');
      const lowIssues = this.issues.filter(i => i.severity === 'LOW');

      for (const issue of highIssues) {
        console.log(chalk.red(` ✗ [HIGH] ${issue.message}`));
      }
      for (const issue of mediumIssues) {
        console.log(chalk.yellow(` ⚠ [MED] ${issue.message}`));
      }
      for (const issue of lowIssues) {
        console.log(chalk.blue(` ○ [LOW] ${issue.message}`));
      }
      console.log();
    }

    // Print fixes
    if (this.fixes.length > 0) {
      console.log(chalk.white.bold(' RECOMMENDED FIXES'));
      console.log(chalk.gray(' ──────────────────────────────────────────────────────────────\n'));

      for (let i = 0; i < this.fixes.length; i++) {
        const fix = this.fixes[i];
        console.log(chalk.cyan(` ${i + 1}. ${fix.description}`));
        // Multi-line SQL starts with a newline; trim first so the first statement line is shown.
        console.log(chalk.gray(` ${fix.sql.trim().split('\n')[0].trim()}`));
        console.log();
      }
    }

    // Grade
    const grade = overallScore >= 95 ? 'A+' :
      overallScore >= 90 ? 'A' :
      overallScore >= 85 ? 'A-' :
      overallScore >= 80 ? 'B+' :
      overallScore >= 75 ? 'B' :
      overallScore >= 70 ? 'B-' :
      overallScore >= 65 ? 'C+' :
      overallScore >= 60 ? 'C' :
      overallScore >= 55 ? 'C-' :
      overallScore >= 50 ? 'D' : 'F';

    console.log(chalk.gray(' ══════════════════════════════════════════════════════════════'));
    console.log(` ${chalk.bold('GRADE:')} ${overallColor.bold(grade)} | ${chalk.bold('Score:')} ${overallColor(overallScore)}/100 | ${chalk.bold('Issues:')} ${this.issues.length}`);
    console.log(chalk.gray(' ══════════════════════════════════════════════════════════════\n'));

    return { overallScore, grade, scores: this.scores, issues: this.issues, fixes: this.fixes };
  }

  /**
   * Renders a 20-cell progress bar for a 0-100 score.
   *
   * @param {number} score - Score to visualize; values outside 0-100 are clamped for the bar length.
   * @returns {string} Colored bar string.
   */
  renderBar(score) {
    // Clamp so String.prototype.repeat never receives a negative count.
    const bounded = Math.min(100, Math.max(0, score));
    const filled = Math.round(bounded / 5);
    const empty = 20 - filled;
    const color = score >= 80 ? chalk.green : score >= 60 ? chalk.yellow : chalk.red;
    return color('█'.repeat(filled)) + chalk.gray('░'.repeat(empty));
  }
}
861
+
862
+ // ============================================================================
863
+ // Apply Fixes
864
+ // ============================================================================
865
+
866
/**
 * Executes the SQL of every recommended fix, one after another.
 *
 * Fixes without SQL, or whose SQL contains a `--` comment marker (used for
 * manual-action placeholders), are skipped. A failing statement is logged
 * and does not stop the remaining fixes.
 *
 * @param {object} client - Connected client exposing `query(sql)`.
 * @param {string} schema - Audited schema (currently unused).
 * @param {Array<{sql?: string, description: string}>} fixes - Fixes to apply.
 * @returns {Promise<void>}
 */
async function applyFixes(client, schema, fixes) {
  console.log(chalk.cyan('\n Applying fixes...\n'));

  const isRunnable = (candidate) => Boolean(candidate.sql) && !candidate.sql.includes('--');

  for (const candidate of fixes) {
    if (!isRunnable(candidate)) {
      continue;
    }

    console.log(chalk.gray(` → ${candidate.description}`));
    try {
      await client.query(candidate.sql);
      console.log(chalk.green(` ✓ Applied successfully`));
    } catch (e) {
      console.log(chalk.red(` ✗ Failed: ${e.message}`));
    }
  }
}
881
+
882
+ // ============================================================================
883
+ // CLI
884
+ // ============================================================================
885
+
886
/**
 * Reads the command-line flags from `process.argv`.
 *
 * Unknown arguments are ignored. `--help`/`-h` prints usage and exits the
 * process with status 0.
 *
 * @returns {{schema: (string|null|undefined), fix: boolean, detailed: boolean}}
 *   Parsed options; `schema` stays null when the flag is absent.
 */
function parseArgs() {
  const argv = process.argv.slice(2);
  const options = { schema: null, fix: false, detailed: false };

  let index = 0;
  while (index < argv.length) {
    const flag = argv[index];

    if (flag === '--schema' || flag === '-s') {
      // The schema name is the argument that follows the flag.
      index += 1;
      options.schema = argv[index];
    } else if (flag === '--fix') {
      options.fix = true;
    } else if (flag === '--detailed' || flag === '-d') {
      options.detailed = true;
    } else if (flag === '--help' || flag === '-h') {
      console.log(`
KB Quality Audit - Score and optimize any knowledge base

Usage:
node scripts/kb-quality-audit.js --schema <name> [options]

Options:
--schema, -s <name> Schema to audit (required)
--fix Apply recommended fixes automatically
--detailed, -d Show detailed breakdown
--help, -h Show this help
`);
      process.exit(0);
    }

    index += 1;
  }

  return options;
}
923
+
924
+ // ============================================================================
925
+ // Main
926
+ // ============================================================================
927
+
928
/**
 * CLI entry point: parses arguments, connects to the database, runs the full
 * audit and, with `--fix`, applies the recommended fixes.
 *
 * Failures set `process.exitCode` to 1 instead of calling `process.exit()`,
 * so the `finally` block still closes the database connection before the
 * process ends.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const options = parseArgs();

  if (!options.schema) {
    console.error('Error: --schema is required');
    process.exitCode = 1;
    return;
  }

  const client = new pg.Client(CONFIG.db);

  try {
    await client.connect();

    const audit = new KBAudit(client, options.schema);
    const result = await audit.runFullAudit();

    if (options.fix && result.fixes.length > 0) {
      await applyFixes(client, options.schema, result.fixes);
    }
  } catch (e) {
    console.error(chalk.red(`Error: ${e.message}`));
    // Not process.exit(1): that would terminate before `finally` runs.
    process.exitCode = 1;
  } finally {
    await client.end();
  }
}
955
+
956
// Run the CLI. main() reports audit errors itself; this handler covers
// rejections raised outside its try/catch (e.g. while closing the client).
main().catch((e) => {
  console.error(`Error: ${e.message}`);
  process.exitCode = 1;
});