ruvnet-kb-first 6.3.0 → 6.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +318 -533
- package/SKILL.md +139 -6
- package/package.json +1 -1
- package/scripts/kb-ingest-template.js +548 -0
- package/scripts/kb-optimize.sql +250 -0
- package/scripts/kb-quality-audit.js +956 -0
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
-- KB Optimization Script for ruvector-postgres (real[] embeddings)
|
|
2
|
+
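-- Run: PGPASSWORD="$KB_DB_PASSWORD" psql -h localhost -p 5435 -U postgres -f scripts/kb-optimize.sql
--      (export KB_DB_PASSWORD first, or use ~/.pgpass; do not commit real credentials to this file)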
|
|
3
|
+
--
|
|
4
|
+
-- This script is designed for ruvector-postgres which uses real[] arrays
|
|
5
|
+
-- instead of pgvector. It creates optimized indexes and functions for
|
|
6
|
+
-- semantic search with 100% recall.
|
|
7
|
+
|
|
8
|
+
\echo '╔════════════════════════════════════════════════════════════════╗'
\echo '║ KB Optimization Script (ruvector-postgres native) ║'
\echo '╚════════════════════════════════════════════════════════════════╝'

-- ═══════════════════════════════════════════════════════════════════════
-- Step 1: Create cosine_distance function for real[] arrays
-- ═══════════════════════════════════════════════════════════════════════
\echo ''
\echo 'Step 1: Creating cosine_distance function for real[] arrays...'

DROP FUNCTION IF EXISTS cosine_distance(real[], real[]);

-- cosine_distance(a, b): cosine distance (1 - cosine similarity) between two
-- real[] vectors, accumulated in double precision.
--
-- Parameters:
--   a, b  vectors of equal length (first dimension).
-- Returns:
--   * NULL when either argument is NULL (enforced by STRICT), when either
--     array is empty, or when the lengths differ.
--   * 1.0 when either vector has (near-)zero magnitude.
--   * otherwise a value in [0, 2]: 0 = same direction, 1 = orthogonal,
--     2 = opposite direction.
--   * NULL elements inside an array make the result NULL; NaN elements make
--     it NaN (both are passed through untouched).
CREATE OR REPLACE FUNCTION cosine_distance(a real[], b real[])
RETURNS double precision AS $$
DECLARE
    dot_product double precision := 0;
    norm_a double precision := 0;
    norm_b double precision := 0;
    len int;
    denominator double precision;
    result double precision;
BEGIN
    len := array_length(a, 1);

    -- array_length() is NULL for an empty array. IS DISTINCT FROM treats a
    -- NULL length on either side as a mismatch, so an empty b is rejected
    -- here instead of falling through the loop with NULL arithmetic.
    IF len IS NULL OR len IS DISTINCT FROM array_length(b, 1) THEN
        RETURN NULL;
    END IF;

    FOR i IN 1..len LOOP
        dot_product := dot_product + (a[i]::double precision * b[i]::double precision);
        norm_a := norm_a + (a[i]::double precision * a[i]::double precision);
        norm_b := norm_b + (b[i]::double precision * b[i]::double precision);
    END LOOP;

    -- Prevent division by zero
    denominator := sqrt(norm_a) * sqrt(norm_b);
    IF denominator < 1e-10 THEN
        RETURN 1.0;  -- Maximum distance for zero vectors
    END IF;

    result := 1.0 - (dot_product / denominator);

    -- sqrt() and the division round independently, so the raw result can land
    -- a few ULPs outside [0, 2] (e.g. about -2.2e-16 for {1,1,1} against
    -- itself). Clamp so identical vectors report exactly 0 and opposite
    -- vectors exactly 2.
    -- A NULL result skips both branches because comparisons with NULL are not
    -- true. NaN is excluded explicitly because PostgreSQL orders NaN above
    -- every other float value.
    IF result < 0.0 THEN
        RETURN 0.0;
    ELSIF result > 2.0 AND result <> 'NaN'::double precision THEN
        RETURN 2.0;
    END IF;

    RETURN result;
END;
$$ LANGUAGE plpgsql IMMUTABLE STRICT;

COMMENT ON FUNCTION cosine_distance(real[], real[]) IS
    'Compute cosine distance between two real[] vectors. Returns 0 for identical vectors, 1 for orthogonal, 2 for opposite.';

\echo '✓ cosine_distance function created'
|
|
56
|
+
|
|
57
|
+
-- ═══════════════════════════════════════════════════════════════════════
-- Step 2: Create semantic search function
-- ═══════════════════════════════════════════════════════════════════════
\echo ''
\echo 'Step 2: Creating semantic_search function...'

DROP FUNCTION IF EXISTS ask_ruvnet.semantic_search(text, int, double precision);

-- ask_ruvnet.semantic_search(query_text, limit_count, max_distance)
--
-- Embeds query_text with ruvector_embed() and returns the non-duplicate rows
-- of ask_ruvnet.architecture_docs whose embedding lies within max_distance
-- (cosine distance) of the query, nearest first.
--
-- Parameters:
--   query_text    text to embed and search for.
--   limit_count   maximum number of rows returned (default 10).
--   max_distance  inclusive cosine-distance cutoff (default 0.5).
-- Returns:
--   id, title, content, category, quality_score and the computed distance.
--   Rows whose distance is NULL (e.g. embedding length differs from the
--   query embedding) never satisfy the cutoff and are omitted.
CREATE OR REPLACE FUNCTION ask_ruvnet.semantic_search(
    query_text text,
    limit_count int DEFAULT 10,
    max_distance double precision DEFAULT 0.5
)
RETURNS TABLE (
    id integer,
    title text,
    content text,
    category text,
    quality_score integer,
    distance double precision
) AS $$
DECLARE
    query_embedding real[];
BEGIN
    -- Generate embedding for query using ruvector_embed
    query_embedding := ruvector_embed(query_text)::real[];

    RETURN QUERY
    SELECT
        scored.id,
        scored.title,
        scored.content,
        scored.category,
        scored.quality_score,
        scored.dist
    FROM (
        SELECT
            d.id,
            d.title,
            d.content,
            d.category,
            d.quality_score,
            cosine_distance(d.embedding, query_embedding) AS dist
        FROM ask_ruvnet.architecture_docs d
        WHERE d.embedding IS NOT NULL
          AND d.is_duplicate = false
        -- OFFSET 0 is a deliberate no-op: it stops the planner from
        -- flattening this subquery, which would copy the cosine_distance()
        -- call into both the outer filter and the output list and run the
        -- PL/pgSQL loop twice per row.
        OFFSET 0
    ) AS scored
    WHERE scored.dist <= max_distance
    -- id breaks ties so that LIMIT returns a stable set for equal distances.
    ORDER BY scored.dist, scored.id
    LIMIT limit_count;
END;
$$ LANGUAGE plpgsql;

COMMENT ON FUNCTION ask_ruvnet.semantic_search(text, int, double precision) IS
    'Semantic search using ruvector embeddings. Returns top matches with cosine distance.';

\echo '✓ semantic_search function created'
|
|
105
|
+
|
|
106
|
+
-- ═══════════════════════════════════════════════════════════════════════
-- Step 3: Create B-tree indexes for filtered queries
-- ═══════════════════════════════════════════════════════════════════════
\echo ''
\echo 'Step 3: Creating optimized indexes...'

-- Every index below is dropped and then recreated so that a re-run of this
-- script always leaves the definition written here in place.

-- Category + quality composite index for filtered queries
DROP INDEX IF EXISTS ask_ruvnet.idx_category_quality_active;
CREATE INDEX IF NOT EXISTS idx_category_quality_active
    ON ask_ruvnet.architecture_docs (category, quality_score DESC)
    WHERE is_duplicate = false AND quality_score >= 40;

-- Title search index
-- NOTE(review): idx_title_trgm is presumably the name used by an earlier
-- version of this index; its drop is kept so old installs are cleaned up.
DROP INDEX IF EXISTS ask_ruvnet.idx_title_trgm;
-- Also drop the current name: without this, CREATE INDEX IF NOT EXISTS would
-- silently keep whatever definition idx_title_search already has.
DROP INDEX IF EXISTS ask_ruvnet.idx_title_search;
CREATE INDEX IF NOT EXISTS idx_title_search
    ON ask_ruvnet.architecture_docs USING btree (title);

-- Non-duplicate filter index
DROP INDEX IF EXISTS ask_ruvnet.idx_non_duplicate;
CREATE INDEX IF NOT EXISTS idx_non_duplicate
    ON ask_ruvnet.architecture_docs (id)
    WHERE is_duplicate = false;

-- Embedding existence index (for filtering)
DROP INDEX IF EXISTS ask_ruvnet.idx_has_embedding;
CREATE INDEX IF NOT EXISTS idx_has_embedding
    ON ask_ruvnet.architecture_docs (id)
    WHERE embedding IS NOT NULL;

\echo '✓ Indexes created'
|
|
136
|
+
|
|
137
|
+
-- ═══════════════════════════════════════════════════════════════════════
-- Step 4: Create optimized KB view
-- ═══════════════════════════════════════════════════════════════════════
\echo ''
\echo 'Step 4: Creating optimized KB view...'

-- Rebuild the view from scratch. CASCADE also removes any object that
-- depends on the previous definition of ask_ruvnet.kb.
DROP VIEW IF EXISTS ask_ruvnet.kb CASCADE;

-- ask_ruvnet.kb: non-duplicate entries with quality_score >= 40, highest
-- quality first. package_name is exposed under the column name "source".
CREATE VIEW ask_ruvnet.kb AS
SELECT
    docs.id,
    docs.title,
    docs.content,
    docs.category,
    docs.quality_score,
    docs.package_name AS source,
    docs.embedding,
    docs.created_at
FROM ask_ruvnet.architecture_docs AS docs
WHERE docs.quality_score >= 40
  AND docs.is_duplicate = false
ORDER BY docs.quality_score DESC;

COMMENT ON VIEW ask_ruvnet.kb IS
    'Optimized view of high-quality, non-duplicate KB entries';

\echo '✓ KB view created'
|
|
163
|
+
|
|
164
|
+
-- ═══════════════════════════════════════════════════════════════════════
-- Step 5: Create category distribution materialized view
-- ═══════════════════════════════════════════════════════════════════════
\echo ''
\echo 'Step 5: Creating category stats materialized view...'

-- Recreated on every run; CASCADE drops dependents of the old snapshot,
-- including its unique index.
DROP MATERIALIZED VIEW IF EXISTS ask_ruvnet.category_stats CASCADE;

-- One row per category over non-duplicate entries: entry count, quality
-- average (one decimal) / minimum / maximum, and how many entries carry an
-- embedding. The snapshot is only as fresh as the last REFRESH.
CREATE MATERIALIZED VIEW ask_ruvnet.category_stats AS
SELECT
    docs.category,
    COUNT(*) AS entry_count,
    ROUND(AVG(docs.quality_score), 1) AS avg_quality,
    MIN(docs.quality_score) AS min_quality,
    MAX(docs.quality_score) AS max_quality,
    -- COUNT(column) counts only the rows where the column is non-NULL.
    COUNT(docs.embedding) AS with_embedding
FROM ask_ruvnet.architecture_docs AS docs
WHERE docs.is_duplicate = false
GROUP BY docs.category
ORDER BY entry_count DESC;

-- One row per category, enforced by a unique index on the snapshot.
CREATE UNIQUE INDEX idx_category_stats_cat
    ON ask_ruvnet.category_stats (category);

\echo '✓ Category stats materialized view created'
|
|
187
|
+
|
|
188
|
+
-- ═══════════════════════════════════════════════════════════════════════
-- Step 6: Vacuum and analyze
-- ═══════════════════════════════════════════════════════════════════════
\echo ''
\echo 'Step 6: Running VACUUM ANALYZE...'

-- Reclaim dead tuples and refresh planner statistics for the docs table
-- after the index work above.
VACUUM (ANALYZE) ask_ruvnet.architecture_docs;
|
|
194
|
+
|
|
195
|
+
-- ═══════════════════════════════════════════════════════════════════════
-- Step 7: Report statistics
-- ═══════════════════════════════════════════════════════════════════════
\echo ''
\echo '════════════════════════════════════════════════════════════════'
\echo ' OPTIMIZATION COMPLETE '
\echo '════════════════════════════════════════════════════════════════'
\echo ''

-- Summary metrics, computed in a single pass over architecture_docs and then
-- unpivoted into (metric, value) rows. "ord" pins the display order.
-- The to_char templates are wide enough for counts up to 999,999,999,999;
-- a template that is too narrow would print '#' characters instead of digits.
SELECT
    m.metric,
    m.value
FROM (
    SELECT
        COUNT(*) AS total_entries,
        COUNT(*) FILTER (WHERE is_duplicate = false) AS active_entries,
        COUNT(*) FILTER (WHERE is_duplicate = false AND quality_score >= 40) AS high_quality_entries,
        COUNT(*) FILTER (WHERE embedding IS NOT NULL) AS embedded_entries,
        COUNT(DISTINCT category) FILTER (WHERE is_duplicate = false) AS category_count
    FROM ask_ruvnet.architecture_docs
) AS s
CROSS JOIN LATERAL (
    VALUES
        (1, 'Total entries',          TO_CHAR(s.total_entries,        'FM999,999,999,999')),
        (2, 'Active (non-duplicate)', TO_CHAR(s.active_entries,       'FM999,999,999,999')),
        (3, 'High quality (>=40)',    TO_CHAR(s.high_quality_entries, 'FM999,999,999,999')),
        (4, 'With embeddings',        TO_CHAR(s.embedded_entries,     'FM999,999,999,999')),
        (5, 'Categories',             TO_CHAR(s.category_count,       'FM999,999,999,999'))
) AS m (ord, metric, value)
ORDER BY m.ord;

\echo ''
\echo 'Category distribution:'
-- Explicit columns and ordering: a materialized view does not promise to
-- return rows in the order of its defining query.
SELECT
    category,
    entry_count,
    avg_quality,
    min_quality,
    max_quality,
    with_embedding
FROM ask_ruvnet.category_stats
ORDER BY entry_count DESC, category;

\echo ''
\echo '════════════════════════════════════════════════════════════════'
\echo ' USAGE EXAMPLES '
\echo '════════════════════════════════════════════════════════════════'
\echo ''
\echo 'Semantic search:'
\echo ' SELECT * FROM ask_ruvnet.semantic_search(''how to create agents'', 5);'
\echo ''
\echo 'Direct cosine distance:'
\echo ' SELECT title, cosine_distance(embedding, (SELECT embedding FROM ask_ruvnet.architecture_docs WHERE id = 1)) as dist'
\echo ' FROM ask_ruvnet.architecture_docs WHERE embedding IS NOT NULL ORDER BY dist LIMIT 5;'
\echo ''
\echo 'Filtered by category:'
\echo ' SELECT * FROM ask_ruvnet.kb WHERE category = ''agents'' LIMIT 10;'
\echo ''
\echo 'Refresh category stats:'
\echo ' REFRESH MATERIALIZED VIEW ask_ruvnet.category_stats;'
\echo ''
|