@hasna/knowledge 0.2.13 → 0.2.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +30 -3
- package/bin/open-knowledge-mcp.js +661 -212
- package/bin/open-knowledge.js +150 -33
- package/docs/architecture/ai-native-knowledge-base.md +9 -0
- package/docs/architecture/hybrid-semantic-search.md +17 -0
- package/package.json +1 -1
- package/src/cli.ts +48 -4
- package/src/embeddings.ts +516 -0
- package/src/knowledge-db.ts +39 -1
- package/src/mcp.js +38 -0
- package/src/outbox-consume.ts +11 -2
- package/src/service.ts +30 -0
- package/src/workspace.ts +12 -0
|
@@ -0,0 +1,516 @@
|
|
|
1
|
+
import { createHash } from 'node:crypto';
|
|
2
|
+
import type { Database } from 'bun:sqlite';
|
|
3
|
+
import { migrateKnowledgeDb, openKnowledgeDb } from './knowledge-db';
|
|
4
|
+
import { assertProviderCredentials, parseModelRef, providerSettings, type AiProviderId } from './providers';
|
|
5
|
+
import { sourceProvenance, type KnowledgeProvenance } from './provenance';
|
|
6
|
+
import type { KnowledgeConfig } from './workspace';
|
|
7
|
+
|
|
8
|
+
/**
 * Knobs shared by every embedding entry point in this module
 * (`embedTexts`, `indexKnowledgeEmbeddings`, `searchVectorIndex`).
 */
export interface EmbeddingRuntimeOptions {
  /** Workspace configuration; an optional `embeddings` section on it supplies defaults. */
  config?: KnowledgeConfig;
  /** Environment map the provider API key is looked up in; `process.env` is used when omitted. */
  env?: Record<string, string | undefined>;
  /** Model reference. Omitted, `default`, and `embedding` all resolve to the configured default model. */
  modelRef?: string;
  /** Requested vector length; falls back to the config value, then `DEFAULT_EMBEDDING_DIMENSIONS`. */
  dimensions?: number;
  /** When true, vectors are derived from a SHA-256 digest of the text instead of calling the provider. */
  fake?: boolean;
  /** NOTE(review): no code visible in this file reads this option — confirm where it is consumed. */
  batchSize?: number;
  /** Passed through to `embedMany` as `maxParallelCalls`; falls back to the config value. */
  maxParallelCalls?: number;
}
|
|
17
|
+
|
|
18
|
+
/** Options accepted by `indexKnowledgeEmbeddings`. */
export interface EmbeddingIndexOptions extends EmbeddingRuntimeOptions {
  /** Path of the knowledge database that is migrated, read for candidates, and written to. */
  dbPath: string;
  /** Maximum chunks to embed in one run; defaults to 100 and is clamped to the range 1..1000. */
  limit?: number;
  /** When set, only chunks belonging to this source revision are considered. */
  sourceRevisionId?: string;
  /** Clock override; its ISO string is stored as the row timestamp. Defaults to the current time. */
  now?: Date;
}
|
|
24
|
+
|
|
25
|
+
/** Options accepted by `searchVectorIndex`. */
export interface EmbeddingSearchOptions extends EmbeddingRuntimeOptions {
  /** Path of the knowledge database whose vector index is searched. */
  dbPath: string;
  /** Free-text query; it is embedded and compared against the stored vectors. */
  query: string;
  /** Maximum results to return; defaults to 10 and is clamped to the range 1..100. */
  limit?: number;
}
|
|
30
|
+
|
|
31
|
+
/** Token accounting attached to embedding results. */
export interface EmbeddingUsage {
  /**
   * Tokens consumed by the embedding call: the provider-reported figure
   * (0 when absent), or a word-count based estimate in fake mode.
   */
  input_tokens: number;
}
|
|
34
|
+
|
|
35
|
+
/** Result of `embedTexts`: one vector per input text plus model details. */
export interface EmbeddingVectorResult {
  /** Provider part of the parsed model reference. */
  provider: AiProviderId;
  /** Model part of the parsed model reference. */
  model: string;
  /** Length of the first returned vector, or the requested size when no vector came back. */
  dimensions: number;
  /** Embedding vectors, positionally aligned with the input texts. */
  vectors: number[][];
  /** Token usage for the call. */
  usage: EmbeddingUsage;
}
|
|
42
|
+
|
|
43
|
+
/** Summary returned by `indexKnowledgeEmbeddings` for a single indexing run. */
export interface EmbeddingIndexResult {
  /** Provider the chunks were embedded with. */
  provider: AiProviderId;
  /** Model the chunks were embedded with. */
  model: string;
  /** Vector length reported for this run. */
  dimensions: number;
  /** Number of not-yet-indexed chunks selected for this run. */
  chunks_seen: number;
  /** Number of chunks reported as embedded. */
  chunks_embedded: number;
  /** Count reported by the upsert step for the `chunk_embeddings` table. */
  embeddings_upserted: number;
  /** Count reported by the upsert step for the `vector_index_entries` table. */
  vector_entries_upserted: number;
  /** Token usage for the run (zero when nothing needed embedding). */
  usage: EmbeddingUsage;
}
|
|
53
|
+
|
|
54
|
+
/** Snapshot of the local embedding tables, as produced by `embeddingIndexStatus`. */
export interface EmbeddingStatusResult {
  /** Row count of the `chunk_embeddings` table. */
  total_embeddings: number;
  /** Row count of the `vector_index_entries` table. */
  total_vector_entries: number;
  /** One entry per distinct (provider, model, dimensions) combination in `vector_index_entries`. */
  indexes: {
    provider: string;
    model: string;
    dimensions: number;
    /** Number of vector entries in this group. */
    entries: number;
    /** Most recent `updated_at` value within the group. */
    updated_at: string | null;
  }[];
}
|
|
65
|
+
|
|
66
|
+
/** Result of `searchVectorIndex`: the query echoed back plus scored chunks. */
export interface SemanticSearchResult {
  /** Provider used to embed the query. */
  provider: AiProviderId;
  /** Model used to embed the query. */
  model: string;
  /** Dimensions reported for the query embedding. */
  dimensions: number;
  /** The query text exactly as supplied by the caller. */
  query: string;
  /** Matching chunks, ordered by descending score. */
  results: {
    chunk_id: string;
    /** Cosine similarity between the query vector and the stored chunk vector. */
    score: number;
    /** Chunk text read from the `chunks` table. */
    text: string;
    source_uri: string | null;
    source_ref: string | null;
    revision: string | null;
    hash: string | null;
    /** Provenance object stored in the entry's metadata, or null when none is stored. */
    provenance: KnowledgeProvenance | null;
  }[];
}
|
|
82
|
+
|
|
83
|
+
/**
 * Row shape produced by `selectCandidateChunks`: a chunk joined with its
 * source revision and source. Joined columns are null when the LEFT JOIN
 * finds no matching row.
 */
interface CandidateChunk {
  /** `chunks.id`. */
  id: string;
  /** Chunk text that gets embedded. */
  text: string;
  token_count: number | null;
  start_offset: number | null;
  end_offset: number | null;
  /** Raw JSON metadata of the chunk; decoded with `parseJsonObject`. */
  metadata_json: string;
  source_revision_id: string | null;
  /** `source_revisions.revision`. */
  revision: string | null;
  /** `source_revisions.hash`. */
  hash: string | null;
  /** `sources.uri`. */
  source_uri: string | null;
  /** `sources.kind`. */
  source_kind: string | null;
}
|
|
96
|
+
|
|
97
|
+
/** Row shape read by `searchVectorIndex`: a vector index entry joined with its chunk text. */
interface VectorRow {
  chunk_id: string;
  /** Text of the chunk, taken from the `chunks` table. */
  text: string;
  /** JSON-encoded array of vector components. */
  vector_json: string;
  /** Euclidean norm stored alongside the vector at indexing time. */
  vector_norm: number;
  source_uri: string | null;
  source_ref: string | null;
  revision: string | null;
  hash: string | null;
  /** Raw JSON metadata; may carry a `provenance` object. */
  metadata_json: string;
}
|
|
108
|
+
|
|
109
|
+
/**
 * Model reference used when the caller passes no model (or the aliases
 * `default` / `embedding`) and the config has no `embeddings.default_model`.
 */
export const DEFAULT_EMBEDDING_MODEL_REF = 'openai:text-embedding-3-small';
|
|
110
|
+
/** Vector length used when neither the call options nor the config specify `dimensions`. */
export const DEFAULT_EMBEDDING_DIMENSIONS = 1536;
|
|
111
|
+
|
|
112
|
+
/**
 * Reads the optional `embeddings` section from a workspace config.
 *
 * The section is not part of the `KnowledgeConfig` type as imported here, so
 * the config is viewed through a widened type to reach it.
 *
 * @param config Workspace configuration, if any.
 * @returns The `embeddings` section, or an empty object when the config or
 *   the section is missing.
 */
function embeddingConfig(config?: KnowledgeConfig) {
  const widened = config as KnowledgeConfig & {
    embeddings?: {
      default_model?: string;
      dimensions?: number;
      batch_size?: number;
      max_parallel_calls?: number;
    };
  } | undefined;
  const section = widened?.embeddings;
  return section ?? {};
}
|
|
122
|
+
|
|
123
|
+
/**
 * Builds a deterministic identifier of the form `<prefix>_<hash>`.
 *
 * @param prefix Label placed before the underscore.
 * @param value Text that is hashed; equal values always give equal ids.
 * @returns The prefix, an underscore, and the first 20 hex characters of the
 *   SHA-256 digest of `value`.
 */
function stableId(prefix: string, value: string): string {
  const hexDigest = createHash('sha256').update(value).digest('hex');
  const shortDigest = hexDigest.slice(0, 20);
  return [prefix, shortDigest].join('_');
}
|
|
126
|
+
|
|
127
|
+
/**
 * Leniently decodes a JSON string that is expected to hold an object.
 *
 * @param value Raw JSON text; may be null, undefined, or empty.
 * @returns The decoded object, or an empty object when the input is empty,
 *   is not valid JSON, or decodes to anything other than a non-array object.
 */
function parseJsonObject(value: string | null | undefined): Record<string, unknown> {
  if (!value) return {};

  let decoded: unknown;
  try {
    decoded = JSON.parse(value);
  } catch {
    // Malformed JSON is treated the same as "no metadata".
    return {};
  }

  const isPlainObject = decoded !== null && typeof decoded === 'object' && !Array.isArray(decoded);
  if (!isPlainObject) return {};
  return decoded as Record<string, unknown>;
}
|
|
136
|
+
|
|
137
|
+
/**
 * Looks up the first usable string among several metadata keys.
 *
 * @param metadata Decoded metadata object.
 * @param keys Keys to try, in priority order.
 * @returns The value of the first key holding a non-empty string, or null
 *   when no key qualifies.
 */
function metadataString(metadata: Record<string, unknown>, keys: string[]): string | null {
  const matchingKey = keys.find((name) => {
    const candidate = metadata[name];
    return typeof candidate === 'string' && candidate.length > 0;
  });
  if (matchingKey === undefined) return null;
  return metadata[matchingKey] as string;
}
|
|
144
|
+
|
|
145
|
+
/**
 * Looks up the first usable number among several metadata keys.
 *
 * @param metadata Decoded metadata object.
 * @param keys Keys to try, in priority order.
 * @returns The value of the first key holding a finite number (NaN and
 *   infinities are skipped), or null when no key qualifies.
 */
function metadataNumber(metadata: Record<string, unknown>, keys: string[]): number | null {
  for (const name of keys) {
    const candidate = metadata[name];
    if (typeof candidate !== 'number') continue;
    if (!Number.isFinite(candidate)) continue;
    return candidate;
  }
  return null;
}
|
|
152
|
+
|
|
153
|
+
/**
 * Computes the Euclidean (L2) norm of a vector.
 *
 * @param vector Vector components.
 * @returns The square root of the sum of squared components; 0 for an empty vector.
 */
function vectorNorm(vector: number[]): number {
  let sumOfSquares = 0;
  vector.forEach((component) => {
    sumOfSquares += component * component;
  });
  return Math.sqrt(sumOfSquares);
}
|
|
156
|
+
|
|
157
|
+
/**
 * Computes the cosine similarity of two vectors.
 *
 * @param a First vector; its norm is always computed here.
 * @param b Second vector.
 * @param bNorm Norm of `b`; callers holding a precomputed norm can pass it to
 *   skip the recomputation. Defaults to `vectorNorm(b)`.
 * @returns The dot product divided by the product of the norms, or 0 when
 *   either norm is 0. When the lengths differ, the dot product covers only
 *   the shared leading components.
 */
function cosineSimilarity(a: number[], b: number[], bNorm = vectorNorm(b)): number {
  const aNorm = vectorNorm(a);
  const degenerate = aNorm === 0 || bNorm === 0;
  if (degenerate) return 0;

  const overlap = Math.min(a.length, b.length);
  let dot = 0;
  let position = 0;
  while (position < overlap) {
    dot += a[position] * b[position];
    position += 1;
  }
  return dot / (aNorm * bNorm);
}
|
|
165
|
+
|
|
166
|
+
/**
 * Produces a reproducible pseudo-embedding for fake mode.
 *
 * The SHA-256 digest of the text is cycled to fill the requested number of
 * components; each byte is mapped from 0..255 onto -1..1 and rounded to six
 * decimal places.
 *
 * @param text Text to derive the vector from.
 * @param dimensions Number of components to generate.
 * @returns A vector that is identical for identical input.
 */
function deterministicVector(text: string, dimensions: number): number[] {
  const digest = createHash('sha256').update(text).digest();
  const positions = Array.from({ length: dimensions }, (_, index) => index);
  return positions.map((index) => {
    const unit = digest[index % digest.length] / 255;
    const centered = unit * 2 - 1;
    return Number(centered.toFixed(6));
  });
}
|
|
173
|
+
|
|
174
|
+
/**
 * Creates an OpenAI embedding model handle through `@ai-sdk/openai`.
 *
 * The provider object is probed for `embeddingModel`, `textEmbedding`, and
 * `textEmbeddingModel` in that order, and the first factory present is used.
 *
 * @param model Model id, for example `text-embedding-3-small`.
 * @param config Workspace configuration used for credential checks and provider settings.
 * @param env Environment map the API key is read from; defaults to `process.env`.
 * @returns The model object produced by the provider factory.
 * @throws Error when the provider exposes none of the known factories. The
 *   credential assertion is called first and presumably throws when the key
 *   is missing — confirm in `./providers`.
 */
async function openAiEmbeddingModel(model: string, config?: KnowledgeConfig, env: Record<string, string | undefined> = process.env): Promise<unknown> {
  assertProviderCredentials('openai', config, env);
  const settings = providerSettings(config, 'openai');
  const { createOpenAI } = await import('@ai-sdk/openai');

  type EmbeddingFactory = (modelId: string) => unknown;
  const provider = createOpenAI({
    apiKey: env[settings.api_key_env],
    baseURL: settings.base_url,
  }) as unknown as {
    embeddingModel?: EmbeddingFactory;
    textEmbedding?: EmbeddingFactory;
    textEmbeddingModel?: EmbeddingFactory;
  };

  const factoryNames = ['embeddingModel', 'textEmbedding', 'textEmbeddingModel'] as const;
  for (const factoryName of factoryNames) {
    const factory = provider[factoryName];
    // Invoke with the provider as receiver, exactly like a method call would.
    if (factory) return factory.call(provider, model);
  }
  throw new Error('OpenAI provider does not expose an embedding model factory.');
}
|
|
191
|
+
|
|
192
|
+
/**
 * Resolves the embedding model reference to use.
 *
 * @param modelRef Caller-supplied reference. Omitted or empty values and the
 *   aliases `default` / `embedding` select the configured default.
 * @param config Workspace configuration consulted for `embeddings.default_model`.
 * @returns The explicit reference when one was given; otherwise the
 *   configured default, or `DEFAULT_EMBEDDING_MODEL_REF` when none is configured.
 */
export function resolveEmbeddingModelRef(modelRef?: string, config?: KnowledgeConfig): string {
  if (modelRef && modelRef !== 'default' && modelRef !== 'embedding') {
    return modelRef;
  }
  const configuredDefault = embeddingConfig(config).default_model;
  return configuredDefault ?? DEFAULT_EMBEDDING_MODEL_REF;
}
|
|
198
|
+
|
|
199
|
+
/**
 * Embeds a batch of texts with the resolved embedding model.
 *
 * Only the `openai` provider is accepted. In fake mode no provider call is
 * made: vectors come from `deterministicVector`, and token usage is estimated
 * as 1.25 tokens per whitespace-separated word with a minimum of 1 per text.
 *
 * @param texts Texts to embed; result vectors are positionally aligned with them.
 * @param options Model, dimension, credential, and fake-mode settings.
 * @returns Provider, model, dimensions, vectors, and token usage.
 * @throws Error when the resolved provider is not `openai`.
 */
export async function embedTexts(texts: string[], options: EmbeddingRuntimeOptions = {}): Promise<EmbeddingVectorResult> {
  const parsed = parseModelRef(resolveEmbeddingModelRef(options.modelRef, options.config));
  if (parsed.provider !== 'openai') {
    throw new Error(`Embedding provider ${parsed.provider} is not supported yet. Use openai:text-embedding-3-small.`);
  }

  const configured = embeddingConfig(options.config);
  const dimensions = options.dimensions ?? configured.dimensions ?? DEFAULT_EMBEDDING_DIMENSIONS;

  if (options.fake) {
    let estimatedTokens = 0;
    for (const text of texts) {
      const wordCount = text.split(/\s+/).filter(Boolean).length;
      estimatedTokens += Math.max(1, Math.ceil(wordCount * 1.25));
    }
    return {
      provider: parsed.provider,
      model: parsed.model,
      dimensions,
      vectors: texts.map((text) => deterministicVector(text, dimensions)),
      usage: { input_tokens: estimatedTokens },
    };
  }

  // The SDK is loaded lazily so fake mode works without it being resolvable.
  const { embedMany } = await import('ai');
  const embeddingModel = await openAiEmbeddingModel(parsed.model, options.config, options.env);
  const response = await embedMany({
    model: embeddingModel as never,
    values: texts,
    maxParallelCalls: options.maxParallelCalls ?? configured.max_parallel_calls,
    providerOptions: {
      openai: {
        dimensions,
      },
    },
  });

  const vectors = response.embeddings as number[][];
  // Report the size the provider actually returned when at least one vector exists.
  const reportedDimensions = vectors[0]?.length ?? dimensions;
  return {
    provider: parsed.provider,
    model: parsed.model,
    dimensions: reportedDimensions,
    vectors,
    usage: { input_tokens: response.usage?.tokens ?? 0 },
  };
}
|
|
238
|
+
|
|
239
|
+
/**
 * Selects chunks that have no vector index entry yet for a provider/model pair.
 *
 * Chunks are LEFT JOINed to their source revision and source for provenance
 * columns, and anti-joined against `vector_index_entries`. Results are
 * ordered by chunk creation time, then ordinal.
 *
 * @param db Open knowledge database handle.
 * @param options Provider and model to check, the maximum number of rows, and
 *   an optional source revision id to restrict the selection to.
 * @returns Candidate rows, oldest first.
 */
function selectCandidateChunks(db: Database, options: {
  provider: AiProviderId;
  model: string;
  limit: number;
  sourceRevisionId?: string;
}): CandidateChunk[] {
  const baseQuery =
    `SELECT
      c.id,
      c.text,
      c.token_count,
      c.start_offset,
      c.end_offset,
      c.metadata_json,
      c.source_revision_id,
      sr.revision,
      sr.hash,
      s.uri AS source_uri,
      s.kind AS source_kind
    FROM chunks c
    LEFT JOIN source_revisions sr ON sr.id = c.source_revision_id
    LEFT JOIN sources s ON s.id = sr.source_id
    LEFT JOIN vector_index_entries v
      ON v.chunk_id = c.id AND v.provider = ? AND v.model = ?
    WHERE v.id IS NULL`;
  const suffix = `
    ORDER BY c.created_at ASC, c.ordinal ASC
    LIMIT ?`;

  // Bindings follow the placeholder order: provider, model, [revision], limit.
  const revisionFilter = options.sourceRevisionId ? ' AND c.source_revision_id = ?' : '';
  const bindings: Array<string | number> = [options.provider, options.model];
  if (options.sourceRevisionId) bindings.push(options.sourceRevisionId);
  bindings.push(options.limit);

  const statement = db.query<CandidateChunk, Array<string | number>>(`${baseQuery}${revisionFilter}${suffix}`);
  return statement.all(...bindings);
}
|
|
276
|
+
|
|
277
|
+
/**
 * Builds the provenance record for a candidate chunk.
 *
 * A `provenance` object already stored in the chunk metadata is returned
 * as-is. Otherwise one is assembled through `sourceProvenance`, preferring
 * the joined row columns and falling back to same-named metadata fields.
 *
 * @param row Candidate chunk with joined source columns.
 * @returns The stored or newly assembled provenance.
 */
function provenanceForChunk(row: CandidateChunk): KnowledgeProvenance {
  const metadata = parseJsonObject(row.metadata_json);

  const stored = metadata.provenance;
  const hasStoredProvenance = Boolean(stored) && typeof stored === 'object' && !Array.isArray(stored);
  if (hasStoredProvenance) return stored as KnowledgeProvenance;

  const textField = (key: string): string | null => metadataString(metadata, [key]);
  const numberField = (key: string): number | null => metadataNumber(metadata, [key]);

  return sourceProvenance({
    source_ref: textField('source_ref'),
    source_uri: row.source_uri ?? textField('source_uri'),
    source_kind: row.source_kind ?? textField('source_kind'),
    source_revision_id: row.source_revision_id,
    revision: row.revision ?? textField('revision'),
    hash: row.hash ?? textField('hash'),
    chunk_id: row.id,
    start_offset: row.start_offset ?? numberField('start_offset'),
    end_offset: row.end_offset ?? numberField('end_offset'),
    status: textField('status'),
    resolver: 'open-files-read-only',
  });
}
|
|
295
|
+
|
|
296
|
+
/**
 * Writes embeddings for a batch of chunks into `chunk_embeddings` and
 * `vector_index_entries` inside a single transaction.
 *
 * Both tables are upserted on `(chunk_id, provider, model)`, so re-indexing a
 * chunk overwrites its previous vector. Row ids are derived deterministically
 * from chunk id, provider, and model.
 *
 * @param db Open knowledge database handle.
 * @param rows Chunks that were embedded; `embedding.vectors[i]` belongs to `rows[i]`.
 * @param embedding Embedding result holding the vectors and model details.
 * @param now ISO timestamp stored as `created_at` / `updated_at` and as `embedded_at` in metadata.
 * @returns The number of chunks actually written. Rows without a matching
 *   vector are skipped and are not counted.
 */
function upsertVectors(db: Database, rows: CandidateChunk[], embedding: EmbeddingVectorResult, now: string): number {
  const insertEmbedding = db.prepare(`
    INSERT INTO chunk_embeddings (id, chunk_id, provider, model, dimensions, vector_json, created_at)
    VALUES (?, ?, ?, ?, ?, ?, ?)
    ON CONFLICT(chunk_id, provider, model) DO UPDATE SET
      dimensions = excluded.dimensions,
      vector_json = excluded.vector_json,
      created_at = excluded.created_at
  `);
  const insertVector = db.prepare(`
    INSERT INTO vector_index_entries (
      id, chunk_id, source_revision_id, provider, model, dimensions, vector_json, vector_norm,
      source_uri, source_ref, revision, hash, start_offset, end_offset, token_count, status,
      metadata_json, created_at, updated_at
    )
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ON CONFLICT(chunk_id, provider, model) DO UPDATE SET
      source_revision_id = excluded.source_revision_id,
      dimensions = excluded.dimensions,
      vector_json = excluded.vector_json,
      vector_norm = excluded.vector_norm,
      source_uri = excluded.source_uri,
      source_ref = excluded.source_ref,
      revision = excluded.revision,
      hash = excluded.hash,
      start_offset = excluded.start_offset,
      end_offset = excluded.end_offset,
      token_count = excluded.token_count,
      status = excluded.status,
      metadata_json = excluded.metadata_json,
      updated_at = excluded.updated_at
  `);

  // Counts rows that were really written, so the caller's "upserted" figures
  // stay truthful when the provider returned fewer vectors than chunks.
  let written = 0;

  const write = db.transaction(() => {
    for (let index = 0; index < rows.length; index += 1) {
      const row = rows[index];
      const vector = embedding.vectors[index];
      if (!vector) continue;

      const metadata = parseJsonObject(row.metadata_json);
      const provenance = provenanceForChunk(row);
      // Provenance wins; row columns and raw metadata are fallbacks.
      const sourceRef = provenance.source_ref ?? metadataString(metadata, ['source_ref']);
      const sourceUri = provenance.source_uri ?? row.source_uri ?? metadataString(metadata, ['source_uri']);
      const revision = provenance.revision ?? row.revision ?? metadataString(metadata, ['revision']);
      const hash = provenance.hash ?? row.hash ?? metadataString(metadata, ['hash']);
      const status = provenance.status ?? metadataString(metadata, ['status']) ?? 'active';
      const vectorJson = JSON.stringify(vector);
      // NUL separators keep the hashed parts unambiguous.
      const idSeed = `${row.id}\u0000${embedding.provider}\u0000${embedding.model}`;

      insertEmbedding.run(
        stableId('emb', idSeed),
        row.id,
        embedding.provider,
        embedding.model,
        embedding.dimensions,
        vectorJson,
        now,
      );
      insertVector.run(
        stableId('vec', idSeed),
        row.id,
        row.source_revision_id,
        embedding.provider,
        embedding.model,
        embedding.dimensions,
        vectorJson,
        vectorNorm(vector),
        sourceUri,
        sourceRef,
        revision,
        hash,
        provenance.start_offset,
        provenance.end_offset,
        row.token_count,
        status,
        JSON.stringify({
          ...metadata,
          provenance,
          embedded_at: now,
        }),
        now,
        now,
      );
      written += 1;
    }
  });
  write();
  return written;
}
|
|
381
|
+
|
|
382
|
+
/**
 * Embeds chunks that are not yet in the vector index and stores the results.
 *
 * Steps: migrate the database, select up to `limit` unindexed chunks for the
 * resolved provider/model, embed their text, then upsert the vectors. The
 * read connection is closed before the embedding call and a separate
 * connection is opened for the write, so no handle is held open while
 * waiting on the provider.
 *
 * @param options Database path, model settings, and selection limits.
 * @returns Counts for the run plus token usage; all counts are zero when
 *   nothing needed indexing.
 * @throws Error when the resolved provider is not `openai`.
 */
export async function indexKnowledgeEmbeddings(options: EmbeddingIndexOptions): Promise<EmbeddingIndexResult> {
  const modelRef = resolveEmbeddingModelRef(options.modelRef, options.config);
  const parsed = parseModelRef(modelRef);
  if (parsed.provider !== 'openai') throw new Error(`Embedding provider ${parsed.provider} is not supported yet.`);
  const now = (options.now ?? new Date()).toISOString();

  // The limit is bound to a SQL LIMIT placeholder, which must be an integer:
  // fall back to the default for NaN, and drop any fractional part after
  // clamping to 1..1000.
  const requestedLimit = options.limit ?? 100;
  const limit = Number.isNaN(requestedLimit)
    ? 100
    : Math.trunc(Math.max(1, Math.min(requestedLimit, 1000)));

  migrateKnowledgeDb(options.dbPath);
  const readDb = openKnowledgeDb(options.dbPath);
  let rows: CandidateChunk[];
  try {
    rows = selectCandidateChunks(readDb, {
      provider: parsed.provider,
      model: parsed.model,
      limit,
      sourceRevisionId: options.sourceRevisionId,
    });
  } finally {
    readDb.close();
  }

  if (rows.length === 0) {
    return {
      provider: parsed.provider,
      model: parsed.model,
      dimensions: options.dimensions ?? embeddingConfig(options.config).dimensions ?? DEFAULT_EMBEDDING_DIMENSIONS,
      chunks_seen: 0,
      chunks_embedded: 0,
      embeddings_upserted: 0,
      vector_entries_upserted: 0,
      usage: { input_tokens: 0 },
    };
  }

  const embedding = await embedTexts(rows.map((row) => row.text), options);
  // Count only chunks that received a vector; the provider may return fewer
  // vectors than texts.
  const chunksEmbedded = rows.reduce(
    (count, _row, index) => (embedding.vectors[index] ? count + 1 : count),
    0,
  );

  const writeDb = openKnowledgeDb(options.dbPath);
  try {
    const upserted = upsertVectors(writeDb, rows, embedding, now);
    return {
      provider: embedding.provider,
      model: embedding.model,
      dimensions: embedding.dimensions,
      chunks_seen: rows.length,
      chunks_embedded: chunksEmbedded,
      embeddings_upserted: upserted,
      vector_entries_upserted: upserted,
      usage: embedding.usage,
    };
  } finally {
    writeDb.close();
  }
}
|
|
433
|
+
|
|
434
|
+
/**
 * Reports how many embeddings and vector index entries the database holds.
 *
 * The database is migrated first, then queried for the total row counts of
 * `chunk_embeddings` and `vector_index_entries` and for a per
 * (provider, model, dimensions) breakdown of the vector index.
 *
 * @param dbPath Path of the knowledge database.
 * @returns Totals plus one breakdown entry per group, ordered by provider and model.
 */
export function embeddingIndexStatus(dbPath: string): EmbeddingStatusResult {
  type IndexGroup = {
    provider: string;
    model: string;
    dimensions: number;
    entries: number;
    updated_at: string | null;
  };

  migrateKnowledgeDb(dbPath);
  const db = openKnowledgeDb(dbPath);
  try {
    const embeddingCount = db.query<{ n: number }, []>('SELECT COUNT(*) AS n FROM chunk_embeddings').get();
    const vectorEntryCount = db.query<{ n: number }, []>('SELECT COUNT(*) AS n FROM vector_index_entries').get();
    const groups = db.query<IndexGroup, []>(
      `SELECT provider, model, dimensions, COUNT(*) AS entries, MAX(updated_at) AS updated_at
      FROM vector_index_entries
      GROUP BY provider, model, dimensions
      ORDER BY provider, model`,
    ).all();

    const status: EmbeddingStatusResult = {
      total_embeddings: embeddingCount?.n ?? 0,
      total_vector_entries: vectorEntryCount?.n ?? 0,
      indexes: groups,
    };
    return status;
  } finally {
    db.close();
  }
}
|
|
461
|
+
|
|
462
|
+
/**
 * Runs a brute-force semantic search over the local vector index.
 *
 * The query is embedded with the resolved model, every active index entry
 * for that provider/model is loaded, and entries are ranked by cosine
 * similarity against the query vector.
 *
 * Entries are skipped, not scored, when their stored vector cannot be parsed
 * or has a different length than the query vector. A cosine score over
 * vectors of different dimensions is meaningless, and one corrupt row should
 * not fail the whole search.
 *
 * @param options Database path, query text, result limit, and model settings.
 * @returns The top `limit` chunks by descending score, with provenance when
 *   it is stored in the entry metadata.
 * @throws Error when embedding the query fails (for example an unsupported provider).
 */
export async function searchVectorIndex(options: EmbeddingSearchOptions): Promise<SemanticSearchResult> {
  const modelRef = resolveEmbeddingModelRef(options.modelRef, options.config);
  const parsed = parseModelRef(modelRef);

  // Default 10, clamped to 1..100; NaN falls back to the default and any
  // fractional part is dropped.
  const requestedLimit = options.limit ?? 10;
  const limit = Number.isNaN(requestedLimit)
    ? 10
    : Math.trunc(Math.max(1, Math.min(requestedLimit, 100)));

  const embedded = await embedTexts([options.query], options);
  const queryVector = embedded.vectors[0] ?? [];

  migrateKnowledgeDb(options.dbPath);
  const db = openKnowledgeDb(options.dbPath);
  try {
    const rows = db.query<VectorRow, [string, string]>(
      `SELECT
        v.chunk_id,
        c.text,
        v.vector_json,
        v.vector_norm,
        v.source_uri,
        v.source_ref,
        v.revision,
        v.hash,
        v.metadata_json
      FROM vector_index_entries v
      JOIN chunks c ON c.id = v.chunk_id
      WHERE v.provider = ? AND v.model = ? AND v.status = 'active'`,
    ).all(parsed.provider, parsed.model);

    const scored: SemanticSearchResult['results'] = [];
    for (const row of rows) {
      let stored: unknown;
      try {
        stored = JSON.parse(row.vector_json);
      } catch {
        // Corrupt vector payload: leave this entry out of the ranking.
        continue;
      }
      // Entries indexed with another dimension setting cannot be compared.
      if (!Array.isArray(stored) || stored.length !== queryVector.length) continue;

      const metadata = parseJsonObject(row.metadata_json);
      const storedProvenance = metadata.provenance;
      const provenance = storedProvenance && typeof storedProvenance === 'object' && !Array.isArray(storedProvenance)
        ? storedProvenance as KnowledgeProvenance
        : null;

      scored.push({
        chunk_id: row.chunk_id,
        // The stored norm avoids recomputing it for every row.
        score: cosineSimilarity(queryVector, stored as number[], row.vector_norm),
        text: row.text,
        source_uri: row.source_uri,
        source_ref: row.source_ref,
        revision: row.revision,
        hash: row.hash,
        provenance,
      });
    }
    scored.sort((a, b) => b.score - a.score);

    return {
      provider: parsed.provider,
      model: parsed.model,
      dimensions: embedded.dimensions,
      query: options.query,
      results: scored.slice(0, limit),
    };
  } finally {
    db.close();
  }
}
|
package/src/knowledge-db.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { Database } from 'bun:sqlite';
|
|
2
2
|
import { ensureParentDir } from './workspace';
|
|
3
3
|
|
|
4
|
-
export const CURRENT_SCHEMA_VERSION =
|
|
4
|
+
export const CURRENT_SCHEMA_VERSION = 4;
|
|
5
5
|
|
|
6
6
|
export interface KnowledgeDbStats {
|
|
7
7
|
schema_version: number;
|
|
@@ -17,6 +17,8 @@ export interface KnowledgeDbStats {
|
|
|
17
17
|
audit_events: number;
|
|
18
18
|
approval_gates: number;
|
|
19
19
|
storage_objects: number;
|
|
20
|
+
embeddings: number;
|
|
21
|
+
vector_entries: number;
|
|
20
22
|
}
|
|
21
23
|
|
|
22
24
|
const MIGRATION_1 = `
|
|
@@ -236,6 +238,39 @@ INSERT OR IGNORE INTO schema_versions(version, applied_at)
|
|
|
236
238
|
VALUES (3, datetime('now'));
|
|
237
239
|
`;
|
|
238
240
|
|
|
241
|
+
const MIGRATION_4 = `
|
|
242
|
+
CREATE TABLE IF NOT EXISTS vector_index_entries (
|
|
243
|
+
id TEXT PRIMARY KEY,
|
|
244
|
+
chunk_id TEXT NOT NULL REFERENCES chunks(id) ON DELETE CASCADE,
|
|
245
|
+
source_revision_id TEXT REFERENCES source_revisions(id) ON DELETE CASCADE,
|
|
246
|
+
provider TEXT NOT NULL,
|
|
247
|
+
model TEXT NOT NULL,
|
|
248
|
+
dimensions INTEGER NOT NULL,
|
|
249
|
+
vector_json TEXT NOT NULL,
|
|
250
|
+
vector_norm REAL NOT NULL,
|
|
251
|
+
source_uri TEXT,
|
|
252
|
+
source_ref TEXT,
|
|
253
|
+
revision TEXT,
|
|
254
|
+
hash TEXT,
|
|
255
|
+
start_offset INTEGER,
|
|
256
|
+
end_offset INTEGER,
|
|
257
|
+
token_count INTEGER,
|
|
258
|
+
status TEXT NOT NULL DEFAULT 'active',
|
|
259
|
+
metadata_json TEXT NOT NULL DEFAULT '{}',
|
|
260
|
+
created_at TEXT NOT NULL,
|
|
261
|
+
updated_at TEXT NOT NULL,
|
|
262
|
+
UNIQUE(chunk_id, provider, model)
|
|
263
|
+
);
|
|
264
|
+
|
|
265
|
+
CREATE INDEX IF NOT EXISTS idx_vector_index_provider_model ON vector_index_entries(provider, model);
|
|
266
|
+
CREATE INDEX IF NOT EXISTS idx_vector_index_source_revision ON vector_index_entries(source_revision_id);
|
|
267
|
+
CREATE INDEX IF NOT EXISTS idx_vector_index_source_uri ON vector_index_entries(source_uri);
|
|
268
|
+
CREATE INDEX IF NOT EXISTS idx_vector_index_status ON vector_index_entries(status);
|
|
269
|
+
|
|
270
|
+
INSERT OR IGNORE INTO schema_versions(version, applied_at)
|
|
271
|
+
VALUES (4, datetime('now'));
|
|
272
|
+
`;
|
|
273
|
+
|
|
239
274
|
export function openKnowledgeDb(path: string): Database {
|
|
240
275
|
ensureParentDir(path);
|
|
241
276
|
const db = new Database(path);
|
|
@@ -250,6 +285,7 @@ export function migrateKnowledgeDb(path: string): { path: string; schema_version
|
|
|
250
285
|
db.exec(MIGRATION_1);
|
|
251
286
|
if (getSchemaVersion(db) < 2) db.exec(MIGRATION_2);
|
|
252
287
|
if (getSchemaVersion(db) < 3) db.exec(MIGRATION_3);
|
|
288
|
+
if (getSchemaVersion(db) < 4) db.exec(MIGRATION_4);
|
|
253
289
|
return { path, schema_version: getSchemaVersion(db) };
|
|
254
290
|
} finally {
|
|
255
291
|
db.close();
|
|
@@ -283,6 +319,8 @@ export function getKnowledgeDbStats(path: string): KnowledgeDbStats {
|
|
|
283
319
|
audit_events: count(db, 'audit_events'),
|
|
284
320
|
approval_gates: count(db, 'approval_gates'),
|
|
285
321
|
storage_objects: count(db, 'storage_objects'),
|
|
322
|
+
embeddings: count(db, 'chunk_embeddings'),
|
|
323
|
+
vector_entries: count(db, 'vector_index_entries'),
|
|
286
324
|
};
|
|
287
325
|
} finally {
|
|
288
326
|
db.close();
|
package/src/mcp.js
CHANGED
|
@@ -131,6 +131,44 @@ export function buildServer() {
|
|
|
131
131
|
return jsonText({ ok: true, models: service.modelRegistry() });
|
|
132
132
|
});
|
|
133
133
|
|
|
134
|
+
registerTool(server, 'ok_embeddings_status', 'Embedding index status', 'Inspect local embedding/vector index counts by provider and model', {
|
|
135
|
+
scope: scopeField,
|
|
136
|
+
}, async ({ scope }) => {
|
|
137
|
+
const service = createKnowledgeService({ scope });
|
|
138
|
+
return jsonText({ ok: true, ...service.embeddingStatus() });
|
|
139
|
+
});
|
|
140
|
+
|
|
141
|
+
registerTool(server, 'ok_embeddings_index', 'Index embeddings', 'Embed unindexed knowledge chunks into the local vector index', {
|
|
142
|
+
scope: scopeField,
|
|
143
|
+
limit: z.number().optional().describe('Maximum chunks to embed'),
|
|
144
|
+
model: z.string().optional().describe('Embedding model ref, default openai:text-embedding-3-small'),
|
|
145
|
+
dimensions: z.number().optional().describe('Embedding dimensions for deterministic fake mode'),
|
|
146
|
+
fake: z.boolean().optional().describe('Use deterministic fake embeddings for local tests'),
|
|
147
|
+
}, async ({ scope, limit, model, dimensions, fake }) => {
|
|
148
|
+
const service = createKnowledgeService({ scope });
|
|
149
|
+
try {
|
|
150
|
+
return jsonText({ ok: true, ...await service.indexEmbeddings({ limit, modelRef: model, dimensions, fake }) });
|
|
151
|
+
} catch (error) {
|
|
152
|
+
return errorText(error instanceof Error ? error.message : String(error));
|
|
153
|
+
}
|
|
154
|
+
});
|
|
155
|
+
|
|
156
|
+
registerTool(server, 'ok_semantic_search', 'Semantic search', 'Search the local vector index and return cited chunks with provenance', {
|
|
157
|
+
scope: scopeField,
|
|
158
|
+
query: z.string().describe('Semantic query'),
|
|
159
|
+
limit: z.number().optional().describe('Maximum results'),
|
|
160
|
+
model: z.string().optional().describe('Embedding model ref, default openai:text-embedding-3-small'),
|
|
161
|
+
dimensions: z.number().optional().describe('Embedding dimensions for deterministic fake mode'),
|
|
162
|
+
fake: z.boolean().optional().describe('Use deterministic fake embeddings for local tests'),
|
|
163
|
+
}, async ({ scope, query, limit, model, dimensions, fake }) => {
|
|
164
|
+
const service = createKnowledgeService({ scope });
|
|
165
|
+
try {
|
|
166
|
+
return jsonText({ ok: true, ...await service.semanticSearch({ query, limit, modelRef: model, dimensions, fake }) });
|
|
167
|
+
} catch (error) {
|
|
168
|
+
return errorText(error instanceof Error ? error.message : String(error));
|
|
169
|
+
}
|
|
170
|
+
});
|
|
171
|
+
|
|
134
172
|
registerTool(server, 'ok_add', 'Add a knowledge item', 'Add a new item to the knowledge store', {
|
|
135
173
|
title: z.string().describe('Item title'),
|
|
136
174
|
content: z.string().describe('Item content/body'),
|
package/src/outbox-consume.ts
CHANGED
|
@@ -30,6 +30,7 @@ export interface OutboxConsumeResult {
|
|
|
30
30
|
deleted_sources: number;
|
|
31
31
|
moved_sources: number;
|
|
32
32
|
permission_updates: number;
|
|
33
|
+
vector_entries_deleted: number;
|
|
33
34
|
}
|
|
34
35
|
|
|
35
36
|
interface NormalizedOutboxEvent {
|
|
@@ -289,12 +290,16 @@ function revisionIdsForEvent(db: Database, sourceId: string, event: NormalizedOu
|
|
|
289
290
|
).all(sourceId).map((row) => row.id);
|
|
290
291
|
}
|
|
291
292
|
|
|
292
|
-
function invalidateRevision(db: Database, revisionId: string): { chunksDeleted: number; embeddingsDeleted: number } {
|
|
293
|
+
function invalidateRevision(db: Database, revisionId: string): { chunksDeleted: number; embeddingsDeleted: number; vectorEntriesDeleted: number } {
|
|
293
294
|
const chunks = db.query<{ id: string }, [string]>('SELECT id FROM chunks WHERE source_revision_id = ?').all(revisionId);
|
|
294
295
|
let embeddingsDeleted = 0;
|
|
296
|
+
let vectorEntriesDeleted = 0;
|
|
295
297
|
for (const chunk of chunks) {
|
|
296
298
|
const row = db.query<{ n: number }, [string]>('SELECT COUNT(*) AS n FROM chunk_embeddings WHERE chunk_id = ?').get(chunk.id);
|
|
297
299
|
embeddingsDeleted += row?.n ?? 0;
|
|
300
|
+
const vectorRow = db.query<{ n: number }, [string]>('SELECT COUNT(*) AS n FROM vector_index_entries WHERE chunk_id = ?').get(chunk.id);
|
|
301
|
+
vectorEntriesDeleted += vectorRow?.n ?? 0;
|
|
302
|
+
db.run('DELETE FROM vector_index_entries WHERE chunk_id = ?', [chunk.id]);
|
|
298
303
|
db.run('DELETE FROM chunk_embeddings WHERE chunk_id = ?', [chunk.id]);
|
|
299
304
|
db.run('DELETE FROM chunks_fts WHERE chunk_id = ?', [chunk.id]);
|
|
300
305
|
}
|
|
@@ -304,7 +309,7 @@ function invalidateRevision(db: Database, revisionId: string): { chunksDeleted:
|
|
|
304
309
|
'UPDATE source_revisions SET metadata_json = ? WHERE id = ?',
|
|
305
310
|
[mergeJson(revision?.metadata_json, { reindex_required: true, invalidated_at: new Date().toISOString() }), revisionId],
|
|
306
311
|
);
|
|
307
|
-
return { chunksDeleted: chunks.length, embeddingsDeleted };
|
|
312
|
+
return { chunksDeleted: chunks.length, embeddingsDeleted, vectorEntriesDeleted };
|
|
308
313
|
}
|
|
309
314
|
|
|
310
315
|
function isDeleteEvent(eventType: string, status: string | null): boolean {
|
|
@@ -349,6 +354,7 @@ export async function consumeOpenFilesOutbox(options: OutboxConsumeOptions): Pro
|
|
|
349
354
|
const revisionsTouched = new Set<string>();
|
|
350
355
|
let chunksDeleted = 0;
|
|
351
356
|
let embeddingsDeleted = 0;
|
|
357
|
+
let vectorEntriesDeleted = 0;
|
|
352
358
|
let staleRevisions = 0;
|
|
353
359
|
let deletedSources = 0;
|
|
354
360
|
let movedSources = 0;
|
|
@@ -376,6 +382,7 @@ export async function consumeOpenFilesOutbox(options: OutboxConsumeOptions): Pro
|
|
|
376
382
|
const invalidation = invalidateRevision(db, revisionId);
|
|
377
383
|
chunksDeleted += invalidation.chunksDeleted;
|
|
378
384
|
embeddingsDeleted += invalidation.embeddingsDeleted;
|
|
385
|
+
vectorEntriesDeleted += invalidation.vectorEntriesDeleted;
|
|
379
386
|
staleRevisions += 1;
|
|
380
387
|
}
|
|
381
388
|
|
|
@@ -429,6 +436,7 @@ export async function consumeOpenFilesOutbox(options: OutboxConsumeOptions): Pro
|
|
|
429
436
|
revisions: revisionsTouched.size,
|
|
430
437
|
chunks_deleted: chunksDeleted,
|
|
431
438
|
embeddings_deleted: embeddingsDeleted,
|
|
439
|
+
vector_entries_deleted: vectorEntriesDeleted,
|
|
432
440
|
},
|
|
433
441
|
created_at: now,
|
|
434
442
|
});
|
|
@@ -442,6 +450,7 @@ export async function consumeOpenFilesOutbox(options: OutboxConsumeOptions): Pro
|
|
|
442
450
|
revisions_touched: revisionsTouched.size,
|
|
443
451
|
chunks_deleted: chunksDeleted,
|
|
444
452
|
embeddings_deleted: embeddingsDeleted,
|
|
453
|
+
vector_entries_deleted: vectorEntriesDeleted,
|
|
445
454
|
stale_revisions: staleRevisions,
|
|
446
455
|
deleted_sources: deletedSources,
|
|
447
456
|
moved_sources: movedSources,
|