@hasna/knowledge 0.2.13 → 0.2.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -30,6 +30,7 @@ export interface OutboxConsumeResult {
30
30
  deleted_sources: number;
31
31
  moved_sources: number;
32
32
  permission_updates: number;
33
+ vector_entries_deleted: number;
33
34
  }
34
35
 
35
36
  interface NormalizedOutboxEvent {
@@ -289,12 +290,16 @@ function revisionIdsForEvent(db: Database, sourceId: string, event: NormalizedOu
289
290
  ).all(sourceId).map((row) => row.id);
290
291
  }
291
292
 
292
- function invalidateRevision(db: Database, revisionId: string): { chunksDeleted: number; embeddingsDeleted: number } {
293
+ function invalidateRevision(db: Database, revisionId: string): { chunksDeleted: number; embeddingsDeleted: number; vectorEntriesDeleted: number } {
293
294
  const chunks = db.query<{ id: string }, [string]>('SELECT id FROM chunks WHERE source_revision_id = ?').all(revisionId);
294
295
  let embeddingsDeleted = 0;
296
+ let vectorEntriesDeleted = 0;
295
297
  for (const chunk of chunks) {
296
298
  const row = db.query<{ n: number }, [string]>('SELECT COUNT(*) AS n FROM chunk_embeddings WHERE chunk_id = ?').get(chunk.id);
297
299
  embeddingsDeleted += row?.n ?? 0;
300
+ const vectorRow = db.query<{ n: number }, [string]>('SELECT COUNT(*) AS n FROM vector_index_entries WHERE chunk_id = ?').get(chunk.id);
301
+ vectorEntriesDeleted += vectorRow?.n ?? 0;
302
+ db.run('DELETE FROM vector_index_entries WHERE chunk_id = ?', [chunk.id]);
298
303
  db.run('DELETE FROM chunk_embeddings WHERE chunk_id = ?', [chunk.id]);
299
304
  db.run('DELETE FROM chunks_fts WHERE chunk_id = ?', [chunk.id]);
300
305
  }
@@ -304,7 +309,7 @@ function invalidateRevision(db: Database, revisionId: string): { chunksDeleted:
304
309
  'UPDATE source_revisions SET metadata_json = ? WHERE id = ?',
305
310
  [mergeJson(revision?.metadata_json, { reindex_required: true, invalidated_at: new Date().toISOString() }), revisionId],
306
311
  );
307
- return { chunksDeleted: chunks.length, embeddingsDeleted };
312
+ return { chunksDeleted: chunks.length, embeddingsDeleted, vectorEntriesDeleted };
308
313
  }
309
314
 
310
315
  function isDeleteEvent(eventType: string, status: string | null): boolean {
@@ -349,6 +354,7 @@ export async function consumeOpenFilesOutbox(options: OutboxConsumeOptions): Pro
349
354
  const revisionsTouched = new Set<string>();
350
355
  let chunksDeleted = 0;
351
356
  let embeddingsDeleted = 0;
357
+ let vectorEntriesDeleted = 0;
352
358
  let staleRevisions = 0;
353
359
  let deletedSources = 0;
354
360
  let movedSources = 0;
@@ -376,6 +382,7 @@ export async function consumeOpenFilesOutbox(options: OutboxConsumeOptions): Pro
376
382
  const invalidation = invalidateRevision(db, revisionId);
377
383
  chunksDeleted += invalidation.chunksDeleted;
378
384
  embeddingsDeleted += invalidation.embeddingsDeleted;
385
+ vectorEntriesDeleted += invalidation.vectorEntriesDeleted;
379
386
  staleRevisions += 1;
380
387
  }
381
388
 
@@ -429,6 +436,7 @@ export async function consumeOpenFilesOutbox(options: OutboxConsumeOptions): Pro
429
436
  revisions: revisionsTouched.size,
430
437
  chunks_deleted: chunksDeleted,
431
438
  embeddings_deleted: embeddingsDeleted,
439
+ vector_entries_deleted: vectorEntriesDeleted,
432
440
  },
433
441
  created_at: now,
434
442
  });
@@ -442,6 +450,7 @@ export async function consumeOpenFilesOutbox(options: OutboxConsumeOptions): Pro
442
450
  revisions_touched: revisionsTouched.size,
443
451
  chunks_deleted: chunksDeleted,
444
452
  embeddings_deleted: embeddingsDeleted,
453
+ vector_entries_deleted: vectorEntriesDeleted,
445
454
  stale_revisions: staleRevisions,
446
455
  deleted_sources: deletedSources,
447
456
  moved_sources: movedSources,
package/src/search.ts ADDED
@@ -0,0 +1,510 @@
1
+ import type { Database } from 'bun:sqlite';
2
+ import { migrateKnowledgeDb, openKnowledgeDb } from './knowledge-db';
3
+ import { searchVectorIndex, type EmbeddingRuntimeOptions } from './embeddings';
4
+ import { sourceProvenance, type GeneratedArtifactProvenance, type KnowledgeProvenance } from './provenance';
5
+ import type { KnowledgeConfig } from './workspace';
6
+
7
/** Category of an entry returned by hybrid search. */
export type SearchResultKind =
  | 'source_chunk'
  | 'wiki_chunk'
  | 'wiki_page'
  | 'knowledge_index';

/** Provenance payload that may be attached to a search entry. */
export type SearchProvenance = KnowledgeProvenance | GeneratedArtifactProvenance;
9
+
10
/** Options accepted by `hybridSearch`; the embedding runtime options are inherited. */
export interface HybridSearchOptions extends EmbeddingRuntimeOptions {
  /** Knowledge database location handed to the migrate/open helpers. */
  dbPath: string;
  /** Raw query text; it is trimmed and must not be empty afterwards. */
  query: string;
  /** Maximum number of entries to return (defaults to 10, clamped to 1..100). */
  limit?: number;
  /** Pass `true` to request the semantic (vector) pass. */
  semantic?: boolean;
  /** Knowledge configuration forwarded to the vector search. */
  config?: KnowledgeConfig;
}
17
+
18
/** Envelope returned by `hybridSearch`. */
export interface HybridSearchResult {
  /** The trimmed query that was executed. */
  query: string;
  /** The effective result limit after clamping. */
  limit: number;
  /** Which passes were requested; keyword and catalog always run. */
  mode: {
    keyword: true;
    catalog: true;
    semantic: boolean;
  };
  /** Provider reported by the vector search, or null when it did not complete. */
  semantic_provider: string | null;
  /** Model reported by the vector search, or null when it did not complete. */
  semantic_model: string | null;
  /** Dimensions reported by the vector search, or null when it did not complete. */
  semantic_dimensions: number | null;
  /** Row counts per pass plus the number of entries actually returned. */
  counts: {
    keyword_results: number;
    catalog_results: number;
    semantic_results: number;
    merged_results: number;
  };
  /** Non-fatal problems, e.g. a `semantic_search_failed: ...` message. */
  warnings: string[];
  /** Merged entries, best first, truncated to `limit`. */
  results: HybridSearchEntry[];
}
38
+
39
/** One merged search hit. */
export interface HybridSearchEntry {
  /** What kind of record this hit refers to. */
  kind: SearchResultKind;
  /** Identifier of the chunk, wiki page or knowledge index. */
  id: string;
  title: string | null;
  /** Chunk text; null for catalog-only hits. */
  text: string | null;
  /** Combined score derived from `scores` and the citation boost. */
  score: number;
  /** Per-pass scores; a key is absent when that pass did not match. */
  scores: {
    keyword?: number;
    semantic?: number;
    catalog?: number;
  };
  /** Originating source, when one is known. */
  source: {
    uri: string | null;
    ref: string | null;
    kind: string | null;
    revision: string | null;
    hash: string | null;
  } | null;
  /** Chunk-level citation; null for catalog hits. */
  citation: {
    chunk_id: string | null;
    start_offset: number | null;
    end_offset: number | null;
  } | null;
  /** Generated artifact (wiki page or index) backing the hit, if any. */
  artifact: {
    uri: string | null;
    path: string | null;
    hash: string | null;
    shard_key: string | null;
  } | null;
  provenance: SearchProvenance | null;
  /** Tags naming the passes that produced the hit, e.g. `keyword_match`. */
  reasons: string[];
}
71
+
72
/** Row shape selected by the chunk full-text query in `selectFtsChunks`. */
interface FtsChunkRow {
  // Columns taken from chunks_fts / chunks.
  chunk_id: string;
  chunk_kind: string;
  wiki_page_id: string | null;
  text: string;
  token_count: number | null;
  start_offset: number | null;
  end_offset: number | null;
  chunk_metadata_json: string;
  source_revision_id: string | null;
  // Columns from the left-joined source_revisions / sources tables.
  revision: string | null;
  hash: string | null;
  source_uri: string | null;
  source_kind: string | null;
  source_title: string | null;
  // Columns from the left-joined wiki_pages table.
  wiki_path: string | null;
  wiki_title: string | null;
  wiki_artifact_uri: string | null;
  wiki_content_hash: string | null;
  wiki_status: string | null;
  wiki_metadata_json: string | null;
  /** Value of `bm25(chunks_fts)` for the row. */
  rank: number;
}
95
+
96
/** Row shape selected from `wiki_pages` by `selectWikiPages`. */
interface WikiPageRow {
  id: string;
  path: string;
  title: string;
  artifact_uri: string | null;
  content_hash: string | null;
  status: string;
  /** JSON text; parsed with `parseJsonObject` when building a result. */
  metadata_json: string;
}
105
+
106
/** Row shape selected from `knowledge_indexes` by `selectKnowledgeIndexes`. */
interface IndexRow {
  id: string;
  kind: string;
  name: string;
  artifact_uri: string | null;
  shard_key: string | null;
  /** JSON text; parsed with `parseJsonObject` when building a result. */
  metadata_json: string;
}
114
+
115
/**
 * Decodes a JSON string into a plain object.
 *
 * @param value JSON text, or null/undefined/empty.
 * @returns The decoded object, or `{}` when the input is missing, is not valid
 *   JSON, or decodes to anything other than a non-array object.
 */
function parseJsonObject(value: string | null | undefined): Record<string, unknown> {
  if (value === null || value === undefined || value === '') return {};
  let decoded: unknown;
  try {
    decoded = JSON.parse(value);
  } catch {
    // Malformed JSON is treated the same as "no metadata".
    return {};
  }
  const isPlainObject = typeof decoded === 'object' && decoded !== null && !Array.isArray(decoded);
  return isPlainObject ? (decoded as Record<string, unknown>) : {};
}
124
+
125
/**
 * Looks up the first non-empty string stored under any of the given keys.
 *
 * @param metadata Parsed metadata object.
 * @param keys Candidate keys, checked in order.
 * @returns The first non-empty string value, or null when none qualifies.
 */
function metadataString(metadata: Record<string, unknown>, keys: string[]): string | null {
  const hit = keys
    .map((key) => metadata[key])
    .find((candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0);
  return hit ?? null;
}
132
+
133
/**
 * Looks up the first finite number stored under any of the given keys.
 *
 * @param metadata Parsed metadata object.
 * @param keys Candidate keys, checked in order.
 * @returns The first finite numeric value (0 included), or null when none qualifies.
 */
function metadataNumber(metadata: Record<string, unknown>, keys: string[]): number | null {
  const hit = keys
    .map((key) => metadata[key])
    .find((candidate): candidate is number => typeof candidate === 'number' && Number.isFinite(candidate));
  return hit ?? null;
}
140
+
141
/** Returns the distinct values of `values`, keeping first-occurrence order. */
function unique<T>(values: T[]): T[] {
  return [...new Set(values)];
}
144
+
145
/**
 * Splits a query into search terms.
 *
 * The text is NFKC-normalised and lower-cased, then cut into runs of letters,
 * digits and underscores. Duplicates are dropped and at most 16 terms are kept.
 */
function queryTerms(query: string): string[] {
  const folded = query.normalize('NFKC').toLowerCase();
  const tokens = folded.match(/[\p{L}\p{N}_]+/gu);
  if (tokens === null) return [];
  const nonEmpty = tokens.filter((token) => token.length > 0);
  return unique(nonEmpty).slice(0, 16);
}
152
+
153
/**
 * Builds the full-text MATCH expression for a list of terms.
 *
 * Each term becomes a prefix query (`term*`) and the terms are OR-ed together.
 *
 * @returns The expression, or null when there are no terms.
 */
function ftsQueryForTerms(terms: string[]): string | null {
  const prefixed = terms.map((term) => term + '*');
  return prefixed.length > 0 ? prefixed.join(' OR ') : null;
}
157
+
158
/** Prefixes every backslash, `%` and `_` in `term` with a backslash so it is literal in a LIKE pattern. */
function escapeLikeTerm(term: string): string {
  // `$&` re-inserts the matched character after the added backslash.
  return term.replace(/[\\%_]/g, '\\$&');
}
161
+
162
/**
 * Produces the bind values for a catalog LIKE clause.
 *
 * @param terms Search terms.
 * @param fieldsPerTerm How many `LIKE ?` placeholders each term occupies.
 * @returns `%term%` patterns (escaped), each repeated `fieldsPerTerm` times, in term order.
 */
function likeParams(terms: string[], fieldsPerTerm: number): string[] {
  return terms.flatMap((term) => {
    const pattern = `%${escapeLikeTerm(term)}%`;
    return Array.from({ length: fieldsPerTerm }, () => pattern);
  });
}
165
+
166
/**
 * Converts an FTS5 `bm25()` rank and the row's position into a keyword score in [0, 1].
 *
 * SQLite's `bm25()` yields negative numbers where a numerically lower (more
 * negative) value is the better match, so the magnitude of a negative rank is
 * the relevance signal. The magnitude is squashed with `m / (1 + m)`, which
 * grows with relevance. A non-finite or non-negative rank carries no signal.
 * The result never falls below the positional score `1 / (1 + index)`. Because
 * rows arrive best-first, both components are non-increasing down the list and
 * the final score therefore preserves the FTS ordering.
 *
 * @param rank Value of `bm25(chunks_fts)` for the row.
 * @param index Zero-based position of the row in the rank-ordered result.
 * @returns Score rounded to six decimals.
 */
function scoreFromRank(rank: number, index: number): number {
  const magnitude = Number.isFinite(rank) && rank < 0 ? -rank : 0;
  const rankScore = magnitude / (1 + magnitude);
  const orderScore = 1 / (1 + index);
  return roundScore(Math.max(rankScore, orderScore));
}
171
+
172
/**
 * Scores a catalog row by the share of query terms found in its text.
 *
 * @param haystack Text to search; callers pass it already lower-cased.
 * @param terms Query terms.
 * @returns 0 when nothing matches (or there are no terms); otherwise
 *   `0.35 + 0.5 * matched/total`, capped at 0.85 and rounded to six decimals.
 */
function catalogScore(haystack: string, terms: string[]): number {
  let hits = 0;
  for (const term of terms) {
    if (haystack.includes(term)) hits += 1;
  }
  if (hits === 0) return 0;
  const coverage = hits / terms.length;
  return roundScore(Math.min(0.85, 0.35 + coverage * 0.5));
}
178
+
179
/** Maps a similarity in [-1, 1] onto [0, 1] via `(score + 1) / 2`, clamps it, and rounds to six decimals. */
function semanticScore(score: number): number {
  const shifted = (score + 1) / 2;
  const clamped = Math.max(0, Math.min(1, shifted));
  return roundScore(clamped);
}
182
+
183
/** Rounds a score to six decimal places. */
function roundScore(score: number): number {
  const fixed = score.toFixed(6);
  return +fixed;
}
186
+
187
/**
 * Blends the per-pass scores into one ranking score.
 *
 * Weights: keyword 0.55, semantic 0.4, catalog 0.35; missing scores count as 0.
 * Entries with a citation chunk id get a 0.05 boost. The total is capped at 1
 * and rounded to six decimals.
 */
function combinedScore(scores: HybridSearchEntry['scores'], citation: HybridSearchEntry['citation']): number {
  const weighted =
    (scores.keyword ?? 0) * 0.55 +
    (scores.semantic ?? 0) * 0.4 +
    (scores.catalog ?? 0) * 0.35;
  const boost = citation?.chunk_id ? 0.05 : 0;
  return roundScore(Math.min(1, weighted + boost));
}
194
+
195
/**
 * Returns `metadata.provenance` when it is a non-array object, otherwise null.
 * The value is not validated beyond that shape check.
 */
function existingProvenance(metadata: Record<string, unknown>): SearchProvenance | null {
  const candidate = metadata.provenance;
  if (!candidate || typeof candidate !== 'object' || Array.isArray(candidate)) return null;
  return candidate as SearchProvenance;
}
199
+
200
/**
 * Resolves provenance for a chunk row.
 *
 * Provenance embedded in the chunk metadata wins. Otherwise, when the row has a
 * source revision id or source URI, provenance is built with `sourceProvenance`,
 * preferring row columns and falling back to same-named metadata keys.
 *
 * @returns The provenance, or null when the row has neither embedded provenance nor a source.
 */
function provenanceForChunk(row: FtsChunkRow): SearchProvenance | null {
  const meta = parseJsonObject(row.chunk_metadata_json);
  const embedded = existingProvenance(meta);
  if (embedded !== null) return embedded;

  const hasSource = Boolean(row.source_revision_id) || Boolean(row.source_uri);
  if (!hasSource) return null;

  const textFromMeta = (key: string): string | null => metadataString(meta, [key]);
  const numberFromMeta = (key: string): number | null => metadataNumber(meta, [key]);
  return sourceProvenance({
    source_ref: textFromMeta('source_ref'),
    source_uri: row.source_uri ?? textFromMeta('source_uri'),
    source_kind: row.source_kind ?? textFromMeta('source_kind'),
    source_revision_id: row.source_revision_id,
    revision: row.revision ?? textFromMeta('revision'),
    hash: row.hash ?? textFromMeta('hash'),
    chunk_id: row.chunk_id,
    start_offset: row.start_offset ?? numberFromMeta('start_offset'),
    end_offset: row.end_offset ?? numberFromMeta('end_offset'),
    status: textFromMeta('status'),
    resolver: 'open-files-read-only',
  });
}
219
+
220
/**
 * Runs the full-text query against `chunks_fts` and returns matching chunks
 * joined with their source revision, source and wiki page (when present).
 *
 * @param db Open knowledge database.
 * @param ftsQuery MATCH expression; a null or empty value short-circuits to `[]`.
 * @param limit Maximum number of rows.
 * @returns Rows ordered by ascending `bm25` rank.
 */
function selectFtsChunks(db: Database, ftsQuery: string | null, limit: number): FtsChunkRow[] {
  if (ftsQuery === null || ftsQuery === '') return [];
  const statement = db.query<FtsChunkRow, [string, number]>(
    `SELECT
       chunks_fts.chunk_id,
       c.kind AS chunk_kind,
       c.wiki_page_id,
       c.text,
       c.token_count,
       c.start_offset,
       c.end_offset,
       c.metadata_json AS chunk_metadata_json,
       c.source_revision_id,
       sr.revision,
       sr.hash,
       s.uri AS source_uri,
       s.kind AS source_kind,
       s.title AS source_title,
       wp.path AS wiki_path,
       wp.title AS wiki_title,
       wp.artifact_uri AS wiki_artifact_uri,
       wp.content_hash AS wiki_content_hash,
       wp.status AS wiki_status,
       wp.metadata_json AS wiki_metadata_json,
       bm25(chunks_fts) AS rank
     FROM chunks_fts
     JOIN chunks c ON c.id = chunks_fts.chunk_id
     LEFT JOIN source_revisions sr ON sr.id = c.source_revision_id
     LEFT JOIN sources s ON s.id = sr.source_id
     LEFT JOIN wiki_pages wp ON wp.id = c.wiki_page_id
     WHERE chunks_fts MATCH ?
     ORDER BY rank ASC
     LIMIT ?`,
  );
  return statement.all(ftsQuery, limit);
}
255
+
256
/**
 * Builds the WHERE fragment for a catalog LIKE search.
 *
 * Each term contributes one parenthesised group that ORs a
 * `lower(COALESCE(field, '')) LIKE ? ESCAPE '\'` test per field; the groups are
 * OR-ed together. With no terms the fragment is `1 = 0` (matches nothing).
 */
function catalogWhere(fields: string[], terms: string[]): string {
  if (terms.length === 0) return '1 = 0';
  // The group text does not depend on the term, so build it once.
  const perTerm = '(' + fields.map((field) => `lower(COALESCE(${field}, '')) LIKE ? ESCAPE '\\'`).join(' OR ') + ')';
  return terms.map(() => perTerm).join(' OR ');
}
261
+
262
/**
 * Finds active wiki pages whose path, title, artifact URI or metadata JSON
 * contains any of the terms.
 *
 * @returns Up to `limit` rows, most recently updated first.
 */
function selectWikiPages(db: Database, terms: string[], limit: number): WikiPageRow[] {
  const searchable = ['path', 'title', 'artifact_uri', 'metadata_json'];
  const sql = `SELECT id, path, title, artifact_uri, content_hash, status, metadata_json
     FROM wiki_pages
     WHERE status = 'active' AND (${catalogWhere(searchable, terms)})
     ORDER BY updated_at DESC
     LIMIT ?`;
  const bindings = likeParams(terms, searchable.length);
  return db.query<WikiPageRow, [...string[], number]>(sql).all(...bindings, limit);
}
272
+
273
/**
 * Finds knowledge indexes whose kind, name, shard key, artifact URI or metadata
 * JSON contains any of the terms.
 *
 * @returns Up to `limit` rows, most recently updated first.
 */
function selectKnowledgeIndexes(db: Database, terms: string[], limit: number): IndexRow[] {
  const searchable = ['kind', 'name', 'shard_key', 'artifact_uri', 'metadata_json'];
  const sql = `SELECT id, kind, name, artifact_uri, shard_key, metadata_json
     FROM knowledge_indexes
     WHERE ${catalogWhere(searchable, terms)}
     ORDER BY updated_at DESC
     LIMIT ?`;
  const bindings = likeParams(terms, searchable.length);
  return db.query<IndexRow, [...string[], number]>(sql).all(...bindings, limit);
}
283
+
284
/**
 * Converts a full-text chunk row into a search entry.
 *
 * Rows with a wiki page id become `wiki_chunk` entries carrying the wiki
 * artifact; all others become `source_chunk` entries. Source fields prefer row
 * columns and fall back to same-named chunk metadata keys.
 *
 * @param row Row from `selectFtsChunks`.
 * @param keywordScore Keyword score to record for the entry.
 */
function chunkResult(row: FtsChunkRow, keywordScore: number): HybridSearchEntry {
  const meta = parseJsonObject(row.chunk_metadata_json);
  const fromWiki = Boolean(row.wiki_page_id);
  const ref = metadataString(meta, ['source_ref']);
  const uri = row.source_uri ?? metadataString(meta, ['source_uri']);

  let source: HybridSearchEntry['source'] = null;
  if (uri || ref) {
    source = {
      uri,
      ref,
      kind: row.source_kind ?? metadataString(meta, ['source_kind']),
      revision: row.revision ?? metadataString(meta, ['revision']),
      hash: row.hash ?? metadataString(meta, ['hash']),
    };
  }

  const scores: HybridSearchEntry['scores'] = { keyword: keywordScore };
  const citation: HybridSearchEntry['citation'] = {
    chunk_id: row.chunk_id,
    start_offset: row.start_offset,
    end_offset: row.end_offset,
  };
  const artifact: HybridSearchEntry['artifact'] = fromWiki
    ? {
        uri: row.wiki_artifact_uri,
        path: row.wiki_path,
        hash: row.wiki_content_hash,
        shard_key: row.wiki_path,
      }
    : null;

  return {
    kind: fromWiki ? 'wiki_chunk' : 'source_chunk',
    id: row.chunk_id,
    title: fromWiki ? row.wiki_title : row.source_title,
    text: row.text,
    score: combinedScore(scores, citation),
    scores,
    source,
    citation,
    artifact,
    provenance: provenanceForChunk(row),
    reasons: ['keyword_match'],
  };
}
321
+
322
/**
 * Converts a wiki page row into a catalog search entry.
 *
 * The catalog score is computed over the lower-cased concatenation of path,
 * title, artifact URI and raw metadata JSON.
 */
function wikiPageResult(row: WikiPageRow, terms: string[]): HybridSearchEntry {
  const meta = parseJsonObject(row.metadata_json);
  const haystack = `${row.path} ${row.title} ${row.artifact_uri ?? ''} ${row.metadata_json}`.toLowerCase();
  const scores: HybridSearchEntry['scores'] = { catalog: catalogScore(haystack, terms) };
  const citation: HybridSearchEntry['citation'] = null;

  return {
    kind: 'wiki_page',
    id: row.id,
    title: row.title,
    text: null,
    score: combinedScore(scores, citation),
    scores,
    source: null,
    citation,
    artifact: {
      uri: row.artifact_uri,
      path: row.path,
      hash: row.content_hash,
      shard_key: row.path,
    },
    provenance: existingProvenance(meta),
    reasons: ['wiki_catalog_match'],
  };
}
346
+
347
/**
 * Converts a knowledge index row into a catalog search entry.
 *
 * The catalog score is computed over the lower-cased concatenation of kind,
 * name, shard key, artifact URI and raw metadata JSON. The artifact path and
 * hash are read from the `artifact_key` and `content_hash` metadata keys.
 */
function indexResult(row: IndexRow, terms: string[]): HybridSearchEntry {
  const meta = parseJsonObject(row.metadata_json);
  const haystack = `${row.kind} ${row.name} ${row.shard_key ?? ''} ${row.artifact_uri ?? ''} ${row.metadata_json}`.toLowerCase();
  const scores: HybridSearchEntry['scores'] = { catalog: catalogScore(haystack, terms) };
  const citation: HybridSearchEntry['citation'] = null;

  return {
    kind: 'knowledge_index',
    id: row.id,
    title: row.name,
    text: null,
    score: combinedScore(scores, citation),
    scores,
    source: null,
    citation,
    artifact: {
      uri: row.artifact_uri,
      path: metadataString(meta, ['artifact_key']),
      hash: metadataString(meta, ['content_hash']),
      shard_key: row.shard_key,
    },
    provenance: existingProvenance(meta),
    reasons: ['index_catalog_match'],
  };
}
371
+
372
/**
 * Adds an entry to the result map, merging with an existing entry that has the
 * same `kind:id` key.
 *
 * On merge, each per-pass score becomes the larger of the two (a resulting 0 is
 * stored as absent), reasons are unioned, null/undefined descriptive fields on
 * the existing entry are filled from the new one, and the combined score is
 * recomputed. The existing entry object is mutated in place.
 */
function mergeResult(results: Map<string, HybridSearchEntry>, entry: HybridSearchEntry): void {
  const key = `${entry.kind}:${entry.id}`;
  const current = results.get(key);
  if (current === undefined) {
    results.set(key, entry);
    return;
  }

  const best = (left: number | undefined, right: number | undefined): number | undefined =>
    Math.max(left ?? 0, right ?? 0) || undefined;

  current.scores = {
    keyword: best(current.scores.keyword, entry.scores.keyword),
    semantic: best(current.scores.semantic, entry.scores.semantic),
    catalog: best(current.scores.catalog, entry.scores.catalog),
  };
  current.reasons = unique([...current.reasons, ...entry.reasons]);
  if (current.text == null) current.text = entry.text;
  if (current.title == null) current.title = entry.title;
  if (current.source == null) current.source = entry.source;
  if (current.citation == null) current.citation = entry.citation;
  if (current.artifact == null) current.artifact = entry.artifact;
  if (current.provenance == null) current.provenance = entry.provenance;
  current.score = combinedScore(current.scores, current.citation);
}
393
+
394
/**
 * Orders entries for presentation: highest score first, then by kind
 * (source chunk, wiki chunk, wiki page, knowledge index), then by id.
 *
 * @param results Entries to order; the array is not modified.
 * @returns A new, sorted array.
 */
function sortResults(results: HybridSearchEntry[]): HybridSearchEntry[] {
  const kindOrder: Record<SearchResultKind, number> = {
    source_chunk: 0,
    wiki_chunk: 1,
    wiki_page: 2,
    knowledge_index: 3,
  };
  // Array.prototype.sort works in place; copy first so the caller's array keeps its order.
  return [...results].sort((a, b) => {
    if (b.score !== a.score) return b.score - a.score;
    return kindOrder[a.kind] - kindOrder[b.kind] || a.id.localeCompare(b.id);
  });
}
406
+
407
/**
 * Runs keyword (full-text), catalog (wiki pages and knowledge indexes) and,
 * when enabled, semantic search, then merges and ranks the hits.
 *
 * The semantic pass runs when `options.semantic` is `true`, `options.fake` is
 * `true`, or a `modelRef` is supplied. A failure in the semantic pass is not
 * fatal: it is reported in `warnings` and the keyword/catalog hits are still
 * returned.
 *
 * @param options Search options; `limit` defaults to 10 and is clamped to 1..100.
 * @returns The merged, sorted entries (at most `limit`) with per-pass counts.
 * @throws Error when the trimmed query is empty.
 */
export async function hybridSearch(options: HybridSearchOptions): Promise<HybridSearchResult> {
  const query = options.query.trim();
  if (!query) throw new Error('Search query is required.');
  // The limit is bound to SQL `LIMIT ?`, so it must be a whole number: NaN falls
  // back to the default and fractions are truncated before clamping.
  const requestedLimit = options.limit ?? 10;
  const limit = Number.isNaN(requestedLimit) ? 10 : Math.max(1, Math.min(Math.trunc(requestedLimit), 100));
  const terms = queryTerms(query);
  const ftsQuery = ftsQueryForTerms(terms);
  const semanticEnabled = options.semantic === true || options.fake === true || Boolean(options.modelRef);
  const warnings: string[] = [];
  let semanticProvider: string | null = null;
  let semanticModel: string | null = null;
  let semanticDimensions: number | null = null;
  let keywordCount = 0;
  let catalogCount = 0;
  let semanticCount = 0;
  const merged = new Map<string, HybridSearchEntry>();

  migrateKnowledgeDb(options.dbPath);
  const db = openKnowledgeDb(options.dbPath);
  try {
    // Over-fetch keyword rows so merging and re-ranking have candidates beyond `limit`.
    const ftsRows = selectFtsChunks(db, ftsQuery, Math.max(limit * 3, 20));
    keywordCount = ftsRows.length;
    ftsRows.forEach((row, index) => mergeResult(merged, chunkResult(row, scoreFromRank(row.rank, index))));

    const wikiRows = selectWikiPages(db, terms, Math.max(limit, 10));
    const indexRows = selectKnowledgeIndexes(db, terms, Math.max(limit, 10));
    catalogCount = wikiRows.length + indexRows.length;
    wikiRows.forEach((row) => mergeResult(merged, wikiPageResult(row, terms)));
    indexRows.forEach((row) => mergeResult(merged, indexResult(row, terms)));
  } finally {
    db.close();
  }

  if (semanticEnabled) {
    try {
      const semantic = await searchVectorIndex({
        dbPath: options.dbPath,
        query,
        limit: Math.max(limit * 3, 20),
        config: options.config,
        env: options.env,
        modelRef: options.modelRef,
        dimensions: options.dimensions,
        fake: options.fake,
        batchSize: options.batchSize,
        maxParallelCalls: options.maxParallelCalls,
      });
      semanticProvider = semantic.provider;
      semanticModel = semantic.model;
      semanticDimensions = semantic.dimensions;
      semanticCount = semantic.results.length;
      for (const row of semantic.results) {
        // Results are merged by `kind:id`. If the keyword pass already recorded this
        // chunk as a wiki chunk, reuse that kind so both hits merge into one entry
        // instead of the chunk being returned twice.
        const kind: SearchResultKind = merged.has(`wiki_chunk:${row.chunk_id}`) ? 'wiki_chunk' : 'source_chunk';
        const result: HybridSearchEntry = {
          kind,
          id: row.chunk_id,
          title: null,
          text: row.text,
          score: 0,
          scores: { semantic: semanticScore(row.score) },
          source: {
            uri: row.source_uri,
            ref: row.source_ref,
            kind: row.provenance?.source_kind ?? null,
            revision: row.revision,
            hash: row.hash,
          },
          citation: {
            chunk_id: row.chunk_id,
            start_offset: row.provenance?.start_offset ?? null,
            end_offset: row.provenance?.end_offset ?? null,
          },
          artifact: null,
          provenance: row.provenance,
          reasons: ['semantic_match'],
        };
        result.score = combinedScore(result.scores, result.citation);
        mergeResult(merged, result);
      }
    } catch (error) {
      // Best effort: keyword and catalog hits are still useful without the semantic pass.
      warnings.push(`semantic_search_failed: ${error instanceof Error ? error.message : String(error)}`);
    }
  }

  const results = sortResults(Array.from(merged.values())).slice(0, limit);
  return {
    query,
    limit,
    mode: {
      keyword: true,
      catalog: true,
      semantic: semanticEnabled,
    },
    semantic_provider: semanticProvider,
    semantic_model: semanticModel,
    semantic_dimensions: semanticDimensions,
    counts: {
      keyword_results: keywordCount,
      catalog_results: catalogCount,
      semantic_results: semanticCount,
      // Counted after truncation to `limit`.
      merged_results: results.length,
    },
    warnings,
    results,
  };
}
package/src/service.ts CHANGED
@@ -1,10 +1,18 @@
1
1
  import { createArtifactStore } from './artifact-store';
2
+ import {
3
+ embeddingIndexStatus,
4
+ indexKnowledgeEmbeddings,
5
+ searchVectorIndex,
6
+ type EmbeddingIndexOptions,
7
+ type EmbeddingSearchOptions,
8
+ } from './embeddings';
2
9
  import { consumeOpenFilesOutbox } from './outbox-consume';
3
10
  import { getKnowledgeDbStats, migrateKnowledgeDb, openKnowledgeDb } from './knowledge-db';
4
11
  import { ingestOpenFilesManifest } from './manifest-ingest';
5
12
  import { ingestSourceRef } from './source-ingest';
6
13
  import { resolveOpenFilesSource } from './source-resolver';
7
14
  import { providerStatus, listModelRegistry, type ProviderStatusResult, type ModelRegistryEntry } from './providers';
15
+ import { hybridSearch, type HybridSearchOptions } from './search';
8
16
  import { resolveSafetyPolicy } from './safety';
9
17
  import {
10
18
  recordStorageObjects,
@@ -184,6 +192,38 @@ export class KnowledgeService {
184
192
  modelRegistry(): ModelRegistryEntry[] {
185
193
  return listModelRegistry(this.config());
186
194
  }
195
+
196
/** Returns `embeddingIndexStatus` for the workspace's knowledge database. */
embeddingStatus() {
  const { knowledgeDbPath } = this.ensureWorkspace();
  return embeddingIndexStatus(knowledgeDbPath);
}
200
+
201
/**
 * Calls `indexKnowledgeEmbeddings` for the workspace's knowledge database.
 * `dbPath` and `config` are supplied by the service and override anything in `options`.
 */
async indexEmbeddings(options: Omit<EmbeddingIndexOptions, 'dbPath' | 'config'> = {}) {
  const { knowledgeDbPath: dbPath } = this.ensureWorkspace();
  return indexKnowledgeEmbeddings({ ...options, dbPath, config: this.config() });
}
209
+
210
/**
 * Calls `searchVectorIndex` for the workspace's knowledge database.
 * `dbPath` and `config` are supplied by the service and override anything in `options`.
 */
async semanticSearch(options: Omit<EmbeddingSearchOptions, 'dbPath' | 'config'>) {
  const { knowledgeDbPath: dbPath } = this.ensureWorkspace();
  return searchVectorIndex({ ...options, dbPath, config: this.config() });
}
218
+
219
/**
 * Calls `hybridSearch` for the workspace's knowledge database.
 * `dbPath` and `config` are supplied by the service and override anything in `options`.
 */
async search(options: Omit<HybridSearchOptions, 'dbPath' | 'config'>) {
  const { knowledgeDbPath: dbPath } = this.ensureWorkspace();
  return hybridSearch({ ...options, dbPath, config: this.config() });
}
187
227
  }
188
228
 
189
229
  export function createKnowledgeService(options: KnowledgeServiceOptions = {}): KnowledgeService {
@@ -35,6 +35,11 @@ function stableId(prefix: string, value: string): string {
35
35
  return `${prefix}_${createHash('sha256').update(value).digest('hex').slice(0, 20)}`;
36
36
  }
37
37
 
38
/**
 * Estimates a token count as 1.25 tokens per whitespace-separated word,
 * rounded up, with a minimum of 1 (also for empty text).
 */
function estimateTokenCount(text: string): number {
  let wordCount = 0;
  for (const piece of text.trim().split(/\s+/)) {
    if (piece) wordCount += 1;
  }
  return Math.max(1, Math.ceil(wordCount * 1.25));
}
42
+
38
43
  export function agentSchemaTemplate(): string {
39
44
  return `# Knowledge Agent Schema v1
40
45
 
@@ -151,6 +156,39 @@ function provenanceFor(artifact: CatalogArtifact): GeneratedArtifactProvenance {
151
156
  });
152
157
  }
153
158
 
159
/**
 * Replaces the searchable chunk of a wiki page with a single chunk holding `body`.
 *
 * Existing chunks of the page are removed from `chunks` and `chunks_fts`, then
 * one chunk (kind `wiki`, ordinal 0, spanning the whole body) is inserted and
 * indexed for full-text search. The chunk id is derived from the page id and
 * the artifact hash (or URI when no hash is available).
 *
 * Embedding rows and vector index entries that belong to a replaced chunk are
 * removed as well, mirroring the cleanup done when a source revision is
 * invalidated, so stale vectors do not outlive their chunk. They are kept only
 * when the chunk id is unchanged and the artifact carries a hash, i.e. when the
 * content is known to be the same.
 *
 * @param db Open knowledge database.
 * @param pageId Id of the wiki page that owns the chunk.
 * @param title Title stored in the full-text row.
 * @param artifact Artifact the page was generated from.
 * @param body Page text to index.
 * @param now Timestamp string stored as `created_at`.
 */
function recordWikiChunk(db: Database, pageId: string, title: string, artifact: CatalogArtifact, body: string, now: string): void {
  const provenance = provenanceFor(artifact);
  const chunkId = stableId('chk', `${pageId}\u0000${artifact.hash ?? artifact.uri}`);
  const existing = db.query<{ id: string }, [string]>('SELECT id FROM chunks WHERE wiki_page_id = ?').all(pageId);
  for (const row of existing) {
    db.run('DELETE FROM chunks_fts WHERE chunk_id = ?', [row.id]);
    // Without a hash the id is URI-based and cannot prove the content is unchanged.
    const contentKnownUnchanged = row.id === chunkId && artifact.hash != null;
    if (!contentKnownUnchanged) {
      db.run('DELETE FROM vector_index_entries WHERE chunk_id = ?', [row.id]);
      db.run('DELETE FROM chunk_embeddings WHERE chunk_id = ?', [row.id]);
    }
  }
  db.run('DELETE FROM chunks WHERE wiki_page_id = ?', [pageId]);
  db.run(
    `INSERT INTO chunks (id, wiki_page_id, kind, ordinal, text, token_count, start_offset, end_offset, metadata_json, created_at)
     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
    [
      chunkId,
      pageId,
      'wiki',
      0,
      body,
      estimateTokenCount(body),
      0,
      body.length,
      JSON.stringify({
        artifact_key: artifact.key,
        artifact_uri: artifact.uri,
        content_hash: artifact.hash ?? null,
        provenance,
      }),
      now,
    ],
  );
  db.run(
    'INSERT INTO chunks_fts (chunk_id, text, title, source_uri) VALUES (?, ?, ?, ?)',
    [chunkId, body, title, artifact.uri],
  );
}
191
+
154
192
  export function recordWikiLayoutCatalog(db: Database, artifacts: CatalogArtifact[], now = new Date()): void {
155
193
  const timestamp = now.toISOString();
156
194
  const rootIndex = artifacts.find((artifact) => artifact.key.endsWith('indexes/root.md'));
@@ -182,6 +220,7 @@ export function recordWikiLayoutCatalog(db: Database, artifacts: CatalogArtifact
182
220
  }
183
221
 
184
222
  if (wikiReadme) {
223
+ const wikiPageId = stableId('wiki', 'wiki/README.md');
185
224
  db.run(
186
225
  `INSERT INTO wiki_pages (id, path, title, artifact_uri, content_hash, status, metadata_json, created_at, updated_at)
187
226
  VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
@@ -193,7 +232,7 @@ export function recordWikiLayoutCatalog(db: Database, artifacts: CatalogArtifact
193
232
  metadata_json = excluded.metadata_json,
194
233
  updated_at = excluded.updated_at`,
195
234
  [
196
- stableId('wiki', 'wiki/README.md'),
235
+ wikiPageId,
197
236
  'wiki/README.md',
198
237
  'Wiki',
199
238
  wikiReadme.uri,
@@ -207,5 +246,6 @@ export function recordWikiLayoutCatalog(db: Database, artifacts: CatalogArtifact
207
246
  timestamp,
208
247
  ],
209
248
  );
249
+ recordWikiChunk(db, wikiPageId, 'Wiki', wikiReadme, wikiReadmeTemplate(), timestamp);
210
250
  }
211
251
  }