@stablemodels/qmd-cf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/LICENSE +21 -0
  2. package/dist/chunker.d.ts +11 -0
  3. package/dist/chunker.d.ts.map +1 -0
  4. package/dist/chunker.js +199 -0
  5. package/dist/chunker.js.map +1 -0
  6. package/dist/fts.d.ts +19 -0
  7. package/dist/fts.d.ts.map +1 -0
  8. package/dist/fts.js +109 -0
  9. package/dist/fts.js.map +1 -0
  10. package/dist/hash.d.ts +7 -0
  11. package/dist/hash.d.ts.map +1 -0
  12. package/dist/hash.js +14 -0
  13. package/dist/hash.js.map +1 -0
  14. package/dist/index.d.ts +56 -0
  15. package/dist/index.d.ts.map +1 -0
  16. package/dist/index.js +57 -0
  17. package/dist/index.js.map +1 -0
  18. package/dist/qmd.d.ts +158 -0
  19. package/dist/qmd.d.ts.map +1 -0
  20. package/dist/qmd.js +462 -0
  21. package/dist/qmd.js.map +1 -0
  22. package/dist/rrf.d.ts +22 -0
  23. package/dist/rrf.d.ts.map +1 -0
  24. package/dist/rrf.js +92 -0
  25. package/dist/rrf.js.map +1 -0
  26. package/dist/schema.d.ts +14 -0
  27. package/dist/schema.d.ts.map +1 -0
  28. package/dist/schema.js +128 -0
  29. package/dist/schema.js.map +1 -0
  30. package/dist/testing.d.ts +77 -0
  31. package/dist/testing.d.ts.map +1 -0
  32. package/dist/testing.js +242 -0
  33. package/dist/testing.js.map +1 -0
  34. package/dist/types.d.ts +118 -0
  35. package/dist/types.d.ts.map +1 -0
  36. package/dist/types.js +9 -0
  37. package/dist/types.js.map +1 -0
  38. package/dist/vector.d.ts +38 -0
  39. package/dist/vector.d.ts.map +1 -0
  40. package/dist/vector.js +174 -0
  41. package/dist/vector.js.map +1 -0
  42. package/package.json +49 -0
  43. package/src/bun-sqlite.d.ts +17 -0
  44. package/src/chunker.ts +250 -0
  45. package/src/fts.ts +140 -0
  46. package/src/hash.ts +13 -0
  47. package/src/index.ts +72 -0
  48. package/src/qmd.ts +706 -0
  49. package/src/rrf.ts +115 -0
  50. package/src/schema.ts +147 -0
  51. package/src/testing.ts +303 -0
  52. package/src/types.ts +124 -0
  53. package/src/vector.ts +236 -0
package/src/qmd.ts ADDED
@@ -0,0 +1,706 @@
1
+ import { chunkText } from "./chunker.js";
2
+ import { searchFts } from "./fts.js";
3
+ import { fnv1a32 } from "./hash.js";
4
+ import { reciprocalRankFusion } from "./rrf.js";
5
+ import { initSchema } from "./schema.js";
6
+ import type {
7
+ Document,
8
+ EmbedFn,
9
+ FtsResult,
10
+ HybridSearchOptions,
11
+ IndexStats,
12
+ QmdConfig,
13
+ SearchOptions,
14
+ SearchResult,
15
+ VectorResult,
16
+ } from "./types.js";
17
+ import { indexVectors, removeVectors, searchVector } from "./vector.js";
18
+
19
/**
 * Minimum normalized BM25 score the top FTS hit must reach before `search()`
 * treats it as a "strong signal" and returns FTS results without querying
 * the vector index.
 */
const STRONG_SIGNAL_MIN_SCORE = 0.85;
/**
 * Minimum gap between the top-1 and top-2 BM25 scores for a strong signal.
 * When there is no second hit, `search()` compares the top score against 0.
 */
const STRONG_SIGNAL_MIN_GAP = 0.15;
23
+
24
+ /**
25
+ * Qmd — Hybrid full-text + vector search for Cloudflare Durable Objects.
26
+ *
27
+ * A DO-native reimagination of qmd (https://github.com/tobi/qmd) that brings
28
+ * hybrid BM25 + semantic search to Cloudflare's edge.
29
+ *
30
+ * FTS5 runs co-located in the Durable Object's SQLite for zero-latency keyword search.
31
+ * Vector search optionally uses Cloudflare Vectorize for semantic similarity.
32
+ *
33
+ * Usage:
34
+ * ```ts
35
+ * // FTS-only (no external dependencies)
36
+ * const qmd = new Qmd(ctx.storage.sql);
37
+ *
38
+ * // Hybrid FTS + Vector
39
+ * const qmd = new Qmd(ctx.storage.sql, {
40
+ * vectorize: env.VECTORIZE,
41
+ * embedFn: (texts) => workerAiEmbed(env.AI, texts),
42
+ * });
43
+ *
44
+ * // Index a document
45
+ * await qmd.index({ id: "soul.md", content: "...", title: "Soul" });
46
+ *
47
+ * // Search
48
+ * const results = await qmd.search("what does the agent care about?");
49
+ * ```
50
+ */
51
+ export class Qmd {
52
/** SQL handle used for every metadata, chunk and FTS statement. */
private sql: SqlStorage;
/** Vectorize binding, or null when the instance runs FTS-only. */
private vectorize: Vectorize | null;
/** Embedding callback, or null when the instance runs FTS-only. */
private embedFn: EmbedFn | null;
/** Effective configuration with every default filled in. */
private config: Required<QmdConfig>;
/** Flipped to true by ensureInit() once the schema has been set up. */
private initialized = false;

/**
 * @param sql SQL storage handle the index lives in.
 * @param options Optional Vectorize binding, embedding function and
 *   chunking/tokenizer overrides. Defaults: chunkSize 3200, chunkOverlap 480,
 *   tokenizer "unicode61".
 * @throws Error when `vectorize` is supplied without `embedFn`.
 */
constructor(
  sql: SqlStorage,
  options?: {
    vectorize?: Vectorize;
    embedFn?: EmbedFn;
    config?: QmdConfig;
  },
) {
  const overrides = options?.config;

  this.sql = sql;
  this.vectorize = options?.vectorize ?? null;
  this.embedFn = options?.embedFn ?? null;

  // A vector index is useless without a way to embed text for it.
  const embedderMissing = this.vectorize && !this.embedFn;
  if (embedderMissing) {
    throw new Error("embedFn is required when vectorize is provided");
  }

  this.config = {
    chunkSize: overrides?.chunkSize ?? 3200,
    chunkOverlap: overrides?.chunkOverlap ?? 480,
    tokenizer: overrides?.tokenizer ?? "unicode61",
  };
}
80
+
81
/**
 * Lazily initializes the schema with the configured tokenizer.
 * Runs `initSchema` at most once per instance; later calls are no-ops.
 */
private ensureInit(): void {
  if (!this.initialized) {
    initSchema(this.sql, this.config.tokenizer);
    this.initialized = true;
  }
}
87
+
88
/** True when both a Vectorize binding and an embedding function are set. */
get hasVectorSearch(): boolean {
  const somethingMissing = this.vectorize === null || this.embedFn === null;
  return !somethingMissing;
}
92
+
93
/**
 * Index (or re-index) a single document.
 *
 * The content is hashed with `fnv1a32`. If a row with the same id and the
 * same hash already exists, only the document metadata (title, doc type,
 * namespace, metadata JSON, updated_at) is refreshed and the existing chunk
 * count is reported with `skipped: true`.
 *
 * Otherwise the document row is replaced, its old chunks are deleted, the
 * content is re-chunked and inserted, and — when Vectorize and an embedding
 * function are configured — the new chunks are passed to `indexVectors`,
 * which is awaited before this method resolves.
 *
 * @param doc Document to index.
 * @returns Number of chunks stored for the document and whether chunking was skipped.
 */
async index(doc: Document): Promise<{ chunks: number; skipped: boolean }> {
  this.ensureInit();

  const docId = doc.id;
  const contentHash = fnv1a32(doc.content);
  const metadataJson = doc.metadata ? JSON.stringify(doc.metadata) : null;

  // Look up the hash stored by a previous indexing run, if any.
  const [stored] = this.sql
    .exec<{ content_hash: string | null }>(
      "SELECT content_hash FROM qmd_documents WHERE id = ?",
      docId,
    )
    .toArray();

  const contentUnchanged =
    stored !== undefined && stored.content_hash === contentHash;

  if (contentUnchanged) {
    // Same content: refresh metadata only and keep the existing chunks.
    this.sql.exec(
      `UPDATE qmd_documents SET title = ?, doc_type = ?, namespace = ?, metadata = ?, updated_at = datetime('now')
       WHERE id = ?`,
      doc.title ?? null,
      doc.docType ?? null,
      doc.namespace ?? null,
      metadataJson,
      docId,
    );
    const { cnt } = this.sql
      .exec<{ cnt: number }>(
        "SELECT COUNT(*) as cnt FROM qmd_chunks WHERE doc_id = ?",
        docId,
      )
      .one();
    return { chunks: cnt, skipped: true };
  }

  // New or changed content: replace the document row, recording the new hash.
  this.sql.exec(
    `INSERT OR REPLACE INTO qmd_documents (id, title, doc_type, namespace, metadata, content_hash, updated_at)
     VALUES (?, ?, ?, ?, ?, ?, datetime('now'))`,
    docId,
    doc.title ?? null,
    doc.docType ?? null,
    doc.namespace ?? null,
    metadataJson,
    contentHash,
  );

  // Drop the previous chunks (per the original comment, triggers clean up FTS).
  this.sql.exec("DELETE FROM qmd_chunks WHERE doc_id = ?", docId);

  const freshChunks = chunkText(
    docId,
    doc.content,
    this.config.chunkSize,
    this.config.chunkOverlap,
  );

  for (const piece of freshChunks) {
    this.sql.exec(
      "INSERT INTO qmd_chunks (doc_id, seq, content, char_offset) VALUES (?, ?, ?, ?)",
      piece.docId,
      piece.seq,
      piece.text,
      piece.charOffset,
    );
  }

  // Vector indexing happens after the FTS rows are already written.
  // NOTE(review): vectors of a previous, longer version of this document are
  // not removed here — confirm whether indexVectors handles stale entries.
  if (this.vectorize && this.embedFn) {
    const contextText = this.getContextsForDoc(docId)
      .map((ctx) => ctx.description)
      .join(". ");

    const vectorInputs = freshChunks.map((piece) => ({
      docId: piece.docId,
      seq: piece.seq,
      text: piece.text,
      title: doc.title,
      namespace: doc.namespace,
      docType: doc.docType,
      context: contextText || undefined,
    }));

    await indexVectors(this.vectorize, this.embedFn, vectorInputs);
  }

  return { chunks: freshChunks.length, skipped: false };
}
191
+
192
/**
 * Index several documents in one call.
 *
 * Each document goes through the same hash check, metadata upsert and
 * re-chunking as `index()`. The difference is that vector chunks from all
 * changed documents are collected and handed to `indexVectors` in a single
 * call at the end, instead of once per document.
 *
 * @param docs Documents to index, processed in array order.
 * @returns `documents` is the input length, `chunks` the total chunk count
 *   (including chunks of unchanged documents), `skipped` the number of
 *   documents whose content hash was unchanged.
 */
async indexBatch(
  docs: Document[],
): Promise<{ documents: number; chunks: number; skipped: number }> {
  this.ensureInit();

  type PendingVectorChunk = {
    docId: string;
    seq: number;
    text: string;
    title?: string;
    namespace?: string;
    docType?: string;
    context?: string;
  };

  const pendingVectors: PendingVectorChunk[] = [];
  let chunkTotal = 0;
  let unchangedDocs = 0;

  for (const doc of docs) {
    const docId = doc.id;
    const contentHash = fnv1a32(doc.content);
    const metadataJson = doc.metadata ? JSON.stringify(doc.metadata) : null;

    const [stored] = this.sql
      .exec<{ content_hash: string | null }>(
        "SELECT content_hash FROM qmd_documents WHERE id = ?",
        docId,
      )
      .toArray();

    if (stored !== undefined && stored.content_hash === contentHash) {
      // Unchanged content: refresh metadata and count the existing chunks.
      this.sql.exec(
        `UPDATE qmd_documents SET title = ?, doc_type = ?, namespace = ?, metadata = ?, updated_at = datetime('now')
         WHERE id = ?`,
        doc.title ?? null,
        doc.docType ?? null,
        doc.namespace ?? null,
        metadataJson,
        docId,
      );
      const { cnt } = this.sql
        .exec<{ cnt: number }>(
          "SELECT COUNT(*) as cnt FROM qmd_chunks WHERE doc_id = ?",
          docId,
        )
        .one();
      chunkTotal += cnt;
      unchangedDocs += 1;
      continue;
    }

    // New or changed content: replace the row and rebuild its chunks.
    this.sql.exec(
      `INSERT OR REPLACE INTO qmd_documents (id, title, doc_type, namespace, metadata, content_hash, updated_at)
       VALUES (?, ?, ?, ?, ?, ?, datetime('now'))`,
      docId,
      doc.title ?? null,
      doc.docType ?? null,
      doc.namespace ?? null,
      metadataJson,
      contentHash,
    );

    this.sql.exec("DELETE FROM qmd_chunks WHERE doc_id = ?", docId);

    const freshChunks = chunkText(
      docId,
      doc.content,
      this.config.chunkSize,
      this.config.chunkOverlap,
    );

    for (const piece of freshChunks) {
      this.sql.exec(
        "INSERT INTO qmd_chunks (doc_id, seq, content, char_offset) VALUES (?, ?, ?, ?)",
        piece.docId,
        piece.seq,
        piece.text,
        piece.charOffset,
      );
    }

    chunkTotal += freshChunks.length;

    // Queue the chunks for the single embedding call after the loop.
    if (this.vectorize && this.embedFn) {
      const contextText = this.getContextsForDoc(docId)
        .map((ctx) => ctx.description)
        .join(". ");

      for (const piece of freshChunks) {
        pendingVectors.push({
          docId: piece.docId,
          seq: piece.seq,
          text: piece.text,
          title: doc.title,
          namespace: doc.namespace,
          docType: doc.docType,
          context: contextText || undefined,
        });
      }
    }
  }

  // One embed + upsert round for everything that changed.
  if (this.vectorize && this.embedFn && pendingVectors.length > 0) {
    await indexVectors(this.vectorize, this.embedFn, pendingVectors);
  }

  return {
    documents: docs.length,
    chunks: chunkTotal,
    skipped: unchangedDocs,
  };
}
309
+
310
/**
 * Delete a document, its chunks and — when a Vectorize binding is
 * configured — its vectors.
 *
 * @param docId Id of the document to remove.
 */
async remove(docId: string): Promise<void> {
  this.ensureInit();

  // Vectors are removed first, while the chunk rows still exist.
  // NOTE(review): presumably removeVectors reads those rows to derive the
  // vector ids — confirm in vector.ts before reordering.
  const vectorIndex = this.vectorize;
  if (vectorIndex) {
    await removeVectors(vectorIndex, this.sql, docId);
  }

  // Chunks first (per the original comment, FTS cleanup happens via trigger),
  // then the document row.
  const deletions = [
    "DELETE FROM qmd_chunks WHERE doc_id = ?",
    "DELETE FROM qmd_documents WHERE id = ?",
  ];
  for (const statement of deletions) {
    this.sql.exec(statement, docId);
  }
}
326
+
327
/**
 * Keyword search over the FTS5 index (BM25 ranking).
 * Works without any vector configuration.
 *
 * @param query Search query text.
 * @param options Optional search options forwarded unchanged.
 * @returns Results from the module-level `searchFts` helper.
 */
searchFts(query: string, options?: SearchOptions): FtsResult[] {
  this.ensureInit();
  const hits = searchFts(this.sql, query, options);
  return hits;
}
335
+
336
/**
 * Semantic similarity search through Cloudflare Vectorize.
 *
 * @param query Search query text.
 * @param options Optional search options forwarded unchanged.
 * @throws Error when either `vectorize` or `embedFn` was not configured.
 */
async searchVector(
  query: string,
  options?: SearchOptions,
): Promise<VectorResult[]> {
  const vectorIndex = this.vectorize;
  const embed = this.embedFn;

  // Configuration is validated before the schema is touched.
  if (!vectorIndex || !embed) {
    throw new Error(
      "Vector search requires vectorize and embedFn to be configured",
    );
  }

  this.ensureInit();
  return searchVector(vectorIndex, embed, this.sql, query, options);
}
352
+
353
/**
 * Hybrid search: FTS5 BM25 fused with Vectorize similarity via Reciprocal
 * Rank Fusion.
 *
 * - Without vector configuration, the FTS results are returned as
 *   `SearchResult[]` with `sources: ["fts"]`.
 * - With vector configuration, FTS runs first as a probe. If the top score is
 *   at least 0.85 and leads the second score (or 0 when there is none) by at
 *   least 0.15, the FTS results are returned directly and Vectorize is not
 *   queried.
 * - Otherwise vector search runs and both lists are fused.
 *
 * Each source is queried for `limit * 3` results; the final list is capped at
 * `limit` (default 10).
 */
async search(
  query: string,
  options?: HybridSearchOptions,
): Promise<SearchResult[]> {
  this.ensureInit();

  const limit = options?.limit ?? 10;
  // Over-fetch from each source so fusion has more candidates to rank.
  const perSourceLimit = limit * 3;

  // Fresh options object per source call.
  const buildSourceOptions = (): SearchOptions => ({
    limit: perSourceLimit,
    docType: options?.docType,
    namespace: options?.namespace,
  });

  // Wrap plain FTS hits in the SearchResult shape, capped at `limit`.
  const asFtsOnly = (hits: FtsResult[]): SearchResult[] =>
    hits.slice(0, limit).map((hit) => ({
      docId: hit.docId,
      score: hit.score,
      snippet: hit.snippet,
      sources: ["fts"] as Array<"fts" | "vector">,
      sourceScores: { fts: hit.score },
      title: hit.title,
      docType: hit.docType,
      namespace: hit.namespace,
      metadata: hit.metadata,
    }));

  // FTS runs in every mode: as the only source, or as the probe.
  const ftsResults = searchFts(this.sql, query, buildSourceOptions());

  // FTS-only mode.
  if (!this.vectorize || !this.embedFn) {
    return asFtsOnly(ftsResults);
  }

  // Strong-signal shortcut: a clear BM25 winner skips the vector round-trip.
  const [best, runnerUp] = ftsResults;
  if (best !== undefined) {
    const runnerUpScore = runnerUp !== undefined ? runnerUp.score : 0;
    const isStrong =
      best.score >= STRONG_SIGNAL_MIN_SCORE &&
      best.score - runnerUpScore >= STRONG_SIGNAL_MIN_GAP;
    if (isStrong) {
      return asFtsOnly(ftsResults);
    }
  }

  // No clear winner: query the vector index and fuse both rankings.
  const vectorResults = await searchVector(
    this.vectorize,
    this.embedFn,
    this.sql,
    query,
    buildSourceOptions(),
  );

  return reciprocalRankFusion(ftsResults, vectorResults, {
    ftsWeight: options?.ftsWeight,
    vectorWeight: options?.vectorWeight,
    k: options?.rrfK,
    limit,
  });
}
441
+
442
/**
 * Fetch a document by id.
 *
 * The returned `content` is every stored chunk, in `seq` order, joined with a
 * blank line. It is an approximation of the original text: chunks are created
 * with a configured overlap, and no de-duplication of overlapping text is
 * done here, so such text can appear more than once.
 *
 * @param docId Id of the document to fetch.
 * @returns The joined content with title and doc type, or null when no
 *   document row exists for `docId`.
 */
get(
  docId: string,
): { content: string; title: string | null; docType: string | null } | null {
  this.ensureInit();

  const [meta] = this.sql
    .exec<{ title: string | null; doc_type: string | null }>(
      "SELECT title, doc_type FROM qmd_documents WHERE id = ?",
      docId,
    )
    .toArray();

  if (meta === undefined) {
    return null;
  }

  const chunkRows = this.sql
    .exec<{ content: string }>(
      "SELECT content FROM qmd_chunks WHERE doc_id = ? ORDER BY seq",
      docId,
    )
    .toArray();

  const pieces: string[] = [];
  for (const row of chunkRows) {
    pieces.push(row.content);
  }

  return {
    content: pieces.join("\n\n"),
    title: meta.title,
    docType: meta.doc_type,
  };
}
477
+
478
/**
 * Report whether a document row exists for the given id.
 *
 * @param docId Id of the document to look for.
 */
has(docId: string): boolean {
  this.ensureInit();

  const [row] = this.sql
    .exec<{ cnt: number }>(
      "SELECT COUNT(*) as cnt FROM qmd_documents WHERE id = ?",
      docId,
    )
    .toArray();

  return row !== undefined && row.cnt > 0;
}
491
+
492
/**
 * List indexed document ids in ascending id order.
 *
 * @param options Optional exact-match filters. A filter is applied only when
 *   its value is truthy, so an empty string means "no filter".
 * @returns Matching document ids.
 */
list(options?: { namespace?: string; docType?: string }): string[] {
  this.ensureInit();

  // Candidate clauses paired with their values; falsy values are dropped.
  const candidates: Array<[string, string | undefined]> = [
    ["namespace = ?", options?.namespace],
    ["doc_type = ?", options?.docType],
  ];
  const active = candidates.filter(([, value]) => Boolean(value));

  const filters = active.map(([clause]) => clause);
  const bindings: unknown[] = active.map(([, value]) => value);

  const where = filters.length > 0 ? `WHERE ${filters.join(" AND ")}` : "";

  const rows = this.sql
    .exec<{ id: string }>(
      `SELECT id FROM qmd_documents ${where} ORDER BY id`,
      ...bindings,
    )
    .toArray();

  const ids: string[] = [];
  for (const row of rows) {
    ids.push(row.id);
  }
  return ids;
}
520
+
521
/**
 * List documents by namespace or namespace prefix. Direct SQL query — no FTS
 * or vector search involved.
 *
 * Pattern rules:
 * - No `*`: exact namespace match.
 * - Trailing `*` (e.g. "people/*"): matches every namespace starting with
 *   "people/". Trailing slashes before the star are normalized away.
 * - Bare `*`: matches every document that has a namespace.
 * - Only a trailing `*` acts as a wildcard. `%`, `_` and `\` in the pattern
 *   are matched literally; they are escaped so SQL `LIKE` does not treat
 *   them as wildcards.
 *
 * Only documents with at least one chunk are returned (inner join on chunks).
 * `content` is the document's chunks joined with a blank line.
 * NOTE(review): the chunk order inside GROUP_CONCAT is not pinned to `seq` —
 * confirm whether callers depend on ordered content.
 *
 * @param pattern Namespace or trailing-`*` prefix pattern.
 * @param limit Maximum number of documents to return (default 50).
 * @returns Documents ordered by most recently updated first.
 */
listByNamespace(
  pattern: string,
  limit = 50,
): Array<{
  docId: string;
  title: string | null;
  content: string;
  namespace: string | null;
}> {
  this.ensureInit();

  let whereClause: string;
  let binding: string;

  if (pattern.includes("*")) {
    const withoutStars = pattern.replace(/\*+$/, "");
    // The SQL text is `ESCAPE '\'`: backslash is the LIKE escape character.
    whereClause = "d.namespace LIKE ? ESCAPE '\\'";

    if (withoutStars === "") {
      // Bare "*": every non-null namespace.
      binding = "%";
    } else {
      const prefix = withoutStars.replace(/\/+$/, "");
      // Neutralize LIKE metacharacters so they only match themselves.
      const escapedPrefix = prefix.replace(/[\\%_]/g, "\\$&");
      binding = `${escapedPrefix}/%`;
    }
  } else {
    whereClause = "d.namespace = ?";
    binding = pattern;
  }

  const rows = this.sql
    .exec<{
      id: string;
      title: string | null;
      namespace: string | null;
      content: string;
    }>(
      `SELECT d.id, d.title, d.namespace,
              GROUP_CONCAT(c.content, '\n\n') as content
       FROM qmd_documents d
       JOIN qmd_chunks c ON c.doc_id = d.id
       WHERE ${whereClause}
       GROUP BY d.id
       ORDER BY d.updated_at DESC
       LIMIT ?`,
      binding,
      limit,
    )
    .toArray();

  return rows.map((r) => ({
    docId: r.id,
    title: r.title,
    content: r.content,
    namespace: r.namespace,
  }));
}
576
+
577
/**
 * Collect index statistics: document and chunk counts plus the distinct
 * non-null namespaces and doc types.
 *
 * `totalVectors` is always reported as 0; per the original comment, the
 * Vectorize count cannot be queried from the binding.
 */
stats(): IndexStats {
  this.ensureInit();

  // Run a single-row COUNT(*) statement and return the number.
  const countRows = (statement: string): number =>
    this.sql.exec<{ cnt: number }>(statement).one().cnt;

  const totalDocuments = countRows("SELECT COUNT(*) as cnt FROM qmd_documents");
  const totalChunks = countRows("SELECT COUNT(*) as cnt FROM qmd_chunks");

  const namespaces: string[] = [];
  const namespaceRows = this.sql
    .exec<{ namespace: string }>(
      "SELECT DISTINCT namespace FROM qmd_documents WHERE namespace IS NOT NULL",
    )
    .toArray();
  for (const row of namespaceRows) {
    namespaces.push(row.namespace);
  }

  const docTypes: string[] = [];
  const docTypeRows = this.sql
    .exec<{ doc_type: string }>(
      "SELECT DISTINCT doc_type FROM qmd_documents WHERE doc_type IS NOT NULL",
    )
    .toArray();
  for (const row of docTypeRows) {
    docTypes.push(row.doc_type);
  }

  return {
    totalDocuments,
    totalChunks,
    totalVectors: 0, // Can't query Vectorize count from binding
    namespaces,
    docTypes,
  };
}
613
+
614
/**
 * Rebuild the FTS index from scratch by issuing the FTS5 'rebuild' command
 * against `qmd_chunks_fts`. Intended for use after schema changes or data
 * corruption.
 */
rebuild(): void {
  this.ensureInit();

  const rebuildCommand =
    "INSERT INTO qmd_chunks_fts(qmd_chunks_fts) VALUES('rebuild')";
  this.sql.exec(rebuildCommand);
}
624
+
625
+ // --- Context system ---
626
+
627
/**
 * Store (or overwrite) the context description for a path prefix.
 * During indexing, descriptions of matching contexts are attached to the
 * chunks sent for vector embedding.
 *
 * @param prefix Path prefix the description applies to.
 * @param description Free-text description.
 * @param namespace Optional namespace; stored as "" when omitted.
 */
setContext(prefix: string, description: string, namespace?: string): void {
  this.ensureInit();

  const storedNamespace = namespace ?? "";
  const upsert =
    "INSERT OR REPLACE INTO qmd_contexts (prefix, namespace, description) VALUES (?, ?, ?)";
  this.sql.exec(upsert, prefix, storedNamespace, description);
}
640
+
641
/**
 * Delete the context stored for a prefix within a namespace.
 *
 * @param prefix Path prefix of the context to delete.
 * @param namespace Optional namespace; "" is used when omitted.
 */
removeContext(prefix: string, namespace?: string): void {
  this.ensureInit();

  const storedNamespace = namespace ?? "";
  const deletion = "DELETE FROM qmd_contexts WHERE prefix = ? AND namespace = ?";
  this.sql.exec(deletion, prefix, storedNamespace);
}
652
+
653
/**
 * List stored contexts ordered by prefix.
 *
 * @param namespace When provided (including ""), only contexts in exactly
 *   that namespace are returned; when omitted, all contexts are returned.
 */
listContexts(
  namespace?: string,
): Array<{ prefix: string; description: string; namespace: string }> {
  this.ensureInit();

  type ContextRow = { prefix: string; description: string; namespace: string };

  const bindings: string[] = namespace !== undefined ? [namespace] : [];
  const statement =
    bindings.length > 0
      ? "SELECT prefix, description, namespace FROM qmd_contexts WHERE namespace = ? ORDER BY prefix"
      : "SELECT prefix, description, namespace FROM qmd_contexts ORDER BY prefix";

  return this.sql.exec<ContextRow>(statement, ...bindings).toArray();
}
674
+
675
/**
 * Find every context whose prefix matches a document id.
 *
 * Candidate prefixes are built hierarchically. For
 * "life/areas/health/exercise.md" they are "", "life/", "life/areas/",
 * "life/areas/health/" and finally the full document id itself.
 * Only contexts stored with the empty namespace ('') are considered.
 *
 * @param docId Document id, with "/" as the path separator.
 * @returns Matching contexts ordered by prefix length, i.e. from most
 *   general to most specific.
 */
getContextsForDoc(
  docId: string,
): Array<{ prefix: string; description: string }> {
  this.ensureInit();

  // Start with the root prefix, then add one entry per parent folder.
  const candidates: string[] = [""];
  const folders = docId.split("/").slice(0, -1);
  let path = "";
  for (const folder of folders) {
    path = `${path}${folder}/`;
    candidates.push(path);
  }
  // The document id itself is also accepted as a prefix.
  candidates.push(docId);

  const placeholders = Array.from(candidates, () => "?").join(", ");
  const rows = this.sql
    .exec<{ prefix: string; description: string }>(
      `SELECT prefix, description FROM qmd_contexts
       WHERE prefix IN (${placeholders}) AND namespace = ''
       ORDER BY length(prefix)`,
      ...candidates,
    )
    .toArray();

  return rows;
}
706
+ }