@stablemodels/qmd-cf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/LICENSE +21 -0
  2. package/dist/chunker.d.ts +11 -0
  3. package/dist/chunker.d.ts.map +1 -0
  4. package/dist/chunker.js +199 -0
  5. package/dist/chunker.js.map +1 -0
  6. package/dist/fts.d.ts +19 -0
  7. package/dist/fts.d.ts.map +1 -0
  8. package/dist/fts.js +109 -0
  9. package/dist/fts.js.map +1 -0
  10. package/dist/hash.d.ts +7 -0
  11. package/dist/hash.d.ts.map +1 -0
  12. package/dist/hash.js +14 -0
  13. package/dist/hash.js.map +1 -0
  14. package/dist/index.d.ts +56 -0
  15. package/dist/index.d.ts.map +1 -0
  16. package/dist/index.js +57 -0
  17. package/dist/index.js.map +1 -0
  18. package/dist/qmd.d.ts +158 -0
  19. package/dist/qmd.d.ts.map +1 -0
  20. package/dist/qmd.js +462 -0
  21. package/dist/qmd.js.map +1 -0
  22. package/dist/rrf.d.ts +22 -0
  23. package/dist/rrf.d.ts.map +1 -0
  24. package/dist/rrf.js +92 -0
  25. package/dist/rrf.js.map +1 -0
  26. package/dist/schema.d.ts +14 -0
  27. package/dist/schema.d.ts.map +1 -0
  28. package/dist/schema.js +128 -0
  29. package/dist/schema.js.map +1 -0
  30. package/dist/testing.d.ts +77 -0
  31. package/dist/testing.d.ts.map +1 -0
  32. package/dist/testing.js +242 -0
  33. package/dist/testing.js.map +1 -0
  34. package/dist/types.d.ts +118 -0
  35. package/dist/types.d.ts.map +1 -0
  36. package/dist/types.js +9 -0
  37. package/dist/types.js.map +1 -0
  38. package/dist/vector.d.ts +38 -0
  39. package/dist/vector.d.ts.map +1 -0
  40. package/dist/vector.js +174 -0
  41. package/dist/vector.js.map +1 -0
  42. package/package.json +49 -0
  43. package/src/bun-sqlite.d.ts +17 -0
  44. package/src/chunker.ts +250 -0
  45. package/src/fts.ts +140 -0
  46. package/src/hash.ts +13 -0
  47. package/src/index.ts +72 -0
  48. package/src/qmd.ts +706 -0
  49. package/src/rrf.ts +115 -0
  50. package/src/schema.ts +147 -0
  51. package/src/testing.ts +303 -0
  52. package/src/types.ts +124 -0
  53. package/src/vector.ts +236 -0
package/src/rrf.ts ADDED
@@ -0,0 +1,115 @@
1
+ import type { FtsResult, SearchResult, VectorResult } from "./types.js";
2
+
3
+ /**
4
+ * Reciprocal Rank Fusion (RRF) — merge ranked result lists into a single ranking.
5
+ *
6
+ * RRF score for document d = Σ(weight_i / (k + rank_i + 1)) across all lists
7
+ * where rank_i is the 0-based position in list i.
8
+ *
9
+ * From qmd: k=60 is the standard constant. Higher k reduces the impact of
10
+ * being ranked #1 vs #5, making the fusion more conservative.
11
+ *
12
+ * Additionally applies a top-rank bonus (from qmd):
13
+ * - Rank #1 in any list: +0.05
14
+ * - Rank #2-3 in any list: +0.02
15
+ * This prevents exact matches from being diluted by expansion queries.
16
+ */
17
+ export function reciprocalRankFusion(
18
+ ftsResults: FtsResult[],
19
+ vectorResults: VectorResult[],
20
+ options: {
21
+ ftsWeight?: number;
22
+ vectorWeight?: number;
23
+ k?: number;
24
+ limit?: number;
25
+ } = {},
26
+ ): SearchResult[] {
27
+ const k = options.k ?? 60;
28
+ const ftsWeight = options.ftsWeight ?? 1.0;
29
+ const vectorWeight = options.vectorWeight ?? 1.0;
30
+ const limit = options.limit ?? 10;
31
+
32
+ const scores = new Map<
33
+ string,
34
+ {
35
+ rrfScore: number;
36
+ topRank: number;
37
+ sources: Set<"fts" | "vector">;
38
+ sourceScores: { fts?: number; vector?: number };
39
+ bestResult: FtsResult | VectorResult;
40
+ }
41
+ >();
42
+
43
+ // Process FTS results
44
+ for (let rank = 0; rank < ftsResults.length; rank++) {
45
+ const r = ftsResults[rank];
46
+ const contribution = ftsWeight / (k + rank + 1);
47
+ const entry = scores.get(r.docId);
48
+
49
+ if (entry) {
50
+ entry.rrfScore += contribution;
51
+ entry.topRank = Math.min(entry.topRank, rank);
52
+ entry.sources.add("fts");
53
+ entry.sourceScores.fts = r.score;
54
+ } else {
55
+ scores.set(r.docId, {
56
+ rrfScore: contribution,
57
+ topRank: rank,
58
+ sources: new Set(["fts"]),
59
+ sourceScores: { fts: r.score },
60
+ bestResult: r,
61
+ });
62
+ }
63
+ }
64
+
65
+ // Process vector results
66
+ for (let rank = 0; rank < vectorResults.length; rank++) {
67
+ const r = vectorResults[rank];
68
+ const contribution = vectorWeight / (k + rank + 1);
69
+ const entry = scores.get(r.docId);
70
+
71
+ if (entry) {
72
+ entry.rrfScore += contribution;
73
+ entry.topRank = Math.min(entry.topRank, rank);
74
+ entry.sources.add("vector");
75
+ entry.sourceScores.vector = r.score;
76
+ // Keep the result with better snippet context
77
+ if (r.score > (entry.sourceScores.fts ?? 0)) {
78
+ entry.bestResult = r;
79
+ }
80
+ } else {
81
+ scores.set(r.docId, {
82
+ rrfScore: contribution,
83
+ topRank: rank,
84
+ sources: new Set(["vector"]),
85
+ sourceScores: { vector: r.score },
86
+ bestResult: r,
87
+ });
88
+ }
89
+ }
90
+
91
+ // Apply top-rank bonus
92
+ for (const entry of scores.values()) {
93
+ if (entry.topRank === 0) {
94
+ entry.rrfScore += 0.05;
95
+ } else if (entry.topRank <= 2) {
96
+ entry.rrfScore += 0.02;
97
+ }
98
+ }
99
+
100
+ // Sort by RRF score and return
101
+ return Array.from(scores.entries())
102
+ .sort((a, b) => b[1].rrfScore - a[1].rrfScore)
103
+ .slice(0, limit)
104
+ .map(([docId, entry]) => ({
105
+ docId,
106
+ score: entry.rrfScore,
107
+ snippet: entry.bestResult.snippet,
108
+ sources: Array.from(entry.sources),
109
+ sourceScores: entry.sourceScores,
110
+ title: entry.bestResult.title,
111
+ docType: entry.bestResult.docType,
112
+ namespace: entry.bestResult.namespace,
113
+ metadata: entry.bestResult.metadata,
114
+ }));
115
+ }
package/src/schema.ts ADDED
@@ -0,0 +1,147 @@
1
+ const CURRENT_VERSION = 2;
2
+
3
+ /**
4
+ * Initialize the FTS5 schema on a Durable Object's SQL storage.
5
+ *
6
+ * Tables created:
7
+ * - `qmd_documents` — document metadata (id, title, docType, namespace, metadata JSON, content_hash)
8
+ * - `qmd_chunks` — chunked content with parent doc reference
9
+ * - `qmd_chunks_fts` — FTS5 virtual table for full-text search over chunks
10
+ * - `qmd_contexts` — semantic context descriptions for path prefixes
11
+ * - `qmd_meta` — schema version tracking
12
+ *
13
+ * The FTS5 table is kept in sync via triggers on `qmd_chunks`.
14
+ */
15
+ export function initSchema(sql: SqlStorage, tokenizer = "unicode61"): void {
16
+ // Create version tracking table first so we can check if already initialized
17
+ sql.exec(`
18
+ CREATE TABLE IF NOT EXISTS qmd_meta (
19
+ key TEXT PRIMARY KEY,
20
+ version INTEGER NOT NULL
21
+ )
22
+ `);
23
+
24
+ const existing = sql
25
+ .exec<{ version: number }>("SELECT version FROM qmd_meta LIMIT 1")
26
+ .toArray();
27
+ const currentVersion = existing.length > 0 ? existing[0].version : 0;
28
+
29
+ if (currentVersion >= CURRENT_VERSION) {
30
+ return; // Already at current version
31
+ }
32
+
33
+ // Version 0 -> 1: initial schema
34
+ if (currentVersion < 1) {
35
+ // Document metadata table
36
+ sql.exec(`
37
+ CREATE TABLE IF NOT EXISTS qmd_documents (
38
+ id TEXT PRIMARY KEY,
39
+ title TEXT,
40
+ doc_type TEXT,
41
+ namespace TEXT,
42
+ metadata TEXT,
43
+ created_at TEXT DEFAULT (datetime('now')),
44
+ updated_at TEXT DEFAULT (datetime('now'))
45
+ )
46
+ `);
47
+
48
+ // Chunk content table
49
+ sql.exec(`
50
+ CREATE TABLE IF NOT EXISTS qmd_chunks (
51
+ doc_id TEXT NOT NULL,
52
+ seq INTEGER NOT NULL,
53
+ content TEXT NOT NULL,
54
+ char_offset INTEGER NOT NULL DEFAULT 0,
55
+ PRIMARY KEY (doc_id, seq),
56
+ FOREIGN KEY (doc_id) REFERENCES qmd_documents(id) ON DELETE CASCADE
57
+ )
58
+ `);
59
+
60
+ // FTS5 virtual table — indexes chunk content with document title for boosted relevance
61
+ sql.exec(`
62
+ CREATE VIRTUAL TABLE IF NOT EXISTS qmd_chunks_fts USING fts5(
63
+ doc_id UNINDEXED,
64
+ seq UNINDEXED,
65
+ title,
66
+ content,
67
+ tokenize='${tokenizer}'
68
+ )
69
+ `);
70
+
71
+ // Triggers to keep FTS in sync with chunks table
72
+ sql.exec(`
73
+ CREATE TRIGGER IF NOT EXISTS qmd_chunks_ai AFTER INSERT ON qmd_chunks
74
+ BEGIN
75
+ INSERT INTO qmd_chunks_fts(doc_id, seq, title, content)
76
+ SELECT NEW.doc_id, NEW.seq, d.title, NEW.content
77
+ FROM qmd_documents d WHERE d.id = NEW.doc_id;
78
+ END
79
+ `);
80
+
81
+ sql.exec(`
82
+ CREATE TRIGGER IF NOT EXISTS qmd_chunks_ad AFTER DELETE ON qmd_chunks
83
+ BEGIN
84
+ DELETE FROM qmd_chunks_fts
85
+ WHERE doc_id = OLD.doc_id AND seq = OLD.seq;
86
+ END
87
+ `);
88
+
89
+ sql.exec(`
90
+ CREATE TRIGGER IF NOT EXISTS qmd_chunks_au AFTER UPDATE ON qmd_chunks
91
+ BEGIN
92
+ DELETE FROM qmd_chunks_fts
93
+ WHERE doc_id = OLD.doc_id AND seq = OLD.seq;
94
+ INSERT INTO qmd_chunks_fts(doc_id, seq, title, content)
95
+ SELECT NEW.doc_id, NEW.seq, d.title, NEW.content
96
+ FROM qmd_documents d WHERE d.id = NEW.doc_id;
97
+ END
98
+ `);
99
+
100
+ // Index for namespace-scoped lookups
101
+ sql.exec(`
102
+ CREATE INDEX IF NOT EXISTS idx_qmd_documents_namespace
103
+ ON qmd_documents(namespace)
104
+ `);
105
+
106
+ // Index for doc_type filtering
107
+ sql.exec(`
108
+ CREATE INDEX IF NOT EXISTS idx_qmd_documents_doc_type
109
+ ON qmd_documents(doc_type)
110
+ `);
111
+ }
112
+
113
+ // Version 1 -> 2: add content_hash column + contexts table
114
+ if (currentVersion < 2) {
115
+ // Add content_hash column for skip-on-unchanged indexing.
116
+ // For fresh installs (currentVersion === 0), the column doesn't exist yet on the just-created table.
117
+ // For upgrades from v1, ALTER TABLE adds the column to the existing table.
118
+ // Both paths use ALTER TABLE which is idempotent-safe with IF NOT EXISTS workaround.
119
+ const cols = sql
120
+ .exec<{ name: string }>("PRAGMA table_info(qmd_documents)")
121
+ .toArray()
122
+ .map((c) => c.name);
123
+ if (!cols.includes("content_hash")) {
124
+ sql.exec("ALTER TABLE qmd_documents ADD COLUMN content_hash TEXT");
125
+ }
126
+
127
+ // Semantic context descriptions for path prefixes
128
+ sql.exec(`
129
+ CREATE TABLE IF NOT EXISTS qmd_contexts (
130
+ prefix TEXT NOT NULL,
131
+ namespace TEXT NOT NULL DEFAULT '',
132
+ description TEXT NOT NULL,
133
+ PRIMARY KEY (prefix, namespace)
134
+ )
135
+ `);
136
+
137
+ sql.exec(
138
+ "CREATE INDEX IF NOT EXISTS idx_qmd_contexts_namespace ON qmd_contexts(namespace)",
139
+ );
140
+ }
141
+
142
+ // Record schema version
143
+ sql.exec(
144
+ "INSERT OR REPLACE INTO qmd_meta (key, version) VALUES ('schema', ?)",
145
+ CURRENT_VERSION,
146
+ );
147
+ }
package/src/testing.ts ADDED
@@ -0,0 +1,303 @@
1
+ /**
2
+ * Testing utilities for @stablemodels/qmd-cf.
3
+ *
4
+ * Provides mock implementations of Cloudflare's SqlStorage, Vectorize, and
5
+ * the EmbedFn type so consuming projects can test their Qmd integration
6
+ * without Cloudflare Workers or Vectorize services.
7
+ *
8
+ * MockSqlStorage wraps bun:sqlite's in-memory Database, giving you real FTS5
9
+ * execution while matching the Cloudflare DO SqlStorage interface.
10
+ *
11
+ * MockVectorize is an in-memory vector store with brute-force cosine similarity
12
+ * for testing the full hybrid search pipeline locally.
13
+ *
14
+ * @example
15
+ * ```ts
16
+ * import { Qmd } from "@stablemodels/qmd-cf";
17
+ * import { MockSqlStorage, MockVectorize, createMockEmbedFn } from "@stablemodels/qmd-cf/testing";
18
+ *
19
+ * // FTS-only testing
20
+ * const sql = new MockSqlStorage();
21
+ * const qmd = new Qmd(sql);
22
+ * await qmd.index({ id: "doc1", content: "Hello world" });
23
+ * const results = qmd.searchFts("hello");
24
+ *
25
+ * // Hybrid search testing
26
+ * const vectorize = new MockVectorize();
27
+ * const embedFn = createMockEmbedFn();
28
+ * const qmd = new Qmd(sql, { vectorize, embedFn });
29
+ * ```
30
+ */
31
+ import { Database } from "bun:sqlite";
32
+ import type { EmbedFn } from "./types.js";
33
+
34
+ // ─── BunSqlCursor ───────────────────────────────────────────────────
35
+
36
+ /**
37
+ * Cursor wrapping bun:sqlite results. Structurally compatible with
38
+ * Cloudflare's SqlStorageCursor.
39
+ */
40
+ class BunSqlCursor<T extends Record<string, SqlStorageValue>> {
41
+ private rows: T[];
42
+ private index = 0;
43
+ readonly columnNames: string[];
44
+ readonly rowsRead: number;
45
+ readonly rowsWritten: number;
46
+
47
+ constructor(
48
+ rows: T[],
49
+ columnNames: string[],
50
+ rowsRead: number,
51
+ rowsWritten: number,
52
+ ) {
53
+ this.rows = rows;
54
+ this.columnNames = columnNames;
55
+ this.rowsRead = rowsRead;
56
+ this.rowsWritten = rowsWritten;
57
+ }
58
+
59
+ toArray(): T[] {
60
+ return this.rows;
61
+ }
62
+
63
+ one(): T {
64
+ if (this.rows.length !== 1) {
65
+ throw new Error(`Expected exactly one row, got ${this.rows.length}`);
66
+ }
67
+ return this.rows[0];
68
+ }
69
+
70
+ next(): { done?: false; value: T } | { done: true; value?: never } {
71
+ if (this.index < this.rows.length) {
72
+ return { value: this.rows[this.index++] };
73
+ }
74
+ return { done: true };
75
+ }
76
+
77
+ raw<U extends SqlStorageValue[]>(): IterableIterator<U> {
78
+ const data = this.rows.map((row) => Object.values(row)) as unknown as U[];
79
+ return data[Symbol.iterator]() as IterableIterator<U>;
80
+ }
81
+
82
+ [Symbol.iterator](): IterableIterator<T> {
83
+ return this.rows[Symbol.iterator]() as IterableIterator<T>;
84
+ }
85
+ }
86
+
87
+ // ─── MockSqlStorage ─────────────────────────────────────────────────
88
+
89
+ /**
90
+ * In-memory SqlStorage backed by bun:sqlite.
91
+ *
92
+ * Provides real SQLite with FTS5 support, structurally compatible with
93
+ * Cloudflare Durable Object's `ctx.storage.sql` interface.
94
+ */
95
+ export class MockSqlStorage {
96
+ private db: Database;
97
+
98
+ constructor() {
99
+ this.db = new Database(":memory:");
100
+ this.db.exec("PRAGMA journal_mode=WAL");
101
+ this.db.exec("PRAGMA foreign_keys=ON");
102
+ }
103
+
104
+ exec<T extends Record<string, SqlStorageValue>>(
105
+ query: string,
106
+ ...bindings: any[]
107
+ ): BunSqlCursor<T> {
108
+ const stmt = this.db.prepare(query);
109
+
110
+ const trimmed = query.trimStart().toUpperCase();
111
+ const isSelect =
112
+ trimmed.startsWith("SELECT") ||
113
+ trimmed.startsWith("WITH") ||
114
+ trimmed.startsWith("PRAGMA");
115
+ const isInsertReturning = trimmed.includes("RETURNING");
116
+
117
+ if (isSelect || isInsertReturning) {
118
+ const rows = stmt.all(...bindings) as T[];
119
+ const columnNames = rows.length > 0 ? Object.keys(rows[0] as object) : [];
120
+ return new BunSqlCursor<T>(rows, columnNames, rows.length, 0);
121
+ }
122
+
123
+ const result = stmt.run(...bindings);
124
+ return new BunSqlCursor<T>([], [], 0, result.changes);
125
+ }
126
+
127
+ get databaseSize(): number {
128
+ return 0;
129
+ }
130
+
131
+ get Cursor(): any {
132
+ return BunSqlCursor;
133
+ }
134
+
135
+ get Statement(): any {
136
+ return class {};
137
+ }
138
+
139
+ close(): void {
140
+ this.db.close();
141
+ }
142
+ }
143
+
144
+ // ─── MockVectorize ──────────────────────────────────────────────────
145
+
146
/** Internal record for a vector held by MockVectorize. */
interface StoredVector {
  /** Vector id (unique key in the store). */
  id: string;
  /** Embedding values (copied from the caller's array on store). */
  values: number[];
  /** Optional namespace for scoped queries. */
  namespace?: string;
  /** Optional metadata returned when a query asks for it. */
  metadata?: Record<string, VectorizeVectorMetadata>;
}
152
+
153
+ function cosineSimilarity(a: number[], b: number[]): number {
154
+ let dot = 0;
155
+ let normA = 0;
156
+ let normB = 0;
157
+ for (let i = 0; i < a.length; i++) {
158
+ dot += a[i] * b[i];
159
+ normA += a[i] * a[i];
160
+ normB += b[i] * b[i];
161
+ }
162
+ const denom = Math.sqrt(normA) * Math.sqrt(normB);
163
+ return denom === 0 ? 0 : dot / denom;
164
+ }
165
+
166
+ /**
167
+ * In-memory Vectorize with brute-force cosine similarity.
168
+ *
169
+ * Structurally compatible with Cloudflare's Vectorize abstract class.
170
+ * Supports insert, upsert, query, queryById, getByIds, deleteByIds.
171
+ */
172
+ export class MockVectorize {
173
+ private vectors: Map<string, StoredVector> = new Map();
174
+
175
+ /** Inspect stored vectors for test assertions. */
176
+ get storedVectors(): Map<string, StoredVector> {
177
+ return this.vectors;
178
+ }
179
+
180
+ async describe(): Promise<VectorizeIndexInfo> {
181
+ return {
182
+ vectorCount: this.vectors.size,
183
+ dimensions: 0,
184
+ processedUpToDatetime: 0,
185
+ processedUpToMutation: 0,
186
+ };
187
+ }
188
+
189
+ async insert(vectors: VectorizeVector[]): Promise<VectorizeAsyncMutation> {
190
+ for (const v of vectors) {
191
+ if (this.vectors.has(v.id)) {
192
+ throw new Error(`Vector ${v.id} already exists`);
193
+ }
194
+ this.vectors.set(v.id, {
195
+ id: v.id,
196
+ values: Array.from(v.values),
197
+ namespace: v.namespace,
198
+ metadata: v.metadata,
199
+ });
200
+ }
201
+ return { mutationId: `mock-insert-${Date.now()}` };
202
+ }
203
+
204
+ async upsert(vectors: VectorizeVector[]): Promise<VectorizeAsyncMutation> {
205
+ for (const v of vectors) {
206
+ this.vectors.set(v.id, {
207
+ id: v.id,
208
+ values: Array.from(v.values),
209
+ namespace: v.namespace,
210
+ metadata: v.metadata,
211
+ });
212
+ }
213
+ return { mutationId: `mock-upsert-${Date.now()}` };
214
+ }
215
+
216
+ async query(
217
+ vector: number[] | Float32Array,
218
+ options?: VectorizeQueryOptions,
219
+ ): Promise<VectorizeMatches> {
220
+ const queryVec = Array.from(vector);
221
+ const topK = options?.topK ?? 5;
222
+
223
+ let candidates = Array.from(this.vectors.values());
224
+
225
+ if (options?.namespace) {
226
+ candidates = candidates.filter((v) => v.namespace === options.namespace);
227
+ }
228
+
229
+ const scored = candidates.map((v) => ({
230
+ id: v.id,
231
+ score: cosineSimilarity(queryVec, v.values),
232
+ namespace: v.namespace,
233
+ metadata: options?.returnMetadata === "all" ? v.metadata : undefined,
234
+ values: options?.returnValues ? v.values : undefined,
235
+ }));
236
+
237
+ scored.sort((a, b) => b.score - a.score);
238
+ const matches = scored.slice(0, topK);
239
+
240
+ return { matches, count: matches.length };
241
+ }
242
+
243
+ async queryById(
244
+ vectorId: string,
245
+ options?: VectorizeQueryOptions,
246
+ ): Promise<VectorizeMatches> {
247
+ const vec = this.vectors.get(vectorId);
248
+ if (!vec) return { matches: [], count: 0 };
249
+ return this.query(vec.values, options);
250
+ }
251
+
252
+ async getByIds(ids: string[]): Promise<VectorizeVector[]> {
253
+ return ids
254
+ .map((id) => this.vectors.get(id))
255
+ .filter((v): v is StoredVector => v !== undefined)
256
+ .map((v) => ({
257
+ id: v.id,
258
+ values: v.values,
259
+ namespace: v.namespace,
260
+ metadata: v.metadata,
261
+ }));
262
+ }
263
+
264
+ async deleteByIds(ids: string[]): Promise<VectorizeAsyncMutation> {
265
+ for (const id of ids) {
266
+ this.vectors.delete(id);
267
+ }
268
+ return { mutationId: `mock-delete-${Date.now()}` };
269
+ }
270
+
271
+ /** Reset all stored vectors. */
272
+ clear(): void {
273
+ this.vectors.clear();
274
+ }
275
+ }
276
+
277
+ // ─── Mock EmbedFn ───────────────────────────────────────────────────
278
+
279
+ /**
280
+ * Create a deterministic mock embedding function.
281
+ *
282
+ * Generates consistent vectors based on character frequency distribution.
283
+ * Similar texts produce similar vectors, enabling meaningful cosine similarity
284
+ * in tests without calling a real embedding model.
285
+ *
286
+ * @param dims - Number of embedding dimensions (default: 8)
287
+ */
288
+ export function createMockEmbedFn(dims = 8): EmbedFn {
289
+ return async (texts: string[]): Promise<number[][]> => {
290
+ return texts.map((text) => {
291
+ const lower = text.toLowerCase();
292
+ const vec = new Array(dims).fill(0);
293
+ for (let i = 0; i < lower.length; i++) {
294
+ const code = lower.charCodeAt(i);
295
+ vec[code % dims] += 1;
296
+ }
297
+ const norm = Math.sqrt(
298
+ vec.reduce((s: number, v: number) => s + v * v, 0),
299
+ );
300
+ return norm > 0 ? vec.map((v: number) => v / norm) : vec;
301
+ });
302
+ };
303
+ }
package/src/types.ts ADDED
@@ -0,0 +1,124 @@
1
+ /**
2
+ * Domain types for qmd-cf.
3
+ *
4
+ * Cloudflare platform types (SqlStorage, SqlStorageCursor, Vectorize,
5
+ * VectorizeVector, etc.) are ambient — provided by @cloudflare/workers-types
6
+ * via tsconfig's "types" array. They don't need to be imported or re-exported.
7
+ */
8
+
9
/** A document to be indexed. */
export interface Document {
  /** Unique identifier for this document (e.g. file path). */
  id: string;
  /** The full text content. */
  content: string;
  /** Optional title (boosts search relevance when matched). */
  title?: string;
  /** Optional document type for filtering (e.g. "fact", "daily_note", "summary"). */
  docType?: string;
  /** Optional namespace for scoped search (e.g. entity path, agent ID). */
  namespace?: string;
  /** Arbitrary metadata stored alongside the document. */
  metadata?: Record<string, string | number | boolean | null>;
}

/** A single chunk produced from a document by the chunker. */
export interface Chunk {
  /** Parent document ID. */
  docId: string;
  /** Sequence index within the document (0-based). */
  seq: number;
  /** The chunk text content. */
  text: string;
  /** Character offset of this chunk's start in the original document. */
  charOffset: number;
}
36
+
37
/** A search result returned from BM25 full-text search. */
export interface FtsResult {
  /** ID of the matched document. */
  docId: string;
  /** BM25 score normalized to (0, 1] — higher is better. */
  score: number;
  /** The matching chunk text (snippet). */
  snippet: string;
  /** Chunk sequence number. */
  seq: number;
  /** Document title, if one was indexed. */
  title: string | null;
  /** Document type, if one was indexed. */
  docType: string | null;
  /** Document namespace, if one was indexed. */
  namespace: string | null;
  /** Metadata stored with the document, if any. */
  metadata: Record<string, string | number | boolean | null> | null;
}

/** A search result returned from vector similarity search. */
export interface VectorResult {
  /** ID of the matched document. */
  docId: string;
  /** Cosine similarity score in [0, 1] — higher is better. */
  score: number;
  /** The matching chunk text. */
  snippet: string;
  /** Chunk sequence number. */
  seq: number;
  /** Document title, if one was indexed. */
  title: string | null;
  /** Document type, if one was indexed. */
  docType: string | null;
  /** Document namespace, if one was indexed. */
  namespace: string | null;
  /** Metadata stored with the document, if any. */
  metadata: Record<string, string | number | boolean | null> | null;
}

/** A merged search result after hybrid fusion. */
export interface SearchResult {
  /** ID of the matched document. */
  docId: string;
  /** Final fused score — higher is better. */
  score: number;
  /** The best matching chunk text. */
  snippet: string;
  /** Source of the result: which retrieval methods contributed. */
  sources: Array<"fts" | "vector">;
  /** Individual scores from each source. */
  sourceScores: { fts?: number; vector?: number };
  /** Document title, if one was indexed. */
  title: string | null;
  /** Document type, if one was indexed. */
  docType: string | null;
  /** Document namespace, if one was indexed. */
  namespace: string | null;
  /** Metadata stored with the document, if any. */
  metadata: Record<string, string | number | boolean | null> | null;
}
83
+
84
/** Options for search queries. */
export interface SearchOptions {
  /** Maximum number of results to return. Default: 10. */
  limit?: number;
  /** Filter by document type. */
  docType?: string;
  /** Filter by namespace. */
  namespace?: string;
}

/** Options for hybrid search queries (extends SearchOptions). */
export interface HybridSearchOptions extends SearchOptions {
  /** Weight for FTS results in RRF fusion. Default: 1.0. */
  ftsWeight?: number;
  /** Weight for vector results in RRF fusion. Default: 1.0. */
  vectorWeight?: number;
  /** RRF constant k. Higher values reduce the impact of high rankings. Default: 60. */
  rrfK?: number;
}

/** Configuration for the QMD index. */
export interface QmdConfig {
  /** Maximum characters per chunk. Default: 3200 (~800 tokens). */
  chunkSize?: number;
  /** Overlap characters between chunks. Default: 480 (15% of chunkSize). */
  chunkOverlap?: number;
  /** FTS5 tokenizer configuration. Default: "unicode61". */
  tokenizer?: string;
}
113
+
114
/** Embedding function signature — maps a batch of texts to one vector each. */
export type EmbedFn = (texts: string[]) => Promise<number[][]>;

/** Index statistics. */
export interface IndexStats {
  /** Number of indexed documents. */
  totalDocuments: number;
  /** Number of stored chunks across all documents. */
  totalChunks: number;
  /** Number of stored embedding vectors. */
  totalVectors: number;
  /** Distinct namespaces present in the index. */
  namespaces: string[];
  /** Distinct document types present in the index. */
  docTypes: string[];
}