@stablemodels/qmd-cf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/LICENSE +21 -0
  2. package/dist/chunker.d.ts +11 -0
  3. package/dist/chunker.d.ts.map +1 -0
  4. package/dist/chunker.js +199 -0
  5. package/dist/chunker.js.map +1 -0
  6. package/dist/fts.d.ts +19 -0
  7. package/dist/fts.d.ts.map +1 -0
  8. package/dist/fts.js +109 -0
  9. package/dist/fts.js.map +1 -0
  10. package/dist/hash.d.ts +7 -0
  11. package/dist/hash.d.ts.map +1 -0
  12. package/dist/hash.js +14 -0
  13. package/dist/hash.js.map +1 -0
  14. package/dist/index.d.ts +56 -0
  15. package/dist/index.d.ts.map +1 -0
  16. package/dist/index.js +57 -0
  17. package/dist/index.js.map +1 -0
  18. package/dist/qmd.d.ts +158 -0
  19. package/dist/qmd.d.ts.map +1 -0
  20. package/dist/qmd.js +462 -0
  21. package/dist/qmd.js.map +1 -0
  22. package/dist/rrf.d.ts +22 -0
  23. package/dist/rrf.d.ts.map +1 -0
  24. package/dist/rrf.js +92 -0
  25. package/dist/rrf.js.map +1 -0
  26. package/dist/schema.d.ts +14 -0
  27. package/dist/schema.d.ts.map +1 -0
  28. package/dist/schema.js +128 -0
  29. package/dist/schema.js.map +1 -0
  30. package/dist/testing.d.ts +77 -0
  31. package/dist/testing.d.ts.map +1 -0
  32. package/dist/testing.js +242 -0
  33. package/dist/testing.js.map +1 -0
  34. package/dist/types.d.ts +118 -0
  35. package/dist/types.d.ts.map +1 -0
  36. package/dist/types.js +9 -0
  37. package/dist/types.js.map +1 -0
  38. package/dist/vector.d.ts +38 -0
  39. package/dist/vector.d.ts.map +1 -0
  40. package/dist/vector.js +174 -0
  41. package/dist/vector.js.map +1 -0
  42. package/package.json +49 -0
  43. package/src/bun-sqlite.d.ts +17 -0
  44. package/src/chunker.ts +250 -0
  45. package/src/fts.ts +140 -0
  46. package/src/hash.ts +13 -0
  47. package/src/index.ts +72 -0
  48. package/src/qmd.ts +706 -0
  49. package/src/rrf.ts +115 -0
  50. package/src/schema.ts +147 -0
  51. package/src/testing.ts +303 -0
  52. package/src/types.ts +124 -0
  53. package/src/vector.ts +236 -0
package/src/vector.ts ADDED
@@ -0,0 +1,236 @@
1
+ import type { EmbedFn, SearchOptions, VectorResult } from "./types.js";
2
+
3
+ /**
4
+ * Format text for embedding (document indexing).
5
+ * Follows nomic/qmd convention of prefixing with title context.
6
+ */
7
+ export function formatDocForEmbedding(
8
+ text: string,
9
+ title?: string,
10
+ context?: string,
11
+ ): string {
12
+ const parts: string[] = [];
13
+ if (context) parts.push(`context: ${context}`);
14
+ parts.push(`title: ${title || "none"}`);
15
+ parts.push(`text: ${text}`);
16
+ return parts.join(" | ");
17
+ }
18
+
19
+ /**
20
+ * Format a query string for embedding (search time).
21
+ */
22
+ export function formatQueryForEmbedding(query: string): string {
23
+ return `search_query: ${query}`;
24
+ }
25
+
26
+ /**
27
+ * Index chunks into Vectorize with embeddings.
28
+ *
29
+ * Each chunk gets a vector ID of "{docId}_{seq}" which maps back to qmd_chunks.
30
+ * Vectors are stored in a namespace matching the document's namespace for scoped search.
31
+ */
32
+ export async function indexVectors(
33
+ vectorize: Vectorize,
34
+ embedFn: EmbedFn,
35
+ chunks: Array<{
36
+ docId: string;
37
+ seq: number;
38
+ text: string;
39
+ title?: string;
40
+ namespace?: string;
41
+ docType?: string;
42
+ context?: string;
43
+ }>,
44
+ ): Promise<void> {
45
+ if (chunks.length === 0) return;
46
+
47
+ // Format texts for embedding (includes context if provided)
48
+ const texts = chunks.map((c) =>
49
+ formatDocForEmbedding(c.text, c.title, c.context),
50
+ );
51
+
52
+ // Generate embeddings in batch (Workers AI supports up to 100 at a time)
53
+ const batchSize = 100;
54
+ for (let i = 0; i < texts.length; i += batchSize) {
55
+ const batchTexts = texts.slice(i, i + batchSize);
56
+ const batchChunks = chunks.slice(i, i + batchSize);
57
+
58
+ const embeddings = await embedFn(batchTexts);
59
+
60
+ const vectors = batchChunks.map((c, j) => ({
61
+ id: `${c.docId}_${c.seq}`,
62
+ values: embeddings[j],
63
+ namespace: c.namespace ? c.namespace.split("/")[0] : undefined,
64
+ metadata: {
65
+ docId: c.docId,
66
+ seq: c.seq,
67
+ docType: c.docType ?? "",
68
+ directory: c.namespace ?? "",
69
+ },
70
+ }));
71
+
72
+ await vectorize.upsert(vectors);
73
+ }
74
+ }
75
+
76
+ /**
77
+ * Remove all vectors for a document from Vectorize.
78
+ */
79
+ export async function removeVectors(
80
+ vectorize: Vectorize,
81
+ sql: SqlStorage,
82
+ docId: string,
83
+ ): Promise<void> {
84
+ // Look up all chunk seq numbers for this document
85
+ const chunks = sql
86
+ .exec<{ seq: number }>("SELECT seq FROM qmd_chunks WHERE doc_id = ?", docId)
87
+ .toArray();
88
+
89
+ if (chunks.length === 0) return;
90
+
91
+ const ids = chunks.map((c) => `${docId}_${c.seq}`);
92
+ await vectorize.deleteByIds(ids);
93
+ }
94
+
95
/**
 * Row shape returned by the chunk-content lookup query in searchVector:
 * chunk columns from qmd_chunks joined with document-level columns
 * from qmd_documents.
 */
type ChunkRow = {
  doc_id: string;           // chunk's parent document id (qmd_documents.id)
  seq: number;              // chunk sequence number within the document
  content: string;          // chunk text, used as the result snippet
  title: string | null;     // document title
  doc_type: string | null;  // document type tag
  namespace: string | null; // document directory/namespace path
  metadata: string | null;  // JSON-encoded document metadata, if any
};
104
+
105
+ /**
106
+ * Execute a vector similarity search via Vectorize.
107
+ *
108
+ * 1. Embed the query
109
+ * 2. Query Vectorize for nearest neighbors (scoped by namespace if provided)
110
+ * 3. Look up chunk content from the local SQLite for snippet extraction
111
+ */
112
+ export async function searchVector(
113
+ vectorize: Vectorize,
114
+ embedFn: EmbedFn,
115
+ sql: SqlStorage,
116
+ query: string,
117
+ options: SearchOptions = {},
118
+ ): Promise<VectorResult[]> {
119
+ const limit = options.limit ?? 10;
120
+
121
+ // Embed the query
122
+ const queryText = formatQueryForEmbedding(query);
123
+ const [queryVector] = await embedFn([queryText]);
124
+
125
+ // Resolve namespace for Vectorize query: use first path segment for glob/path patterns
126
+ let vectorizeNamespace: string | undefined;
127
+ let directoryPrefix: string | undefined;
128
+ if (options.namespace) {
129
+ if (options.namespace.includes("*")) {
130
+ // Glob pattern: people/* → Vectorize ns "people", no post-filter needed for top-level
131
+ const prefix = options.namespace.replace(/\*+$/, "").replace(/\/+$/, "");
132
+ vectorizeNamespace = prefix.split("/")[0];
133
+ // Only need post-filter if glob is deeper than top-level (e.g. projects/ember/*)
134
+ if (prefix.includes("/")) {
135
+ directoryPrefix = `${prefix}/`;
136
+ }
137
+ } else {
138
+ // Exact directory: people/ryan → Vectorize ns "people", post-filter by full path
139
+ vectorizeNamespace = options.namespace.split("/")[0];
140
+ if (options.namespace.includes("/")) {
141
+ directoryPrefix = options.namespace;
142
+ }
143
+ }
144
+ }
145
+
146
+ // Query Vectorize
147
+ const matches = await vectorize.query(queryVector, {
148
+ topK: limit * 3, // Fetch extra for dedup
149
+ returnMetadata: "all",
150
+ namespace: vectorizeNamespace,
151
+ });
152
+
153
+ if (matches.matches.length === 0) return [];
154
+
155
+ // Collect chunk IDs to look up content from local SQLite
156
+ const chunkKeys = matches.matches.map((m) => {
157
+ const meta = m.metadata as
158
+ | { docId: string; seq: number; docType?: string }
159
+ | undefined;
160
+ return {
161
+ vectorId: m.id,
162
+ score: m.score,
163
+ docId: meta?.docId ?? m.id.split("_").slice(0, -1).join("_"),
164
+ seq: meta?.seq ?? Number.parseInt(m.id.split("_").pop() ?? "0", 10),
165
+ };
166
+ });
167
+
168
+ // Filter by docType if specified (Vectorize metadata filtering could also do this,
169
+ // but we filter here for portability)
170
+ let filteredKeys = options.docType
171
+ ? chunkKeys.filter((k) => {
172
+ const meta = matches.matches.find((m) => m.id === k.vectorId)?.metadata;
173
+ return meta?.docType === options.docType;
174
+ })
175
+ : chunkKeys;
176
+
177
+ // Post-filter by directory prefix when namespace is deeper than first segment
178
+ if (directoryPrefix) {
179
+ filteredKeys = filteredKeys.filter((k) => {
180
+ const meta = matches.matches.find((m) => m.id === k.vectorId)?.metadata;
181
+ const dir = meta?.directory as string | null;
182
+ if (!dir) return false;
183
+ return dir === directoryPrefix || dir.startsWith(`${directoryPrefix}/`);
184
+ });
185
+ }
186
+
187
+ if (filteredKeys.length === 0) return [];
188
+
189
+ // Batch look up chunk content from SQLite
190
+ const placeholders = filteredKeys.map(() => "(?, ?)").join(", ");
191
+ const bindings = filteredKeys.flatMap((k) => [k.docId, k.seq]);
192
+
193
+ const rows = sql
194
+ .exec<ChunkRow>(
195
+ `
196
+ SELECT c.doc_id, c.seq, c.content, d.title, d.doc_type, d.namespace, d.metadata
197
+ FROM qmd_chunks c
198
+ JOIN qmd_documents d ON d.id = c.doc_id
199
+ WHERE (c.doc_id, c.seq) IN (VALUES ${placeholders})
200
+ `,
201
+ ...bindings,
202
+ )
203
+ .toArray();
204
+
205
+ // Build lookup map
206
+ const chunkMap = new Map<string, ChunkRow>();
207
+ for (const row of rows) {
208
+ chunkMap.set(`${row.doc_id}_${row.seq}`, row);
209
+ }
210
+
211
+ // Merge scores with content, dedup by docId
212
+ const seen = new Map<string, VectorResult>();
213
+
214
+ for (const key of filteredKeys) {
215
+ const row = chunkMap.get(`${key.docId}_${key.seq}`);
216
+ if (!row) continue;
217
+
218
+ const existing = seen.get(key.docId);
219
+ if (!existing || key.score > existing.score) {
220
+ seen.set(key.docId, {
221
+ docId: key.docId,
222
+ score: key.score,
223
+ snippet: row.content,
224
+ seq: key.seq,
225
+ title: row.title,
226
+ docType: row.doc_type,
227
+ namespace: row.namespace,
228
+ metadata: row.metadata ? JSON.parse(row.metadata as string) : null,
229
+ });
230
+ }
231
+ }
232
+
233
+ return Array.from(seen.values())
234
+ .sort((a, b) => b.score - a.score)
235
+ .slice(0, limit);
236
+ }