@kyleparrott/where-was-i 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,220 @@
1
+ import fs from "node:fs";
2
+ import { SessionSearchDb } from "./database.js";
3
+ import { EMBEDDING_PROVIDER_UNCONFIGURED_MESSAGE, EmbeddingProviderUnavailableError, OpenAICompatibleEmbeddingProvider, isEmbeddingProviderConfigured } from "./embeddings.js";
4
+ import { clearEmbeddings, countChunksMissingEmbeddings, embeddingCoverage, ensureVectorStore, listChunksMissingEmbeddings, upsertChunkEmbedding } from "./vector.js";
5
/**
 * Embeds chunks that have no stored vector for the configured provider and
 * writes the resulting vectors into the session database.
 *
 * @param {object} options
 * @param {*} options.dbPath - Handed to `SessionSearchDb`.
 * @param {*} options.embedding - Handed to `OpenAICompatibleEmbeddingProvider`.
 * @param {number} [options.batchSize=32] - Chunks per embedding request, clamped to 1..128.
 * @param {number} [options.maxChunks] - Cap on chunks embedded in this run (minimum 1, default unbounded).
 * @param {Array} [options.sessionIds] - Forwarded as the session filter; defaults to `[]`.
 * @param {(event: object) => void} [options.onProgress] - Receives events whose `phase` is
 *   "probing", "start", "batch", "progress", "complete" or "unavailable".
 * @returns {Promise<object>} `{ available, providerId, model, dimensions, indexed, skipped, failed }`.
 *   When the provider raises `EmbeddingProviderUnavailableError`, `available` is `false`
 *   and `failed` carries the error message.
 * @throws {Error} Any other failure, including a batch for which not a single
 *   embedding could be stored.
 */
export async function indexSemanticChunks(options) {
    const provider = new OpenAICompatibleEmbeddingProvider(options.embedding);
    const batchSize = Math.min(Math.max(options.batchSize ?? 32, 1), 128);
    const maxChunks = Math.max(options.maxChunks ?? Number.MAX_SAFE_INTEGER, 1);
    const sessionIds = options.sessionIds ?? [];
    const notify = options.onProgress;
    const db = new SessionSearchDb(options.dbPath);
    let indexed = 0;
    // Reads `indexed` at call time so every event reports the current count.
    const report = (phase, dimensions, total, extra = {}) => {
        notify?.({
            phase,
            providerId: provider.config.id,
            model: provider.config.model,
            dimensions,
            indexed,
            total,
            ...extra
        });
    };
    try {
        report("probing", null, null);
        // A provider outage here is handled by the catch below, with indexed still 0.
        const dimensions = await provider.probeDimensions();
        ensureVectorStore(db.db, provider.config, dimensions);
        const totalMissing = countChunksMissingEmbeddings(db.db, provider.config, dimensions, sessionIds);
        const total = Math.min(totalMissing, maxChunks);
        report("start", dimensions, total);
        while (indexed < total) {
            const chunks = listChunksMissingEmbeddings(db.db, provider.config, dimensions, Math.min(batchSize, total - indexed), sessionIds);
            if (chunks.length === 0) {
                // Nothing left to embed even though the initial count said otherwise.
                break;
            }
            report("batch", dimensions, total, { batchSize: chunks.length });
            const batch = await provider.embedDocuments(chunks.map((chunk) => chunk.text));
            let stored = 0;
            chunks.forEach((chunk, index) => {
                const embedding = batch.embeddings[index];
                if (chunk && embedding) {
                    upsertChunkEmbedding(db.db, provider.config, dimensions, chunk, embedding);
                    stored += 1;
                }
            });
            if (stored === 0) {
                // The same chunks would be listed again on the next pass, so the
                // loop could never finish; fail loudly instead of spinning.
                throw new Error(`Embedding provider returned no usable embeddings for a batch of ${chunks.length} chunk(s); aborting semantic indexing.`);
            }
            indexed += stored;
            report("progress", dimensions, total, { batchSize: chunks.length });
        }
        const summary = {
            available: true,
            providerId: provider.config.id,
            model: provider.config.model,
            dimensions,
            indexed,
            skipped: 0,
            failed: null
        };
        report("complete", dimensions, total);
        return summary;
    }
    catch (error) {
        if (error instanceof EmbeddingProviderUnavailableError) {
            const summary = unavailableSummary(provider, error, indexed);
            report("unavailable", null, null, { failed: summary.failed });
            return summary;
        }
        throw error;
    }
    finally {
        db.close();
    }
}
142
/**
 * Opens the session database at `dbPath`, clears its stored embeddings via
 * `clearEmbeddings`, and closes the database whether or not clearing succeeded.
 *
 * @param {*} dbPath - Handed to `SessionSearchDb`.
 * @returns {*} Whatever `clearEmbeddings` returns.
 */
export function clearSemanticEmbeddings(dbPath) {
    const session = new SessionSearchDb(dbPath);
    let outcome;
    try {
        outcome = clearEmbeddings(session.db);
    }
    finally {
        session.close();
    }
    return outcome;
}
151
/**
 * Reports embedding coverage for the configured provider, optionally probing
 * the provider first.
 *
 * @param {object} options
 * @param {*} options.embedding - Handed to `OpenAICompatibleEmbeddingProvider`.
 * @param {*} options.dbPath - Database location; when the path does not exist an
 *   empty coverage record is returned without opening a database.
 * @param {number|null} [options.dimensions] - Known vector dimensions, if any.
 * @param {boolean} [options.probeProvider] - When truthy (and the provider is
 *   configured) the provider is probed; a probe failure is captured in
 *   `providerError` rather than thrown.
 * @returns {Promise<object>} The coverage record plus `providerAvailable`
 *   (`true`, `false`, or `null` when not probed) and `providerError`.
 */
export async function semanticFreshness(options) {
    const provider = new OpenAICompatibleEmbeddingProvider(options.embedding);
    let dimensions = options.dimensions ?? null;
    let providerAvailable = null;
    let providerError = null;
    const configured = isEmbeddingProviderConfigured(provider.config);
    if (configured) {
        if (options.probeProvider) {
            try {
                dimensions = await provider.probeDimensions();
                providerAvailable = true;
            }
            catch (error) {
                providerAvailable = false;
                providerError = error instanceof Error ? error.message : String(error);
            }
        }
    }
    else {
        providerAvailable = false;
        providerError = EMBEDDING_PROVIDER_UNCONFIGURED_MESSAGE;
    }
    // Dimensions are only meaningful for a configured provider.
    const effectiveDimensions = configured ? dimensions : null;
    const decorate = (coverage, stale) => ({
        ...coverage,
        staleForConfiguredProvider: stale,
        recommendation: configured ? coverage.recommendation : "Configure semantic embeddings before running semantic search.",
        providerAvailable,
        providerError
    });
    if (!fs.existsSync(options.dbPath)) {
        return decorate(emptyEmbeddingCoverage(provider.config, effectiveDimensions), false);
    }
    const session = new SessionSearchDb(options.dbPath);
    try {
        const coverage = embeddingCoverage(session.db, provider.config, effectiveDimensions);
        return decorate(coverage, configured ? coverage.staleForConfiguredProvider : false);
    }
    finally {
        session.close();
    }
}
196
/**
 * Builds the coverage record used when there is no database to inspect:
 * all counters are zero and nothing is considered stale.
 *
 * @param {{ id: *, model: * }} provider - Provider configuration.
 * @param {number|null} dimensions - Known dimensions; when falsy,
 *   `missingChunks` is reported as `null` instead of `0`.
 * @returns {object} Coverage record.
 */
function emptyEmbeddingCoverage(provider, dimensions) {
    const { id: providerId, model } = provider;
    const missingChunks = dimensions ? 0 : null;
    return {
        providerId,
        model,
        dimensions,
        totalEligibleChunks: 0,
        indexedChunks: 0,
        missingChunks,
        lastIndexedAt: null,
        incompatibleStoredVectors: 0,
        staleForConfiguredProvider: false,
        recommendation: null
    };
}
210
/**
 * Builds the indexing summary returned when the embedding provider is unavailable.
 *
 * @param {{ config: { id: *, model: * } }} provider - Provider whose config identifies the run.
 * @param {{ message: string }} error - Failure whose message is reported as `failed`.
 * @param {number} [indexed=0] - Chunks already indexed before the failure.
 * @returns {object} Summary with `available: false` and `dimensions: null`.
 */
function unavailableSummary(provider, error, indexed = 0) {
    const { id: providerId, model } = provider.config;
    const { message: failed } = error;
    return {
        available: false,
        providerId,
        model,
        dimensions: null,
        indexed,
        skipped: 0,
        failed
    };
}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,311 @@
1
+ import crypto from "node:crypto";
2
/**
 * Validates `dimensions`, makes sure the vector table exists with that width,
 * rejects stores holding vectors for other embedding settings, and upserts
 * the provider row in `embedding_providers`.
 *
 * @param {object} db - Database handle exposing `prepare`.
 * @param {{ id: *, baseUrl: *, model: * }} provider - Provider configuration.
 * @param {number} dimensions - Vector width; must be a positive integer.
 * @throws {Error} When `dimensions` is not a positive integer, or when one of
 *   the checks it delegates to fails.
 */
export function ensureVectorStore(db, provider, dimensions) {
    const isPositiveInteger = Number.isInteger(dimensions) && dimensions > 0;
    if (!isPositiveInteger) {
        throw new Error(`Vector dimensions must be a positive integer. Received ${dimensions}.`);
    }
    ensureVectorTable(db, dimensions);
    assertNoIncompatibleStoredVectors(db, provider, dimensions);
    const upsertProvider = db.prepare(`INSERT INTO embedding_providers (id, base_url, model, dimensions, updated_at)
VALUES (@id, @baseUrl, @model, @dimensions, @updatedAt)
ON CONFLICT(id) DO UPDATE SET
base_url = excluded.base_url,
model = excluded.model,
dimensions = excluded.dimensions,
updated_at = excluded.updated_at`);
    const { id, baseUrl, model } = provider;
    const updatedAt = new Date().toISOString();
    upsertProvider.run({ id, baseUrl, model, dimensions, updatedAt });
}
22
/**
 * Replaces the stored embedding for `chunk` with `embedding`.
 *
 * The removal of any previous embedding, the metadata insert and the vector
 * insert run inside one transaction, so a failure part-way through cannot
 * leave a metadata row without its vector (or drop the old embedding without
 * writing the new one).
 *
 * @param {object} db - Database handle exposing `prepare` and `transaction`.
 * @param {{ id: *, model: * }} provider - Provider configuration recorded with the vector.
 * @param {number} dimensions - Expected length of `embedding`.
 * @param {{ id: *, messageId: *, sessionId: *, text: * }} chunk - Chunk being embedded.
 * @param {ArrayLike<number>} embedding - Vector to store.
 * @throws {Error} When `embedding.length` differs from `dimensions`, or when
 *   the store checks or any write fails.
 */
export function upsertChunkEmbedding(db, provider, dimensions, chunk, embedding) {
    if (embedding.length !== dimensions) {
        throw new Error(`Embedding dimension mismatch for chunk ${chunk.id}: expected ${dimensions}, got ${embedding.length}.`);
    }
    ensureVectorStore(db, provider, dimensions);
    const replaceEmbedding = db.transaction(() => {
        deleteChunkEmbedding(db, chunk.id);
        const indexedAt = new Date().toISOString();
        const result = db
            .prepare(`INSERT INTO vector_chunks_metadata
(chunk_id, message_id, session_id, provider_id, model, dimensions, indexed_at, source_fingerprint)
VALUES
(@chunkId, @messageId, @sessionId, @providerId, @model, @dimensions, @indexedAt, @sourceFingerprint)`)
            .run({
                chunkId: chunk.id,
                messageId: chunk.messageId,
                sessionId: chunk.sessionId,
                providerId: provider.id,
                model: provider.model,
                dimensions,
                indexedAt,
                sourceFingerprint: sourceFingerprint(chunk)
            });
        // The vector row reuses the metadata row's rowid so the two tables can be joined.
        const vectorRowid = BigInt(result.lastInsertRowid);
        db.prepare("INSERT INTO vector_chunks(rowid, embedding) VALUES (?, ?)").run(vectorRowid, vectorToBuffer(embedding));
    });
    replaceEmbedding();
}
47
/**
 * Removes the stored vector and its metadata row for `chunkId`, if any.
 * Does nothing when the `vector_chunks` table is absent or no metadata row
 * references the chunk.
 *
 * @param {object} db - Database handle exposing `prepare`.
 * @param {*} chunkId - Identifier matched against `vector_chunks_metadata.chunk_id`.
 */
export function deleteChunkEmbedding(db, chunkId) {
    const metadata = tableExists(db, "vector_chunks")
        ? db
            .prepare("SELECT vector_rowid AS vectorRowid FROM vector_chunks_metadata WHERE chunk_id = ?")
            .get(chunkId)
        : undefined;
    if (!metadata) {
        return;
    }
    const { vectorRowid } = metadata;
    const removeVector = db.prepare("DELETE FROM vector_chunks WHERE rowid = ?");
    removeVector.run(BigInt(vectorRowid));
    const removeMetadata = db.prepare("DELETE FROM vector_chunks_metadata WHERE vector_rowid = ?");
    removeMetadata.run(vectorRowid);
}
60
/**
 * Runs a nearest-neighbour query against `vector_chunks` for `embedding`,
 * joined with the metadata rows that belong to `provider`.
 *
 * @param {object} db - Database handle exposing `prepare`.
 * @param {{ id: *, model: * }} provider - Provider whose vectors are searched.
 * @param {ArrayLike<number>} embedding - Query vector; its length is used as the
 *   expected dimension count.
 * @param {number} limit - Maximum rows; values below 1 are raised to 1.
 * @returns {object[]} Matching rows ordered by distance, or `[]` when the
 *   `vector_chunks` table does not exist.
 * @throws {Error} When the dimension or compatibility checks it delegates to fail.
 */
export function searchVectorChunks(db, provider, embedding, limit) {
    if (!tableExists(db, "vector_chunks")) {
        return [];
    }
    const dimensions = embedding.length;
    assertVectorTableDimensions(db, dimensions);
    assertNoIncompatibleStoredVectors(db, provider, dimensions);
    const nearest = db.prepare(`SELECT
vector_chunks.rowid AS vectorRowid,
vector_chunks.distance AS distance,
m.chunk_id AS chunkId,
m.message_id AS messageId,
m.session_id AS sessionId,
m.provider_id AS providerId,
m.model AS model,
m.dimensions AS dimensions,
m.indexed_at AS indexedAt,
m.source_fingerprint AS sourceFingerprint
FROM vector_chunks
JOIN vector_chunks_metadata m ON m.vector_rowid = vector_chunks.rowid
WHERE embedding MATCH ? AND k = ?
AND m.provider_id = ?
AND m.model = ?
AND m.dimensions = ?
ORDER BY distance
LIMIT ?`);
    // The same bound is bound twice: once as `k`, once as the final LIMIT.
    const bound = Math.max(limit, 1);
    return nearest.all(vectorToBuffer(embedding), bound, provider.id, provider.model, dimensions, bound);
}
89
/**
 * Lists user/assistant chunks that have no metadata row for the given
 * provider, model and dimensions, ordered by `ordinal`.
 *
 * @param {object} db - Database handle exposing `prepare`.
 * @param {{ id: *, model: * }} provider - Provider configuration.
 * @param {number} dimensions - Vector width the embeddings must have.
 * @param {number} [limit=500] - Maximum rows; values below 1 are raised to 1.
 * @param {Array} [sessionIds=[]] - When non-empty, restricts results to these sessions.
 * @returns {object[]} Chunk rows with `metadataJson` parsed into `metadata`.
 */
export function listChunksMissingEmbeddings(db, provider, dimensions, limit = 500, sessionIds = []) {
    let sessionFilter = "";
    if (sessionIds.length > 0) {
        const placeholders = Array.from(sessionIds, () => "?").join(", ");
        sessionFilter = `AND c.session_id IN (${placeholders})`;
    }
    const statement = db.prepare(`SELECT
c.id,
c.message_id AS messageId,
c.session_id AS sessionId,
c.conversation_id AS conversationId,
c.turn_id AS turnId,
c.source,
c.source_path AS sourcePath,
c.ordinal,
c.chunk_index AS chunkIndex,
c.role,
c.kind,
c.timestamp,
c.text,
c.line_start AS lineStart,
c.line_end AS lineEnd,
c.metadata_json AS metadataJson
FROM chunks c
LEFT JOIN vector_chunks_metadata v
ON v.chunk_id = c.id
AND v.provider_id = ?
AND v.model = ?
AND v.dimensions = ?
WHERE c.role IN ('user', 'assistant')
AND c.message_id IS NOT NULL
AND c.conversation_id IS NOT NULL
AND c.turn_id IS NOT NULL
${sessionFilter}
AND v.chunk_id IS NULL
ORDER BY c.ordinal ASC
LIMIT ?`);
    const bindings = [provider.id, provider.model, dimensions, ...sessionIds, Math.max(limit, 1)];
    return statement
        .all(...bindings)
        .map(({ metadataJson, ...fields }) => ({ ...fields, metadata: JSON.parse(metadataJson) }));
}
132
/**
 * Counts user/assistant chunks that have no metadata row for the given
 * provider, model and dimensions.
 *
 * @param {object} db - Database handle exposing `prepare`.
 * @param {{ id: *, model: * }} provider - Provider configuration.
 * @param {number} dimensions - Vector width the embeddings must have.
 * @param {Array} [sessionIds=[]] - When non-empty, restricts the count to these sessions.
 * @returns {number} The `count` column of the query result.
 */
export function countChunksMissingEmbeddings(db, provider, dimensions, sessionIds = []) {
    let sessionFilter = "";
    if (sessionIds.length > 0) {
        const placeholders = Array.from(sessionIds, () => "?").join(", ");
        sessionFilter = `AND c.session_id IN (${placeholders})`;
    }
    const statement = db.prepare(`SELECT COUNT(*) AS count
FROM chunks c
LEFT JOIN vector_chunks_metadata v
ON v.chunk_id = c.id
AND v.provider_id = ?
AND v.model = ?
AND v.dimensions = ?
WHERE c.role IN ('user', 'assistant')
AND c.message_id IS NOT NULL
AND c.conversation_id IS NOT NULL
AND c.turn_id IS NOT NULL
${sessionFilter}
AND v.chunk_id IS NULL`);
    const result = statement.get(provider.id, provider.model, dimensions, ...sessionIds);
    return result.count;
}
151
/**
 * Packs a numeric vector into a Buffer of 32-bit floats (4 bytes per element).
 *
 * @param {Iterable<number>|ArrayLike<number>} vector - Values to pack.
 * @returns {Buffer} Buffer viewing the freshly created Float32Array's memory.
 */
export function vectorToBuffer(vector) {
    const floats = Float32Array.from(vector);
    return Buffer.from(floats.buffer);
}
154
/**
 * Drops the `vector_chunks` table and empties `vector_chunks_metadata` and
 * `embedding_providers`, skipping any table that does not exist.
 *
 * Row counts are read before the destructive transaction runs.
 *
 * @param {object} db - Database handle exposing `prepare`, `exec` and `transaction`.
 * @returns {{ vectorsDeleted: number, providersDeleted: number }} Row counts
 *   observed before clearing.
 */
export function clearEmbeddings(db) {
    const rowCount = (table) => {
        if (!tableExists(db, table)) {
            return 0;
        }
        return db.prepare(`SELECT COUNT(*) AS count FROM ${table}`).get().count ?? 0;
    };
    const vectorsDeleted = rowCount("vector_chunks_metadata");
    const providersDeleted = rowCount("embedding_providers");
    const wipe = db.transaction(() => {
        if (tableExists(db, "vector_chunks")) {
            db.exec("DROP TABLE vector_chunks");
        }
        for (const table of ["vector_chunks_metadata", "embedding_providers"]) {
            if (tableExists(db, table)) {
                db.prepare(`DELETE FROM ${table}`).run();
            }
        }
    });
    wipe();
    return { vectorsDeleted, providersDeleted };
}
175
/**
 * Summarises how much of the eligible chunk set has embeddings for `provider`.
 *
 * @param {object} db - Database handle exposing `prepare`.
 * @param {{ id: *, model: * }} provider - Provider configuration.
 * @param {number|null|undefined} dimensions - Known vector width; when nullish
 *   the width recorded for the provider in the database is used instead.
 * @returns {object} Coverage record. When no width is known, `dimensions` and
 *   `missingChunks` are `null` and `indexedChunks` is `0`.
 */
export function embeddingCoverage(db, provider, dimensions) {
    const storedDimensions = dimensions ?? storedProviderDimensions(db, provider);
    const totalEligibleChunks = countEligibleChunks(db);
    const incompatibleStoredVectors = countIncompatibleStoredVectors(db, provider, storedDimensions);
    const lastIndexedAt = latestEmbeddingIndexedAt(db, provider, storedDimensions);
    const hasIncompatible = incompatibleStoredVectors > 0;
    // Fields shared by both result shapes.
    const common = {
        providerId: provider.id,
        model: provider.model,
        totalEligibleChunks,
        lastIndexedAt,
        incompatibleStoredVectors
    };
    if (!storedDimensions) {
        const hasEligible = totalEligibleChunks > 0;
        let recommendation = null;
        if (hasIncompatible) {
            recommendation = "Run `wwi embeddings clear` before switching embedding models or dimensions.";
        }
        else if (hasEligible) {
            recommendation = "Run `wwi index --semantic` to build embeddings for the configured provider.";
        }
        return {
            ...common,
            dimensions: null,
            indexedChunks: 0,
            missingChunks: null,
            staleForConfiguredProvider: hasEligible || hasIncompatible,
            recommendation
        };
    }
    let indexedChunks = 0;
    if (tableExists(db, "vector_chunks_metadata")) {
        const counted = db
            .prepare(`SELECT COUNT(*) AS count
FROM vector_chunks_metadata
WHERE provider_id = ? AND model = ? AND dimensions = ?`)
            .get(provider.id, provider.model, storedDimensions);
        indexedChunks = counted.count;
    }
    const missingChunks = Math.max(totalEligibleChunks - indexedChunks, 0);
    const hasMissing = missingChunks > 0;
    let recommendation = null;
    if (hasIncompatible) {
        recommendation = "Run `wwi embeddings clear` before switching embedding models or dimensions.";
    }
    else if (hasMissing) {
        recommendation = "Run `wwi index --semantic` to refresh embeddings for the configured provider.";
    }
    return {
        ...common,
        dimensions: storedDimensions,
        indexedChunks,
        missingChunks,
        staleForConfiguredProvider: hasMissing || hasIncompatible,
        recommendation
    };
}
223
/**
 * Returns the most recent `indexed_at` value stored for the provider and model.
 *
 * @param {object} db - Database handle exposing `prepare`.
 * @param {{ id: *, model: * }} provider - Provider configuration.
 * @param {number|null} dimensions - When truthy, only rows with this width are considered.
 * @returns {*} The `MAX(indexed_at)` value, or `null` when the metadata table is absent.
 */
function latestEmbeddingIndexedAt(db, provider, dimensions) {
    if (!tableExists(db, "vector_chunks_metadata")) {
        return null;
    }
    const { id, model } = provider;
    if (dimensions) {
        const scoped = db
            .prepare(`SELECT MAX(indexed_at) AS lastIndexedAt
FROM vector_chunks_metadata
WHERE provider_id = ? AND model = ? AND dimensions = ?`)
            .get(id, model, dimensions);
        return scoped.lastIndexedAt;
    }
    const unscoped = db
        .prepare(`SELECT MAX(indexed_at) AS lastIndexedAt
FROM vector_chunks_metadata
WHERE provider_id = ? AND model = ?`)
        .get(id, model);
    return unscoped.lastIndexedAt;
}
240
/**
 * Counts stored vectors that do not belong to the given provider, model and
 * dimensions. When `dimensions` is falsy every stored vector is counted.
 *
 * @param {object} db - Database handle exposing `prepare`.
 * @param {{ id: *, model: * }} provider - Provider configuration.
 * @param {number|null} dimensions - Expected vector width, if known.
 * @returns {number} Row count; `0` when the metadata table is absent.
 */
function countIncompatibleStoredVectors(db, provider, dimensions) {
    if (!tableExists(db, "vector_chunks_metadata")) {
        return 0;
    }
    if (dimensions) {
        const mismatched = db
            .prepare(`SELECT COUNT(*) AS count
FROM vector_chunks_metadata
WHERE provider_id != ? OR model != ? OR dimensions != ?`)
            .get(provider.id, provider.model, dimensions);
        return mismatched.count;
    }
    const everything = db.prepare("SELECT COUNT(*) AS count FROM vector_chunks_metadata").get();
    return everything.count;
}
253
/**
 * Throws when the store holds vectors for different embedding settings.
 *
 * @param {object} db - Database handle.
 * @param {object} provider - Provider configuration.
 * @param {number|null} dimensions - Expected vector width.
 * @throws {Error} When at least one incompatible vector is stored.
 */
function assertNoIncompatibleStoredVectors(db, provider, dimensions) {
    const offending = countIncompatibleStoredVectors(db, provider, dimensions);
    if (!(offending > 0)) {
        return;
    }
    throw new Error(`Found ${offending} stored vector(s) for a different embedding model or dimension count. Run \`wwi embeddings clear\` and rebuild semantic embeddings before switching embedding settings.`);
}
259
/**
 * Creates the `vector_chunks` virtual table for the given width, or verifies
 * that an existing table's definition mentions that width.
 *
 * @param {object} db - Database handle exposing `prepare` and `exec`.
 * @param {number} dimensions - Vector width.
 * @throws {Error} When a table exists whose SQL does not contain `float[<dimensions>]`.
 */
function ensureVectorTable(db, dimensions) {
    const columnType = `float[${dimensions}]`;
    const definition = db
        .prepare("SELECT sql FROM sqlite_master WHERE name = 'vector_chunks'")
        .get();
    if (!definition) {
        db.exec(`CREATE VIRTUAL TABLE vector_chunks USING vec0(embedding ${columnType})`);
        return;
    }
    if (!definition.sql.includes(columnType)) {
        throw new Error(`Existing vector_chunks table does not match ${columnType}. Run \`wwi embeddings clear\` and rebuild semantic embeddings. Table SQL: ${definition.sql}`);
    }
}
272
/**
 * Throws when the existing vector table was created with a different width
 * than `dimensions`. Passes when no width can be determined.
 *
 * @param {object} db - Database handle.
 * @param {number} dimensions - Configured embedding width.
 * @throws {Error} On a width mismatch.
 */
function assertVectorTableDimensions(db, dimensions) {
    const tableWidth = vectorTableDimensions(db);
    if (tableWidth === null || tableWidth === dimensions) {
        return;
    }
    throw new Error(`Existing vector store dimensions (${tableWidth}) do not match configured embedding dimensions (${dimensions}). Run \`wwi embeddings clear\` and rebuild semantic embeddings.`);
}
278
/**
 * Reads the vector width out of the stored `vector_chunks` table definition.
 *
 * @param {object} db - Database handle exposing `prepare`.
 * @returns {number|null} The number inside the first `float[N]` in the table
 *   SQL, or `null` when the table is absent or has no such declaration.
 */
function vectorTableDimensions(db) {
    const definition = db
        .prepare("SELECT sql FROM sqlite_master WHERE name = 'vector_chunks'")
        .get();
    if (definition == null) {
        return null;
    }
    const found = definition.sql.match(/float\[(\d+)\]/);
    if (!found) {
        return null;
    }
    return Number(found[1]);
}
285
/**
 * Looks up the vector width recorded for the provider and model.
 *
 * @param {object} db - Database handle exposing `prepare`.
 * @param {{ id: *, model: * }} provider - Provider configuration.
 * @returns {number|null} Recorded width, or `null` when the table, the row or
 *   the value is missing.
 */
function storedProviderDimensions(db, provider) {
    const record = tableExists(db, "embedding_providers")
        ? db
            .prepare("SELECT dimensions FROM embedding_providers WHERE id = ? AND model = ?")
            .get(provider.id, provider.model)
        : undefined;
    return record?.dimensions ?? null;
}
294
/**
 * Counts chunks that qualify for embedding: user or assistant chunks with a
 * message, conversation and turn identifier.
 *
 * @param {object} db - Database handle exposing `prepare`.
 * @returns {number} The `count` column of the query result.
 */
function countEligibleChunks(db) {
    const statement = db.prepare(`SELECT COUNT(*) AS count
FROM chunks
WHERE role IN ('user', 'assistant')
AND message_id IS NOT NULL
AND conversation_id IS NOT NULL
AND turn_id IS NOT NULL`);
    const result = statement.get();
    return result.count;
}
305
/**
 * Builds a fingerprint of the form `<chunk id>:<SHA-1 hex digest of the chunk text>`.
 *
 * @param {{ id: *, text: string }} chunk - Chunk to fingerprint.
 * @returns {string} The fingerprint string.
 */
function sourceFingerprint(chunk) {
    const hasher = crypto.createHash("sha1");
    hasher.update(chunk.text);
    const digest = hasher.digest("hex");
    return `${chunk.id}:${digest}`;
}
308
/**
 * Reports whether `sqlite_master` has an entry named `table`.
 *
 * @param {object} db - Database handle exposing `prepare`.
 * @param {string} table - Name to look up.
 * @returns {boolean} `true` when a matching entry was found.
 */
function tableExists(db, table) {
    const lookup = db.prepare("SELECT 1 FROM sqlite_master WHERE name = ?");
    return lookup.get(table) ? true : false;
}