pkm-mcp-server 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +52 -0
- package/LICENSE +21 -0
- package/README.md +246 -0
- package/activity.js +147 -0
- package/embeddings.js +672 -0
- package/graph.js +340 -0
- package/handlers.js +871 -0
- package/helpers.js +855 -0
- package/index.js +498 -0
- package/package.json +63 -0
- package/sample-project/CLAUDE.md +193 -0
- package/templates/adr.md +52 -0
- package/templates/daily-note.md +19 -0
- package/templates/devlog.md +35 -0
- package/templates/fleeting-note.md +11 -0
- package/templates/literature-note.md +25 -0
- package/templates/meeting-notes.md +28 -0
- package/templates/moc.md +22 -0
- package/templates/permanent-note.md +26 -0
- package/templates/project-index.md +38 -0
- package/templates/research-note.md +35 -0
- package/templates/task.md +22 -0
- package/templates/troubleshooting-log.md +32 -0
- package/utils.js +31 -0
package/embeddings.js
ADDED
|
@@ -0,0 +1,672 @@
|
|
|
1
|
+
import Database from "better-sqlite3";
|
|
2
|
+
import * as sqliteVec from "sqlite-vec";
|
|
3
|
+
import fs from "fs/promises";
|
|
4
|
+
import fsSync from "fs";
|
|
5
|
+
import path from "path";
|
|
6
|
+
import crypto from "crypto";
|
|
7
|
+
import { getAllMarkdownFiles } from "./utils.js";
|
|
8
|
+
|
|
9
|
+
// OpenAI embedding model and its fixed output dimensionality (must match
// the vec0 table schema created in SemanticIndex.initialize()).
const EMBEDDING_MODEL = "text-embedding-3-large";
const EMBEDDING_DIMENSIONS = 3072;
// Chunking and batching limits.
const MAX_CHARS_PER_CHUNK = 8000; // ~2000 tokens
const BATCH_SIZE = 100; // max texts per OpenAI API call
const REINDEX_BATCH_SIZE = 10; // files per batch during startup sync
// Per-file quiet period (ms) before a watcher event triggers a reindex.
const DEBOUNCE_MS = 2000;
|
|
15
|
+
|
|
16
|
+
/**
|
|
17
|
+
* Semantic similarity search over vault notes using OpenAI embeddings.
|
|
18
|
+
* Stores embeddings in SQLite + sqlite-vec for fast KNN lookups.
|
|
19
|
+
* Automatically indexes on startup and watches for file changes.
|
|
20
|
+
*/
|
|
21
|
+
export class SemanticIndex {
  /**
   * @param {Object} opts
   * @param {string} opts.vaultPath - absolute path to vault root
   * @param {string} opts.openaiApiKey - OpenAI API key for embeddings
   * @param {string} [opts.dbPath] - override path for the SQLite database
   */
  constructor({ vaultPath, openaiApiKey, dbPath }) {
    this.vaultPath = vaultPath;
    this.openaiApiKey = openaiApiKey;
    // Default DB location lives inside the vault's .obsidian directory.
    this.dbPath = dbPath || path.join(vaultPath, ".obsidian", "semantic-index.db");
    this.db = null;
    this.watcher = null;
    // Per-file debounce timers for watcher events (relative path -> timeout handle).
    this._debounceTimers = new Map();
    // Progress of the background startup sync; surfaced in search() output.
    this._syncState = { syncing: false, total: 0, done: 0 };
    // Promises of in-progress reindex calls so shutdown() can await them.
    this._inflight = new Set();
    this._abortController = null;
  }

  /** True once the DB is open and an API key is configured. */
  get isAvailable() {
    return this.db !== null && !!this.openaiApiKey;
  }

  /**
   * Open the SQLite database, load sqlite-vec, create the schema,
   * kick off a non-blocking startup sync, and start the file watcher.
   */
  async initialize() {
    // Ensure .obsidian dir exists
    const dbDir = path.dirname(this.dbPath);
    await fs.mkdir(dbDir, { recursive: true });

    // Open DB and load sqlite-vec
    this.db = new Database(this.dbPath);
    sqliteVec.load(this.db);
    this.db.pragma("journal_mode = WAL");
    this.db.pragma("journal_size_limit = 32000000");

    // Create schema. vec_chunks holds the raw vectors keyed by rowid;
    // chunks holds the metadata for the same rowid; files tracks per-file
    // mtime/hash so unchanged files can be skipped on sync.
    this.db.exec(`
      CREATE VIRTUAL TABLE IF NOT EXISTS vec_chunks USING vec0(
        embedding float[${EMBEDDING_DIMENSIONS}]
      );

      CREATE TABLE IF NOT EXISTS chunks (
        id INTEGER PRIMARY KEY,
        file_path TEXT NOT NULL,
        chunk_index INTEGER NOT NULL,
        heading TEXT,
        content_preview TEXT NOT NULL,
        UNIQUE(file_path, chunk_index)
      );

      CREATE TABLE IF NOT EXISTS files (
        path TEXT PRIMARY KEY,
        mtime_ms INTEGER NOT NULL,
        content_hash TEXT NOT NULL,
        chunk_count INTEGER NOT NULL,
        updated_at TEXT NOT NULL
      );

      CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file_path);
    `);

    // Start background sync (non-blocking)
    this._abortController = new AbortController();
    this._startupSync().catch(err => {
      console.error(`Semantic index startup sync error: ${err.message}`);
    });

    // Start file watcher
    this._startWatcher();
  }

  /**
   * Stop the watcher and startup sync, cancel pending debounced reindexes,
   * wait for in-flight reindex writes, then close the database.
   */
  async shutdown() {
    if (this._abortController) {
      this._abortController.abort();
      this._abortController = null;
    }
    if (this.watcher) {
      this.watcher.close();
      this.watcher = null;
    }
    for (const timer of this._debounceTimers.values()) {
      clearTimeout(timer);
    }
    this._debounceTimers.clear();
    if (this._inflight.size > 0) {
      // Let pending reindex writes finish before closing the DB handle.
      await Promise.allSettled([...this._inflight]);
    }
    if (this.db) {
      this.db.close();
      this.db = null;
    }
  }

  /** Track a reindexFile call so shutdown() can await it. */
  _trackedReindex(relativePath) {
    const p = this.reindexFile(relativePath).finally(() => {
      this._inflight.delete(p);
    });
    this._inflight.add(p);
    return p;
  }

  /**
   * Search for semantically similar notes and return formatted results.
   * @param {Object} opts
   * @param {string} opts.query - natural language search query
   * @param {number} [opts.limit=5] - max results
   * @param {string} [opts.folder] - restrict to folder prefix
   * @param {number} [opts.threshold] - minimum similarity score (0-1)
   * @returns {Promise<string>} formatted results text
   */
  async search({ query, limit = 5, folder, threshold }) {
    if (!this.isAvailable) {
      throw new Error("Semantic index not available");
    }

    // Embed the query
    const [queryEmbedding] = await getEmbeddings([query], this.openaiApiKey);

    // KNN search via sqlite-vec
    const vecResults = this.db.prepare(`
      SELECT rowid, distance
      FROM vec_chunks
      WHERE embedding MATCH ?
      ORDER BY distance
      LIMIT ?
    `).all(
      new Float32Array(queryEmbedding),
      Math.min(limit * 3, 50) // overfetch for folder filtering
    );

    // Join with chunk metadata
    const results = [];
    const getChunk = this.db.prepare(`
      SELECT file_path, chunk_index, heading, content_preview
      FROM chunks WHERE id = ?
    `);

    const seenFiles = new Set();
    for (const { rowid, distance } of vecResults) {
      if (results.length >= limit) break;

      const chunk = getChunk.get(rowid);
      if (!chunk) continue;

      // Folder filter (ensure prefix matches at directory boundary)
      if (folder) {
        const prefix = folder.endsWith("/") ? folder : folder + "/";
        if (!chunk.file_path.startsWith(prefix)) continue;
      }

      // Threshold filter (convert L2 to similarity: 1 - distance/2)
      const score = Math.max(0, Math.min(1, 1 - distance / 2));
      if (threshold && score < threshold) continue;

      // Deduplicate by file (show best chunk per file)
      if (seenFiles.has(chunk.file_path)) continue;
      seenFiles.add(chunk.file_path);

      results.push({
        path: chunk.file_path,
        heading: chunk.heading,
        score: Math.round(score * 1000) / 1000,
        preview: chunk.content_preview
      });
    }

    // Format output; note when the index is still syncing so partial
    // results are understandable to the caller.
    let syncNote = "";
    if (this._syncState.syncing) {
      syncNote = `\n\n*Index syncing (${this._syncState.done}/${this._syncState.total} files)...*`;
    }

    if (results.length === 0) {
      return `No semantically related notes found.${syncNote}`;
    }

    const formatted = results.map(r => {
      const heading = r.heading ? ` > ${r.heading}` : "";
      return `**${r.path}**${heading} (score: ${r.score})\n${r.preview}`;
    }).join("\n\n");

    return `Found ${results.length} semantically related note${results.length === 1 ? "" : "s"}:\n\n${formatted}${syncNote}`;
  }

  /**
   * Search for semantically similar notes and return raw result objects.
   * Same filtering pipeline as search(), plus an excludeFiles set.
   * @param {Object} opts
   * @param {string} opts.query - natural language search query
   * @param {number} [opts.limit=5] - max results
   * @param {string} [opts.folder] - restrict to folder prefix
   * @param {number} [opts.threshold] - minimum similarity score (0-1)
   * @param {Set<string>} [opts.excludeFiles] - file paths to exclude
   * @returns {Promise<Array<{path: string, score: number, preview: string}>>}
   */
  async searchRaw({ query, limit = 5, folder, threshold, excludeFiles }) {
    if (!this.isAvailable) {
      throw new Error("Semantic index not available");
    }

    const [queryEmbedding] = await getEmbeddings([query], this.openaiApiKey);

    const vecResults = this.db.prepare(`
      SELECT rowid, distance
      FROM vec_chunks
      WHERE embedding MATCH ?
      ORDER BY distance
      LIMIT ?
    `).all(
      new Float32Array(queryEmbedding),
      Math.min(limit * 3, 50)
    );

    const results = [];
    const getChunk = this.db.prepare(`
      SELECT file_path, chunk_index, heading, content_preview
      FROM chunks WHERE id = ?
    `);

    const seenFiles = new Set();
    for (const { rowid, distance } of vecResults) {
      if (results.length >= limit) break;

      const chunk = getChunk.get(rowid);
      if (!chunk) continue;

      // Folder filter at directory boundary.
      if (folder) {
        const prefix = folder.endsWith("/") ? folder : folder + "/";
        if (!chunk.file_path.startsWith(prefix)) continue;
      }

      // Convert L2 distance to a 0-1 similarity score.
      const score = Math.max(0, Math.min(1, 1 - distance / 2));
      if (threshold && score < threshold) continue;

      if (excludeFiles?.has(chunk.file_path)) continue;

      // Best chunk per file only.
      if (seenFiles.has(chunk.file_path)) continue;
      seenFiles.add(chunk.file_path);

      results.push({
        path: chunk.file_path,
        score: Math.round(score * 1000) / 1000,
        preview: chunk.content_preview
      });
    }

    return results;
  }

  /**
   * (Re-)index a single file: chunk it, embed it, store in SQLite.
   * No-op when the stored content hash matches; removes the file's
   * records when it no longer exists or has no indexable content.
   * @param {string} relativePath - vault-relative file path
   */
  async reindexFile(relativePath) {
    if (!this.db) return;

    const absPath = path.resolve(this.vaultPath, relativePath);
    let content;
    try {
      content = await fs.readFile(absPath, "utf-8");
    } catch (e) {
      if (e.code === "ENOENT") {
        this.removeFile(relativePath);
        return;
      }
      throw e;
    }

    const hash = contentHash(content);
    const stat = await fs.stat(absPath);

    // Check if unchanged
    const existing = this.db.prepare("SELECT content_hash FROM files WHERE path = ?").get(relativePath);
    if (existing && existing.content_hash === hash) return;

    // Chunk the note
    const chunks = chunkNote(content, relativePath);
    if (chunks.length === 0) {
      // Note has no indexable content — clean up any old chunks
      this.removeFile(relativePath);
      return;
    }

    // Get embeddings. Failures are logged and skipped (best-effort): the
    // stale index entry is kept rather than left half-written.
    const texts = chunks.map(c => c.text);
    let embeddings;
    try {
      embeddings = await getEmbeddings(texts, this.openaiApiKey);
    } catch (e) {
      console.error(`Embedding error for ${relativePath}: ${e.message}`);
      return;
    }

    // Store in transaction so chunks/vec_chunks/files stay consistent.
    const txn = this.db.transaction(() => {
      // Remove old chunks
      const oldChunks = this.db.prepare(
        "SELECT id FROM chunks WHERE file_path = ?"
      ).all(relativePath);

      if (oldChunks.length > 0) {
        const deleteVec = this.db.prepare("DELETE FROM vec_chunks WHERE rowid = ?");
        const deleteChunk = this.db.prepare("DELETE FROM chunks WHERE id = ?");
        for (const { id } of oldChunks) {
          deleteVec.run(BigInt(id));
          deleteChunk.run(id);
        }
      }

      // Insert new chunks; vec rowid mirrors the chunks.id primary key.
      const insertChunk = this.db.prepare(`
        INSERT INTO chunks (file_path, chunk_index, heading, content_preview)
        VALUES (?, ?, ?, ?)
      `);
      const insertVec = this.db.prepare(`
        INSERT INTO vec_chunks (rowid, embedding) VALUES (?, ?)
      `);

      for (let i = 0; i < chunks.length; i++) {
        const result = insertChunk.run(
          relativePath,
          i,
          chunks[i].heading || null,
          chunks[i].preview
        );
        insertVec.run(BigInt(result.lastInsertRowid), new Float32Array(embeddings[i]));
      }

      // Update file record
      this.db.prepare(`
        INSERT OR REPLACE INTO files (path, mtime_ms, content_hash, chunk_count, updated_at)
        VALUES (?, ?, ?, ?, ?)
      `).run(
        relativePath,
        Math.floor(stat.mtimeMs),
        hash,
        chunks.length,
        new Date().toISOString()
      );
    });

    txn();
  }

  /**
   * Remove all chunks and metadata for a file from the index.
   * @param {string} relativePath - vault-relative file path
   */
  removeFile(relativePath) {
    if (!this.db) return;

    const txn = this.db.transaction(() => {
      const oldChunks = this.db.prepare(
        "SELECT id FROM chunks WHERE file_path = ?"
      ).all(relativePath);

      if (oldChunks.length > 0) {
        const deleteVec = this.db.prepare("DELETE FROM vec_chunks WHERE rowid = ?");
        const deleteChunk = this.db.prepare("DELETE FROM chunks WHERE id = ?");
        for (const { id } of oldChunks) {
          deleteVec.run(BigInt(id));
          deleteChunk.run(id);
        }
      }

      this.db.prepare("DELETE FROM files WHERE path = ?").run(relativePath);
    });

    txn();
  }

  // --- Private methods ---

  /**
   * Bring the index up to date with the vault: reindex files whose mtime
   * changed or that are new, and drop records for deleted files. Runs in
   * batches so progress can be reported and the sync aborted on shutdown.
   */
  async _startupSync() {
    this._syncState.syncing = true;

    try {
      // Get all vault .md files
      const vaultFiles = await getAllMarkdownFiles(this.vaultPath);

      // Get all indexed files
      const indexedFiles = new Map();
      for (const row of this.db.prepare("SELECT path, mtime_ms, content_hash FROM files").all()) {
        indexedFiles.set(row.path, row);
      }

      // Find files needing reindex
      const toReindex = [];
      const vaultFileSet = new Set(vaultFiles);

      for (const relPath of vaultFiles) {
        const absPath = path.resolve(this.vaultPath, relPath);
        try {
          const stat = await fs.stat(absPath);
          const indexed = indexedFiles.get(relPath);
          if (!indexed || Math.floor(stat.mtimeMs) !== indexed.mtime_ms) {
            toReindex.push(relPath);
          }
        } catch {
          // File disappeared between listing and stat
        }
      }

      // Find deleted files
      for (const indexedPath of indexedFiles.keys()) {
        if (!vaultFileSet.has(indexedPath)) {
          this.removeFile(indexedPath);
        }
      }

      if (toReindex.length === 0) {
        console.error("Semantic index: up to date");
        return;
      }

      this._syncState.total = toReindex.length;
      this._syncState.done = 0;
      console.error(`Semantic index: syncing ${toReindex.length} files...`);

      // Process in batches
      for (let i = 0; i < toReindex.length; i += REINDEX_BATCH_SIZE) {
        if (this._abortController?.signal.aborted) {
          console.error("Semantic index: startup sync aborted");
          break;
        }
        const batch = toReindex.slice(i, i + REINDEX_BATCH_SIZE);
        const results = await Promise.allSettled(batch.map(f => this._trackedReindex(f)));
        const failures = results.filter(r => r.status === "rejected");
        if (failures.length > 0) {
          console.error(`Semantic index: ${failures.length} files failed in batch`);
        }
        this._syncState.done += batch.length;
        console.error(`Semantic index: syncing ${this._syncState.done}/${this._syncState.total} files...`);
      }

      console.error(`Semantic index: sync complete (${toReindex.length} files updated)`);
    } finally {
      this._syncState.syncing = false;
    }
  }

  /**
   * Watch the vault recursively for .md changes and reindex (or remove)
   * files after a per-file debounce. Events during startup sync are
   * ignored, as are dotfiles/dot-directories.
   * NOTE(review): fs.watch({ recursive: true }) support varies by
   * platform/Node version — confirm on deployment targets.
   */
  _startWatcher() {
    try {
      this.watcher = fsSync.watch(this.vaultPath, { recursive: true }, (eventType, filename) => {
        if (!filename) return;
        if (!filename.endsWith(".md")) return;
        if (this._syncState.syncing) return;

        // Ignore dotfiles/dot directories
        const parts = filename.split(path.sep);
        if (parts.some(p => p.startsWith("."))) return;

        // Normalize to forward slashes (consistent with vault paths)
        const relativePath = filename.split(path.sep).join("/");

        // Debounce per file
        if (this._debounceTimers.has(relativePath)) {
          clearTimeout(this._debounceTimers.get(relativePath));
          this._debounceTimers.delete(relativePath);
        }

        const timer = setTimeout(async () => {
          this._debounceTimers.delete(relativePath);
          try {
            // Check if file still exists
            await fs.access(path.resolve(this.vaultPath, relativePath));
            await this._trackedReindex(relativePath);
          } catch (e) {
            if (e.code === "ENOENT") {
              this.removeFile(relativePath);
            } else {
              console.error(`Watcher reindex error for ${relativePath}: ${e.message}`);
            }
          }
        }, DEBOUNCE_MS);

        this._debounceTimers.set(relativePath, timer);
      });

      this.watcher.on("error", (err) => {
        console.error(`File watcher error: ${err.message}. Stopping watcher.`);
        if (this.watcher) {
          this.watcher.close();
          this.watcher = null;
        }
      });
    } catch (err) {
      console.error(`Could not start file watcher: ${err.message}`);
    }
  }
}
|
|
511
|
+
|
|
512
|
+
// --- Module-level helpers ---
|
|
513
|
+
|
|
514
|
+
function chunkNote(content, filePath) {
  // Drop YAML frontmatter (--- ... ---) before chunking.
  let body = content;
  if (content.startsWith("---")) {
    const closing = content.indexOf("\n---", 3);
    if (closing !== -1) {
      body = content.slice(closing + 4).trim();
    }
  }

  if (!body) return [];

  // The note title (filename minus extension) is prepended to every chunk
  // so each embedded text carries the note's identity.
  const title = path.basename(filePath, ".md");
  const makeChunk = (text, heading) => ({
    text: `# ${title}\n\n${text}`,
    heading,
    preview: getPreview(text)
  });

  // Short note: embed as a single chunk.
  if (body.length <= MAX_CHARS_PER_CHUNK) {
    return [makeChunk(body, null)];
  }

  // Long note: one chunk per ## section; oversized sections are split
  // further at paragraph boundaries, numbering their headings.
  const chunks = [];
  for (const { heading, text } of splitByHeadings(body)) {
    const trimmed = text.trim();
    if (!trimmed) continue;

    if (trimmed.length <= MAX_CHARS_PER_CHUNK) {
      chunks.push(makeChunk(trimmed, heading));
    } else {
      const pieces = splitByParagraphs(trimmed, MAX_CHARS_PER_CHUNK);
      pieces.forEach((piece, idx) => {
        chunks.push(makeChunk(piece, heading ? `${heading} (${idx + 1})` : null));
      });
    }
  }

  return chunks;
}
|
|
567
|
+
|
|
568
|
+
/**
 * Split note body into sections at level-2 (`## `) headings.
 * Text before the first heading becomes a section with heading null.
 * The heading line itself is kept inside its section's text.
 * @param {string} body - note body (frontmatter already stripped)
 * @returns {Array<{heading: string|null, text: string}>}
 */
function splitByHeadings(body) {
  const sections = [];
  let heading = null;
  let buffer = [];

  // Push the accumulated lines as a section (skips an empty leading buffer).
  const flush = () => {
    if (buffer.length > 0) {
      sections.push({ heading, text: buffer.join("\n") });
    }
  };

  for (const line of body.split("\n")) {
    if (!line.startsWith("## ")) {
      buffer.push(line);
      continue;
    }
    flush();
    heading = line.replace(/^##\s+/, "");
    buffer = [line];
  }
  flush();

  return sections;
}
|
|
592
|
+
|
|
593
|
+
/**
 * Greedily pack paragraphs (blank-line separated) into chunks of at most
 * maxChars characters. Fix over the original: a single paragraph longer
 * than maxChars is now hard-split into maxChars-sized slices instead of
 * being emitted as one oversized chunk (which could exceed the embedding
 * API's input limit).
 * @param {string} text - section text to split
 * @param {number} maxChars - maximum characters per chunk
 * @returns {string[]} chunks, each at most maxChars characters
 */
function splitByParagraphs(text, maxChars) {
  const paragraphs = text.split(/\n\n+/);
  const chunks = [];
  let current = "";

  for (const para of paragraphs) {
    if (para.length > maxChars) {
      // Oversized paragraph: flush the accumulator, then hard-split it.
      if (current) {
        chunks.push(current);
        current = "";
      }
      for (let i = 0; i < para.length; i += maxChars) {
        chunks.push(para.slice(i, i + maxChars));
      }
      continue;
    }
    // +2 accounts for the "\n\n" separator re-inserted when joining.
    if (current && (current.length + para.length + 2) > maxChars) {
      chunks.push(current);
      current = para;
    } else {
      current = current ? current + "\n\n" + para : para;
    }
  }

  if (current) chunks.push(current);
  return chunks;
}
|
|
610
|
+
|
|
611
|
+
/**
 * Build a plain-text preview: strip markdown heading markers and keep at
 * most the first 100 whitespace-separated words. Fix over the original:
 * truncation is detected by word count, not by comparing the length of
 * the whitespace-collapsed preview against the raw text — the old check
 * appended a spurious "..." whenever the text contained any run of
 * multiple whitespace characters, even with fewer than 100 words.
 * @param {string} text - chunk text
 * @returns {string} preview, with "..." appended only when truncated
 */
function getPreview(text) {
  // Strip markdown heading markers for preview
  const cleaned = text.replace(/^#+\s+/gm, "").trim();
  const words = cleaned.split(/\s+/);
  const preview = words.slice(0, 100).join(" ");
  // Ellipsis only when words were actually dropped.
  return words.length > 100 ? preview + "..." : preview;
}
|
|
618
|
+
|
|
619
|
+
/**
 * Embed a list of texts, batching requests so no single OpenAI API call
 * exceeds BATCH_SIZE inputs. Result order matches the input order.
 * @param {string[]} texts - texts to embed
 * @param {string} apiKey - OpenAI API key
 * @returns {Promise<number[][]>} one embedding vector per input text
 */
async function getEmbeddings(texts, apiKey) {
  const collected = [];

  for (let start = 0; start < texts.length; start += BATCH_SIZE) {
    const slice = texts.slice(start, start + BATCH_SIZE);
    const vectors = await callEmbeddingAPI(slice, apiKey);
    collected.push(...vectors);
  }

  return collected;
}
|
|
630
|
+
|
|
631
|
+
/**
 * Call the OpenAI embeddings endpoint for a batch of texts, with
 * exponential backoff. Fixes over the original: (1) transient 5xx
 * responses are retried like 429s, not just rate limits; (2) the
 * function now throws explicitly if retries is 0 or less, instead of
 * silently returning undefined.
 * @param {string[]} texts - batch of texts (<= BATCH_SIZE)
 * @param {string} apiKey - OpenAI API key
 * @param {number} [retries=3] - max attempts
 * @returns {Promise<number[][]>} embeddings in input order
 * @throws {Error} on non-retryable API errors, exhausted retries, or timeout
 */
async function callEmbeddingAPI(texts, apiKey, retries = 3) {
  for (let attempt = 0; attempt < retries; attempt++) {
    const response = await fetch("https://api.openai.com/v1/embeddings", {
      method: "POST",
      headers: {
        "Authorization": `Bearer ${apiKey}`,
        "Content-Type": "application/json"
      },
      body: JSON.stringify({
        model: EMBEDDING_MODEL,
        input: texts
      }),
      signal: AbortSignal.timeout(30000)
    });

    if (response.ok) {
      const data = await response.json();
      // Sort by index to maintain order
      return data.data
        .sort((a, b) => a.index - b.index)
        .map(d => d.embedding);
    }

    // Rate limits and transient server errors — exponential backoff.
    const retryable = response.status === 429 || response.status >= 500;
    if (retryable && attempt < retries - 1) {
      const delay = Math.pow(2, attempt) * 1000;
      console.error(`OpenAI API ${response.status}, retrying in ${delay}ms...`);
      await new Promise(resolve => setTimeout(resolve, delay));
      continue;
    }

    const errBody = await response.text();
    throw new Error(`OpenAI API error (${response.status}): ${errBody}`);
  }

  // Reached only when retries <= 0; the old code returned undefined here.
  throw new Error(`OpenAI embeddings request failed after ${retries} attempts`);
}
|
|
666
|
+
|
|
667
|
+
/**
 * Hex-encoded SHA-256 digest of the given text, used to detect
 * unchanged file content and skip re-embedding.
 * @param {string} text - file content
 * @returns {string} 64-character lowercase hex digest
 */
function contentHash(text) {
  const hasher = crypto.createHash("sha256");
  hasher.update(text);
  return hasher.digest("hex");
}
|
|
670
|
+
|
|
671
|
+
export { chunkNote, splitByHeadings, splitByParagraphs, getPreview, contentHash };
|
|
672
|
+
|