@tekmidian/pai 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ARCHITECTURE.md +567 -0
- package/FEATURE.md +108 -0
- package/LICENSE +21 -0
- package/README.md +101 -0
- package/dist/auto-route-D7W6RE06.mjs +86 -0
- package/dist/auto-route-D7W6RE06.mjs.map +1 -0
- package/dist/cli/index.d.mts +1 -0
- package/dist/cli/index.mjs +5927 -0
- package/dist/cli/index.mjs.map +1 -0
- package/dist/config-DBh1bYM2.mjs +151 -0
- package/dist/config-DBh1bYM2.mjs.map +1 -0
- package/dist/daemon/index.d.mts +1 -0
- package/dist/daemon/index.mjs +56 -0
- package/dist/daemon/index.mjs.map +1 -0
- package/dist/daemon-mcp/index.d.mts +1 -0
- package/dist/daemon-mcp/index.mjs +185 -0
- package/dist/daemon-mcp/index.mjs.map +1 -0
- package/dist/daemon-v5O897D4.mjs +773 -0
- package/dist/daemon-v5O897D4.mjs.map +1 -0
- package/dist/db-4lSqLFb8.mjs +199 -0
- package/dist/db-4lSqLFb8.mjs.map +1 -0
- package/dist/db-BcDxXVBu.mjs +110 -0
- package/dist/db-BcDxXVBu.mjs.map +1 -0
- package/dist/detect-BHqYcjJ1.mjs +86 -0
- package/dist/detect-BHqYcjJ1.mjs.map +1 -0
- package/dist/detector-DKA83aTZ.mjs +74 -0
- package/dist/detector-DKA83aTZ.mjs.map +1 -0
- package/dist/embeddings-mfqv-jFu.mjs +91 -0
- package/dist/embeddings-mfqv-jFu.mjs.map +1 -0
- package/dist/factory-BDAiKtYR.mjs +42 -0
- package/dist/factory-BDAiKtYR.mjs.map +1 -0
- package/dist/index.d.mts +307 -0
- package/dist/index.d.mts.map +1 -0
- package/dist/index.mjs +11 -0
- package/dist/indexer-B20bPHL-.mjs +677 -0
- package/dist/indexer-B20bPHL-.mjs.map +1 -0
- package/dist/indexer-backend-BXaocO5r.mjs +360 -0
- package/dist/indexer-backend-BXaocO5r.mjs.map +1 -0
- package/dist/ipc-client-DPy7s3iu.mjs +156 -0
- package/dist/ipc-client-DPy7s3iu.mjs.map +1 -0
- package/dist/mcp/index.d.mts +1 -0
- package/dist/mcp/index.mjs +373 -0
- package/dist/mcp/index.mjs.map +1 -0
- package/dist/migrate-Bwj7qPaE.mjs +241 -0
- package/dist/migrate-Bwj7qPaE.mjs.map +1 -0
- package/dist/pai-marker-DX_mFLum.mjs +186 -0
- package/dist/pai-marker-DX_mFLum.mjs.map +1 -0
- package/dist/postgres-Ccvpc6fC.mjs +335 -0
- package/dist/postgres-Ccvpc6fC.mjs.map +1 -0
- package/dist/rolldown-runtime-95iHPtFO.mjs +18 -0
- package/dist/schemas-DjdwzIQ8.mjs +3405 -0
- package/dist/schemas-DjdwzIQ8.mjs.map +1 -0
- package/dist/search-PjftDxxs.mjs +282 -0
- package/dist/search-PjftDxxs.mjs.map +1 -0
- package/dist/sqlite-CHUrNtbI.mjs +90 -0
- package/dist/sqlite-CHUrNtbI.mjs.map +1 -0
- package/dist/tools-CLK4080-.mjs +805 -0
- package/dist/tools-CLK4080-.mjs.map +1 -0
- package/dist/utils-DEWdIFQ0.mjs +160 -0
- package/dist/utils-DEWdIFQ0.mjs.map +1 -0
- package/package.json +72 -0
- package/templates/README.md +181 -0
- package/templates/agent-prefs.example.md +362 -0
- package/templates/claude-md.template.md +733 -0
- package/templates/pai-project.template.md +13 -0
- package/templates/voices.example.json +251 -0
|
@@ -0,0 +1,677 @@
|
|
|
1
|
+
import { existsSync, readFileSync, readdirSync, statSync } from "node:fs";
|
|
2
|
+
import { homedir } from "node:os";
|
|
3
|
+
import { basename, isAbsolute, join, normalize, relative } from "node:path";
|
|
4
|
+
import { createHash } from "node:crypto";
|
|
5
|
+
|
|
6
|
+
//#region src/memory/chunker.ts
|
|
7
|
+
/**
|
|
8
|
+
* Markdown text chunker for the PAI memory engine.
|
|
9
|
+
*
|
|
10
|
+
* Splits markdown files into overlapping text segments suitable for BM25
|
|
11
|
+
* full-text indexing. Respects heading boundaries where possible, falling
|
|
12
|
+
* back to paragraph and sentence splitting when sections are large.
|
|
13
|
+
*/
|
|
14
|
+
// Used by chunkMarkdown as the per-chunk token budget when opts.maxTokens is not supplied.
const DEFAULT_MAX_TOKENS = 400;
|
|
15
|
+
// Used by chunkMarkdown as the overlap size (in estimated tokens) when opts.overlap is not supplied.
const DEFAULT_OVERLAP = 80;
|
|
16
|
+
/**
 * Estimate how many tokens a piece of text occupies.
 *
 * Counts whitespace-separated words and scales the count by 1.3, rounding up.
 * Matches the OpenClaw estimate approach.
 *
 * @param {string} text Text to measure.
 * @returns {number} Approximate token count; 0 for empty or whitespace-only text.
 */
function estimateTokens(text) {
	// Every run of non-whitespace characters is one word.
	const words = text.match(/\S+/g) ?? [];
	return Math.ceil(words.length * 1.3);
}
|
|
24
|
+
/**
 * Hash a string with SHA-256.
 *
 * @param {string} text Input to hash.
 * @returns {string} Lower-case hexadecimal digest.
 */
function sha256(text) {
	const hasher = createHash("sha256");
	hasher.update(text);
	return hasher.digest("hex");
}
|
|
30
|
+
/**
 * Group numbered lines into sections that each begin at a markdown heading.
 *
 * A line starts a new section when it begins with one to three `#` characters
 * followed by whitespace. Lines before the first heading form a preamble
 * section of their own.
 *
 * @param lines Array of `{ text, lineNo }` entries in file order.
 * @returns Array of `{ lines, tokens }` blocks; empty when `lines` is empty.
 */
function splitBySections(lines) {
	const startsSection = (entry) => /^#{1,3}\s/.test(entry.text);
	const groups = [];
	for (const entry of lines) {
		const open = groups[groups.length - 1];
		// The very first line always opens a group, heading or not.
		if (open === undefined || startsSection(entry)) groups.push([entry]);
		else open.push(entry);
	}
	return groups.map((group) => ({
		lines: group,
		tokens: estimateTokens(group.map((entry) => entry.text).join("\n"))
	}));
}
|
|
57
|
+
/**
 * Break a line block into paragraphs at blank lines.
 *
 * A blank line closes the paragraph being collected and is dropped. A blank
 * line seen while nothing is being collected is kept as the first line of the
 * next paragraph.
 *
 * @param block `{ lines, tokens }` block to split.
 * @returns Array of `{ lines, tokens }` paragraphs, or `[block]` when no
 *          paragraph was produced.
 */
function splitByParagraphs(block) {
	const toBlock = (group) => ({
		lines: group,
		tokens: estimateTokens(group.map((entry) => entry.text).join("\n"))
	});
	const out = [];
	let pending = [];
	block.lines.forEach((entry) => {
		const closesParagraph = entry.text.trim() === "" && pending.length > 0;
		if (!closesParagraph) {
			pending.push(entry);
			return;
		}
		out.push(toBlock(pending));
		pending = [];
	});
	if (pending.length > 0) out.push(toBlock(pending));
	return out.length === 0 ? [block] : out;
}
|
|
80
|
+
/**
 * Split a line block at sentence boundaries (. ! ?) when even a paragraph is
 * too large.
 *
 * The block's lines are joined with spaces and split wherever sentence
 * punctuation is followed by whitespace and an upper-case letter or a quote.
 * Consecutive sentences are packed greedily into groups whose estimated token
 * count stays within `maxTokens`. A single sentence that alone exceeds
 * `maxTokens` is emitted unchanged as its own group.
 *
 * Joining the lines loses the real line numbers, so each group gets an
 * approximate `lineNo`: the position of its first sentence is mapped
 * proportionally onto the block's line range. The result is always inside
 * `[first lineNo, last lineNo]` of the block and never decreases from one
 * group to the next.
 *
 * @param block `{ lines, tokens }` block to split.
 * @param {number} maxTokens Token budget for one emitted group.
 * @returns Array of single-line `{ lines, tokens }` blocks, or `[block]` when
 *          the block contains no text.
 */
function splitBySentences(block, maxTokens) {
	const sentences = block.lines.map((l) => l.text).join(" ").split(/(?<=[.!?])\s+(?=[A-Z"'])/g);
	const startLine = block.lines[0]?.lineNo ?? 1;
	const endLine = block.lines[block.lines.length - 1]?.lineNo ?? startLine;
	const totalLines = endLine - startLine + 1;
	// split() always yields at least one element, so the division is safe.
	const lineForSentence = (idx) => Math.min(endLine, startLine + Math.floor(idx * totalLines / sentences.length));
	const result = [];
	let accText = "";
	let accStartIdx = 0;
	const flush = () => {
		const trimmed = accText.trim();
		if (trimmed) {
			result.push({
				lines: [{
					text: trimmed,
					lineNo: lineForSentence(accStartIdx)
				}],
				tokens: estimateTokens(trimmed)
			});
		}
		accText = "";
	};
	sentences.forEach((sentence, idx) => {
		const candidateText = accText ? `${accText} ${sentence}` : sentence;
		if (accText && estimateTokens(candidateText) > maxTokens) {
			// The group is full: emit it and start a new one with this sentence.
			flush();
			accText = sentence;
			accStartIdx = idx;
			return;
		}
		if (!accText) accStartIdx = idx;
		accText = candidateText;
	});
	flush();
	return result.length > 0 ? result : [block];
}
|
|
118
|
+
/**
 * Build the overlap lines to prepend to the next chunk.
 *
 * Takes trailing lines of the most recently emitted chunk, walking backwards
 * until their estimated token count reaches `overlapTokens`. At least one line
 * is always taken. Line numbers are derived from the chunk's `endLine` and
 * never drop below its `startLine`.
 *
 * @param chunks Chunks emitted so far (`{ text, startLine, endLine, ... }`).
 * @param {number} overlapTokens Desired overlap size in estimated tokens.
 * @returns Array of `{ text, lineNo }` entries; empty when there is no
 *          previous chunk or `overlapTokens` is not positive.
 */
function buildOverlapPrefix(chunks, overlapTokens) {
	if (overlapTokens <= 0 || chunks.length === 0) return [];
	const previous = chunks[chunks.length - 1];
	if (!previous) return [];
	const sourceLines = previous.text.split("\n");
	let cut = sourceLines.length;
	let budgetUsed = 0;
	while (cut > 0) {
		cut -= 1;
		budgetUsed += estimateTokens(sourceLines[cut] ?? "");
		if (budgetUsed >= overlapTokens) break;
	}
	const tail = sourceLines.slice(cut);
	const firstLineNo = previous.endLine - tail.length + 1;
	return tail.map((text, offset) => ({
		text,
		lineNo: Math.max(previous.startLine, firstLineNo + offset)
	}));
}
|
|
141
|
+
/**
 * Chunk markdown content into overlapping segments for BM25 indexing.
 *
 * Steps:
 * 1. Split into sections at heading lines (see splitBySections).
 * 2. Sections above `maxTokens` are split into paragraphs.
 * 3. Paragraphs still above `maxTokens` are split by sentences.
 * 4. Each emitted chunk is prefixed with trailing lines of the previous chunk
 *    (see buildOverlapPrefix).
 *
 * `startLine` / `endLine` describe the block itself, not the overlap prefix.
 *
 * @param {string} content Markdown source.
 * @param opts Optional `{ maxTokens, overlap }`; defaults are
 *             DEFAULT_MAX_TOKENS and DEFAULT_OVERLAP.
 * @returns Array of `{ text, startLine, endLine, hash }`; empty for blank content.
 */
function chunkMarkdown(content, opts) {
	const maxTokens = opts?.maxTokens ?? DEFAULT_MAX_TOKENS;
	const overlapTokens = opts?.overlap ?? DEFAULT_OVERLAP;
	if (!content.trim()) return [];
	const numbered = content.split("\n").map((text, idx) => ({
		text,
		lineNo: idx + 1
	}));
	const refine = (para) => para.tokens <= maxTokens ? [para] : splitBySentences(para, maxTokens);
	const blocks = splitBySections(numbered).flatMap((section) => {
		if (section.tokens <= maxTokens) return [section];
		return splitByParagraphs(section).flatMap(refine);
	});
	const chunks = [];
	for (const block of blocks) {
		if (block.lines.length === 0) continue;
		const prefix = buildOverlapPrefix(chunks, overlapTokens);
		const text = prefix.concat(block.lines).map((l) => l.text).join("\n").trim();
		if (!text) continue;
		const startLine = block.lines[0]?.lineNo ?? 1;
		const endLine = block.lines[block.lines.length - 1]?.lineNo ?? startLine;
		chunks.push({
			text,
			startLine,
			endLine,
			hash: sha256(text)
		});
	}
	return chunks;
}
|
|
191
|
+
|
|
192
|
+
//#endregion
|
|
193
|
+
//#region src/memory/indexer.ts
|
|
194
|
+
/**
|
|
195
|
+
* File indexer for the PAI federation memory engine.
|
|
196
|
+
*
|
|
197
|
+
* Scans project memory/ and Notes/ directories, chunks markdown files, and
|
|
198
|
+
* inserts the resulting chunks into federation.db for BM25 search.
|
|
199
|
+
*
|
|
200
|
+
* Change detection: files whose SHA-256 hash has not changed since the last
|
|
201
|
+
* index run are skipped, keeping incremental re-indexing fast.
|
|
202
|
+
*
|
|
203
|
+
* Phase 2.5: adds embedChunks() for generating vector embeddings on indexed
|
|
204
|
+
* chunks that do not yet have an embedding stored.
|
|
205
|
+
*/
|
|
206
|
+
/**
 * Map a project-relative file path to a memory tier.
 *
 * Checks, in order:
 * - path is `Notes` or lies under `Notes/`   → 'session'
 * - file name is exactly `MEMORY.md`          → 'evergreen'
 * - file name looks like `YYYY-MM-DD.md`      → 'daily'
 * - anything else                             → 'topic'
 *
 * Backslashes are treated as path separators and a leading `./` is ignored.
 *
 * @param {string} relativePath Path relative to the project root.
 * @returns {"session"|"evergreen"|"daily"|"topic"} The detected tier.
 */
function detectTier(relativePath) {
	const posixPath = relativePath.replace(/\\/g, "/").replace(/^\.\//, "");
	if (posixPath === "Notes" || posixPath.startsWith("Notes/")) return "session";
	const name = basename(posixPath);
	if (name === "MEMORY.md") return "evergreen";
	return /^\d{4}-\d{2}-\d{2}\.md$/.test(name) ? "daily" : "topic";
}
|
|
223
|
+
/**
 * Derive a deterministic chunk ID from the chunk's coordinates.
 *
 * The ID is sha256("projectId:path:chunkIndex:startLine:endLine") in hex.
 * `chunkIndex` (0-based position within the file) keeps IDs distinct even when
 * several chunks share the same approximated startLine/endLine, as can happen
 * for chunks produced by splitBySentences.
 *
 * @returns {string} Hex SHA-256 digest.
 */
function chunkId(projectId, path, chunkIndex, startLine, endLine) {
	const key = [projectId, path, chunkIndex, startLine, endLine].map(String).join(":");
	const hasher = createHash("sha256");
	hasher.update(key);
	return hasher.digest("hex");
}
|
|
234
|
+
/**
 * Hash file content with SHA-256; the digest is used for change detection.
 *
 * @param {string} content File content.
 * @returns {string} Lower-case hexadecimal digest.
 */
function sha256File(content) {
	const digest = createHash("sha256");
	digest.update(content);
	return digest.digest("hex");
}
|
|
237
|
+
/**
 * Index a single file into the federation database.
 *
 * The file is read and hashed; when the stored hash in `memory_files` matches,
 * nothing is written. Otherwise the file's previous chunks (and their FTS
 * rows) are removed, the new chunks are inserted, and the `memory_files` row
 * is upserted. All of that happens in ONE transaction, so a failure part-way
 * through rolls back and leaves the previous index for this file intact
 * instead of leaving the file with no chunks at all.
 *
 * @param db Open federation database (better-sqlite3 style API).
 * @param projectId Owning project id.
 * @param {string} rootPath Base directory `relativePath` is resolved against.
 * @param {string} relativePath Path stored in the database for this file.
 * @param {string} source Source label stored with each chunk.
 * @param {string} tier Tier label stored with each chunk.
 * @returns {boolean} true if the file was re-indexed (changed or new), false
 *          if it was skipped (unchanged or unreadable).
 */
function indexFile(db, projectId, rootPath, relativePath, source, tier) {
	const absPath = join(rootPath, relativePath);
	let content;
	let stat;
	try {
		content = readFileSync(absPath, "utf8");
		stat = statSync(absPath);
	} catch {
		// Best effort: a file that vanished or cannot be read is simply skipped.
		return false;
	}
	const hash = sha256File(content);
	const mtime = Math.floor(stat.mtimeMs);
	const size = stat.size;
	const known = db.prepare("SELECT hash FROM memory_files WHERE project_id = ? AND path = ?").get(projectId, relativePath);
	if (known?.hash === hash) return false;
	// Chunk before opening the transaction so the write lock is held only for the writes.
	const chunks = chunkMarkdown(content);
	const selectOldIds = db.prepare("SELECT id FROM memory_chunks WHERE project_id = ? AND path = ?");
	const deleteFts = db.prepare("DELETE FROM memory_fts WHERE id = ?");
	const deleteChunk = db.prepare("DELETE FROM memory_chunks WHERE project_id = ? AND path = ?");
	const insertChunk = db.prepare(`
		INSERT INTO memory_chunks (id, project_id, source, tier, path, start_line, end_line, hash, text, updated_at)
		VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
	`);
	const insertFts = db.prepare(`
		INSERT INTO memory_fts (text, id, project_id, path, source, tier, start_line, end_line)
		VALUES (?, ?, ?, ?, ?, ?, ?, ?)
	`);
	const upsertFile = db.prepare(`
		INSERT INTO memory_files (project_id, path, source, tier, hash, mtime, size)
		VALUES (?, ?, ?, ?, ?, ?, ?)
		ON CONFLICT(project_id, path) DO UPDATE SET
			source = excluded.source,
			tier = excluded.tier,
			hash = excluded.hash,
			mtime = excluded.mtime,
			size = excluded.size
	`);
	const updatedAt = Date.now();
	db.transaction(() => {
		// FTS rows are keyed by chunk id, so they are removed one id at a time.
		for (const row of selectOldIds.all(projectId, relativePath)) deleteFts.run(row.id);
		deleteChunk.run(projectId, relativePath);
		chunks.forEach((chunk, i) => {
			const id = chunkId(projectId, relativePath, i, chunk.startLine, chunk.endLine);
			insertChunk.run(id, projectId, source, tier, relativePath, chunk.startLine, chunk.endLine, chunk.hash, chunk.text, updatedAt);
			insertFts.run(chunk.text, id, projectId, relativePath, source, tier, chunk.startLine, chunk.endLine);
		});
		upsertFile.run(projectId, relativePath, source, tier, hash, mtime, size);
	})();
	return true;
}
|
|
294
|
+
/**
|
|
295
|
+
* Safety cap: maximum number of .md files collected per project scan.
|
|
296
|
+
* Prevents runaway scans on huge root paths (e.g. home directory).
|
|
297
|
+
* Projects with more files than this are scanned up to the cap only.
|
|
298
|
+
*/
|
|
299
|
+
// 5e3 === 5000. Default `cap` of walkMdFiles and the collection limit used by walkContentFiles.
const MAX_FILES_PER_PROJECT = 5e3;
|
|
300
|
+
/**
|
|
301
|
+
* Maximum recursion depth for directory walks.
|
|
302
|
+
* Prevents deep traversal of large directory trees (e.g. development repos).
|
|
303
|
+
* Depth 0 = the given directory itself (no recursion).
|
|
304
|
+
* Value 6 allows: root → subdirs → sub-subdirs → ... up to 6 levels.
|
|
305
|
+
* Sufficient for memory/, Notes/, and typical docs structures.
|
|
306
|
+
*/
|
|
307
|
+
// walkMdFiles returns early only when depth > MAX_WALK_DEPTH, so directories at depths 0 through 6 are read.
const MAX_WALK_DEPTH = 6;
|
|
308
|
+
/**
 * Collect absolute paths of all `.md` files below a directory, recursively.
 *
 * Symbolic links and any entry whose name is in ALWAYS_SKIP_DIRS are ignored.
 * Collection stops once `cap` paths have been gathered, and directories deeper
 * than MAX_WALK_DEPTH are not read. Errors while reading a directory are
 * swallowed on purpose so that one unreadable folder does not abort the scan.
 *
 * @param {string} dir Directory to scan.
 * @param {string[]} [acc] Shared accumulator, mutated in place; created when omitted.
 * @param {number} [cap] Maximum number of paths to collect across all recursive calls.
 * @param {number} [depth] Current recursion depth (0 = the initial call).
 * @returns {string[]} The accumulator.
 */
function walkMdFiles(dir, acc, cap = MAX_FILES_PER_PROJECT, depth = 0) {
	const found = acc ?? [];
	const nothingToDo = !existsSync(dir) || found.length >= cap || depth > MAX_WALK_DEPTH;
	if (nothingToDo) return found;
	try {
		const entries = readdirSync(dir, { withFileTypes: true });
		for (const entry of entries) {
			if (found.length >= cap) break;
			if (entry.isSymbolicLink() || ALWAYS_SKIP_DIRS.has(entry.name)) continue;
			const fullPath = join(dir, entry.name);
			if (entry.isDirectory()) {
				walkMdFiles(fullPath, found, cap, depth + 1);
				continue;
			}
			if (entry.isFile() && entry.name.endsWith(".md")) found.push(fullPath);
		}
	} catch {}
	return found;
}
|
|
335
|
+
/**
 * Entry names that every directory walk skips, at any depth.
 * Covers VCS internals, dependency trees, build output, and tool caches.
 */
const ALWAYS_SKIP_DIRS = new Set([
	".git", "node_modules", "vendor", "Pods",
	"dist", "build", "out", "DerivedData", ".next",
	".venv", "venv", "__pycache__",
	".cache", ".bun"
]);
|
|
356
|
+
/**
 * Entry names skipped directly under the project root during the content scan:
 * folders that have their own dedicated scans (memory, Notes), a few names that
 * should never be indexed, and everything in ALWAYS_SKIP_DIRS.
 */
const ROOT_SCAN_SKIP_DIRS = new Set(["memory", "Notes", ".claude", ".DS_Store"].concat([...ALWAYS_SKIP_DIRS]));
|
|
367
|
+
/**
 * Further entry names skipped at the first level below the root during the
 * content scan: common home-directory folders and repo noise that are never
 * meaningful project content, plus everything in ALWAYS_SKIP_DIRS.
 */
const CONTENT_SCAN_SKIP_DIRS = new Set([
	"Library", "Applications", "Music", "Movies", "Pictures",
	"Desktop", "Downloads", "Public", "coverage"
].concat([...ALWAYS_SKIP_DIRS]));
|
|
384
|
+
/**
 * Collect absolute paths of `.md` files under `rootPath` that the dedicated
 * scanners do not already cover.
 *
 * First-level entries named in ROOT_SCAN_SKIP_DIRS or CONTENT_SCAN_SKIP_DIRS,
 * and symbolic links, are ignored. Sub-directories are walked with
 * walkMdFiles. A top-level `MEMORY.md` is left out. Collection stops at
 * MAX_FILES_PER_PROJECT paths. Read errors are swallowed on purpose.
 *
 * @param {string} rootPath Project root directory.
 * @returns {string[]} Absolute file paths; empty when `rootPath` does not exist.
 */
function walkContentFiles(rootPath) {
	if (!existsSync(rootPath)) return [];
	const collected = [];
	try {
		const topLevel = readdirSync(rootPath, { withFileTypes: true });
		for (const entry of topLevel) {
			if (collected.length >= MAX_FILES_PER_PROJECT) break;
			if (entry.isSymbolicLink()) continue;
			const skipped = ROOT_SCAN_SKIP_DIRS.has(entry.name) || CONTENT_SCAN_SKIP_DIRS.has(entry.name);
			if (skipped) continue;
			const fullPath = join(rootPath, entry.name);
			if (entry.isDirectory()) {
				walkMdFiles(fullPath, collected, MAX_FILES_PER_PROJECT);
				continue;
			}
			const isMarkdownFile = entry.isFile() && entry.name.endsWith(".md");
			if (isMarkdownFile && entry.name !== "MEMORY.md") collected.push(fullPath);
		}
	} catch {}
	return collected;
}
|
|
410
|
+
/**
|
|
411
|
+
* Index all memory, Notes, and content files for a single registered project.
|
|
412
|
+
*
|
|
413
|
+
* Scans:
|
|
414
|
+
* - {rootPath}/MEMORY.md → source='memory', tier='evergreen'
|
|
415
|
+
* - {rootPath}/memory/ → source='memory', tier from detectTier()
|
|
416
|
+
* - {rootPath}/Notes/ → source='notes', tier='session'
|
|
417
|
+
* - {rootPath}/**\/\*.md → source='content', tier='topic' (all other .md files, recursive)
|
|
418
|
+
* - {claudeNotesDir}/ → source='notes', tier='session' (if set and different)
|
|
419
|
+
*
|
|
420
|
+
* The content scan covers projects like job-discussions where markdown files
|
|
421
|
+
* live in date/topic subdirectories rather than a memory/ folder. The
|
|
422
|
+
* memory/, Notes/, .git/, and node_modules/ directories are excluded from
|
|
423
|
+
* the content scan to avoid double-indexing.
|
|
424
|
+
*
|
|
425
|
+
* The claudeNotesDir parameter points to ~/.claude/projects/{encoded}/Notes/
|
|
426
|
+
* where Claude Code writes session notes for a given working directory.
|
|
427
|
+
* It is stored on the project row as claude_notes_dir after a registry scan.
|
|
428
|
+
*/
|
|
429
|
+
/**
|
|
430
|
+
* Number of files to process before yielding to the event loop inside
|
|
431
|
+
* indexProject. Keeps IPC responsive even while indexing large projects.
|
|
432
|
+
* Lower = more responsive but more overhead. 10 is a good balance.
|
|
433
|
+
*/
|
|
434
|
+
// indexProject awaits yieldToEventLoop() each time this many files have been handled since the last yield.
const INDEX_YIELD_EVERY = 10;
|
|
435
|
+
/**
 * Decide whether `rootPath` should skip the recursive content scan.
 *
 * The content scan is skipped for:
 *   - the home directory itself or any of its ancestors (including "/"),
 *     because such a scan would be far too broad;
 *   - directories that contain a `.git` entry (code repositories).
 *
 * The ancestor test is path-aware: it asks whether the home directory is
 * reachable from `rootPath` without leaving it. A plain string-prefix test
 * would wrongly treat a sibling such as `/Users/al` as an ancestor of
 * `/Users/alice`.
 *
 * @param {string} rootPath Project root to check.
 * @returns {boolean} true when the content scan must be skipped.
 */
function isPathTooBroadForContentScan(rootPath) {
	const normalized = normalize(rootPath);
	// "" means rootPath IS the home directory; a path that neither climbs out
	// with ".." nor is absolute (another drive on Windows) means home lies inside rootPath.
	const rootToHome = relative(normalized, homedir());
	const climbsOut = rootToHome === ".." || rootToHome.startsWith("../") || rootToHome.startsWith("..\\");
	if (!climbsOut && !isAbsolute(rootToHome)) return true;
	return existsSync(join(normalized, ".git"));
}
|
|
458
|
+
/**
 * Insert one synthetic "title" chunk per session note so that the session
 * number, date, and title encoded in the file name are searchable.
 *
 * Only files named like `NNNN - YYYY-MM-DD - Title.md` are considered. The
 * chunk is stored under the synthetic path `<relative path>::title`.
 *
 * The FTS row is written only when the chunk row was actually inserted.
 * NOTE(review): this assumes memory_fts is an FTS virtual table without a
 * uniqueness constraint, where `INSERT OR IGNORE` cannot suppress duplicates
 * on its own — confirm against the schema.
 *
 * @param db Open federation database (better-sqlite3 style API).
 * @param projectId Owning project id.
 * @param {string} baseDir Directory the synthetic path is made relative to.
 * @param {string[]} noteFiles Absolute paths of the note files to inspect.
 */
function indexSessionTitleChunks(db, projectId, baseDir, noteFiles) {
	const sessionTitleRe = /^(\d{4})\s*-\s*(\d{4}-\d{2}-\d{2})\s*-\s*(.+)\.md$/;
	const insertChunk = db.prepare(`
		INSERT OR IGNORE INTO memory_chunks (id, project_id, source, tier, path, start_line, end_line, hash, text, updated_at)
		VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
	`);
	const insertFts = db.prepare(`
		INSERT OR IGNORE INTO memory_fts (text, id, project_id, path, source, tier, start_line, end_line)
		VALUES (?, ?, ?, ?, ?, ?, ?, ?)
	`);
	const updatedAt = Date.now();
	for (const absPath of noteFiles) {
		const m = sessionTitleRe.exec(basename(absPath));
		if (!m) continue;
		const [, num, date, title] = m;
		const text = `Session #${num} ${date}: ${title}`;
		const syntheticPath = `${relative(baseDir, absPath)}::title`;
		const id = chunkId(projectId, syntheticPath, 0, 0, 0);
		const hash = sha256File(text);
		db.transaction(() => {
			const info = insertChunk.run(id, projectId, "notes", "session", syntheticPath, 0, 0, hash, text, updatedAt);
			// changes === 0 means the chunk already existed, so its FTS row does too.
			if (info?.changes === 0) return;
			insertFts.run(text, id, projectId, syntheticPath, "notes", "session", 0, 0);
		})();
	}
}

/**
 * Index all memory, Notes, and content files for a single registered project.
 *
 * Files gathered:
 *   - {rootPath}/MEMORY.md          → source 'memory', tier 'evergreen'
 *   - {rootPath}/memory/            → source 'memory', tier from detectTier()
 *   - {rootPath}/Notes/             → source 'notes', tier 'session'
 *   - other .md files under rootPath → source 'content', tier 'topic'
 *     (skipped when isPathTooBroadForContentScan(rootPath) is true)
 *   - {claudeNotesDir}/             → source 'notes', tier 'session'
 *     (when set and different from {rootPath}/Notes); if it ends in "/Notes",
 *     the sibling MEMORY.md and memory/ directory are indexed as well
 *
 * Session notes additionally get a synthetic title chunk. The function yields
 * to the event loop before indexing and after every INDEX_YIELD_EVERY files.
 *
 * NOTE(review): paths of claudeNotesDir files are stored relative to that
 * directory, so they may coincide with root-relative paths of other files in
 * the same project — confirm this cannot happen in practice.
 *
 * @param db Open federation database (better-sqlite3 style API).
 * @param projectId Project id stored with every row.
 * @param {string} rootPath Project root directory.
 * @param {string} [claudeNotesDir] Optional additional notes directory.
 * @returns {Promise<{filesProcessed: number, chunksCreated: number, filesSkipped: number}>}
 */
async function indexProject(db, projectId, rootPath, claudeNotesDir) {
	const result = {
		filesProcessed: 0,
		chunksCreated: 0,
		filesSkipped: 0
	};
	const filesToIndex = [];
	const rootMemoryMd = join(rootPath, "MEMORY.md");
	if (existsSync(rootMemoryMd)) filesToIndex.push({
		absPath: rootMemoryMd,
		rootBase: rootPath,
		source: "memory",
		tier: "evergreen"
	});
	const memoryDir = join(rootPath, "memory");
	for (const absPath of walkMdFiles(memoryDir)) filesToIndex.push({
		absPath,
		rootBase: rootPath,
		source: "memory",
		tier: detectTier(relative(rootPath, absPath))
	});
	const notesDir = join(rootPath, "Notes");
	// Walk each notes directory once and reuse the list for both purposes.
	const noteFiles = walkMdFiles(notesDir);
	for (const absPath of noteFiles) filesToIndex.push({
		absPath,
		rootBase: rootPath,
		source: "notes",
		tier: "session"
	});
	indexSessionTitleChunks(db, projectId, rootPath, noteFiles);
	if (!isPathTooBroadForContentScan(rootPath)) {
		for (const absPath of walkContentFiles(rootPath)) filesToIndex.push({
			absPath,
			rootBase: rootPath,
			source: "content",
			tier: "topic"
		});
	}
	if (claudeNotesDir && claudeNotesDir !== notesDir) {
		const claudeNoteFiles = walkMdFiles(claudeNotesDir);
		for (const absPath of claudeNoteFiles) filesToIndex.push({
			absPath,
			rootBase: claudeNotesDir,
			source: "notes",
			tier: "session"
		});
		indexSessionTitleChunks(db, projectId, claudeNotesDir, claudeNoteFiles);
		if (claudeNotesDir.endsWith("/Notes")) {
			// Strip the trailing "Notes" (the slash stays; join() normalizes it).
			const claudeProjectDir = claudeNotesDir.slice(0, -6);
			const claudeMemoryDir = join(claudeProjectDir, "memory");
			const claudeMemoryMd = join(claudeProjectDir, "MEMORY.md");
			if (existsSync(claudeMemoryMd)) filesToIndex.push({
				absPath: claudeMemoryMd,
				rootBase: claudeProjectDir,
				source: "memory",
				tier: "evergreen"
			});
			for (const absPath of walkMdFiles(claudeMemoryDir)) filesToIndex.push({
				absPath,
				rootBase: claudeProjectDir,
				source: "memory",
				tier: detectTier(relative(claudeProjectDir, absPath))
			});
		}
	}
	await yieldToEventLoop();
	// Prepared once and reused for every re-indexed file.
	const countChunks = db.prepare("SELECT COUNT(*) as n FROM memory_chunks WHERE project_id = ? AND path = ?");
	let filesSinceYield = 0;
	for (const { absPath, rootBase, source, tier } of filesToIndex) {
		if (filesSinceYield >= INDEX_YIELD_EVERY) {
			await yieldToEventLoop();
			filesSinceYield = 0;
		}
		filesSinceYield++;
		const relPath = relative(rootBase, absPath);
		if (!indexFile(db, projectId, rootBase, relPath, source, tier)) {
			result.filesSkipped++;
			continue;
		}
		result.filesProcessed++;
		result.chunksCreated += countChunks.get(projectId, relPath).n;
	}
	return result;
}
|
|
592
|
+
/**
 * Give the Node.js event loop a turn so that other pending work (such as IPC
 * requests) can run during long index runs.
 *
 * @returns {Promise<void>} Resolves from a `setImmediate` callback.
 */
function yieldToEventLoop() {
	return new Promise((done) => {
		setImmediate(done);
	});
}
|
|
599
|
+
/**
 * Index every project whose status is 'active' in the registry database.
 *
 * Yields to the event loop before each project so that the daemon's socket
 * server can keep answering IPC requests (e.g. status) while indexing runs.
 *
 * @param db Open federation database, passed through to indexProject.
 * @param registryDb Registry database holding the `projects` table.
 * @returns {Promise<{projects: number, result: {filesProcessed: number, chunksCreated: number, filesSkipped: number}}>}
 *          Number of projects visited and the summed per-project counters.
 */
async function indexAll(db, registryDb) {
	const activeProjects = registryDb.prepare("SELECT id, root_path, claude_notes_dir FROM projects WHERE status = 'active'").all();
	let filesProcessed = 0;
	let chunksCreated = 0;
	let filesSkipped = 0;
	for (const { id, root_path: rootPath, claude_notes_dir: claudeNotesDir } of activeProjects) {
		await yieldToEventLoop();
		const outcome = await indexProject(db, id, rootPath, claudeNotesDir);
		filesProcessed += outcome.filesProcessed;
		chunksCreated += outcome.chunksCreated;
		filesSkipped += outcome.filesSkipped;
	}
	return {
		projects: activeProjects.length,
		result: {
			filesProcessed,
			chunksCreated,
			filesSkipped
		}
	};
}
|
|
624
|
+
/**
 * Generate and store embeddings for chunks that do not yet have one.
 *
 * Because better-sqlite3 is synchronous but the embedding pipeline is async,
 * all unembedded chunk texts are fetched first, embeddings are generated one
 * batch at a time, and each batch is written back in its own transaction.
 *
 * The embeddings module is imported only when there is at least one chunk to
 * embed. A `batchSize` below 1 (or NaN) falls back to 50; without that guard
 * the batch loop would never advance.
 *
 * @param db Open federation database.
 * @param projectId Optional — restrict to a specific project.
 * @param {number} [batchSize] Number of chunks to embed per round. Default 50.
 * @param {(embedded: number, total: number) => void} [onProgress] Optional
 *        callback invoked after each batch with running totals.
 * @returns {Promise<{chunksEmbedded: number, chunksSkipped: number}>}
 *          `chunksSkipped` is always 0 in this implementation.
 */
async function embedChunks(db, projectId, batchSize = 50, onProgress) {
	const conditions = ["embedding IS NULL"];
	const params = [];
	if (projectId !== void 0) {
		conditions.push("project_id = ?");
		params.push(projectId);
	}
	const where = `WHERE ${conditions.join(" AND ")}`;
	const rows = db.prepare(`SELECT id, text FROM memory_chunks ${where} ORDER BY id`).all(...params);
	if (rows.length === 0) return {
		chunksEmbedded: 0,
		chunksSkipped: 0
	};
	// Guard against 0, negative, or NaN sizes, which would stall the loop below.
	const step = batchSize >= 1 ? Math.floor(batchSize) : 50;
	const { generateEmbedding, serializeEmbedding } = await import("./embeddings-mfqv-jFu.mjs").then((n) => n.i);
	const updateStmt = db.prepare("UPDATE memory_chunks SET embedding = ? WHERE id = ?");
	const total = rows.length;
	let embedded = 0;
	for (let i = 0; i < total; i += step) {
		const batch = rows.slice(i, i + step);
		const embeddings = [];
		// Embeddings are generated one at a time, in row order.
		for (const row of batch) {
			const blob = serializeEmbedding(await generateEmbedding(row.text));
			embeddings.push({
				id: row.id,
				blob
			});
		}
		db.transaction(() => {
			for (const { id, blob } of embeddings) updateStmt.run(blob, id);
		})();
		embedded += embeddings.length;
		onProgress?.(embedded, total);
	}
	return {
		chunksEmbedded: embedded,
		chunksSkipped: 0
	};
}
|
|
674
|
+
|
|
675
|
+
//#endregion
|
|
676
|
+
export { indexProject as a, indexFile as i, embedChunks as n, chunkMarkdown as o, indexAll as r, estimateTokens as s, detectTier as t };
|
|
677
|
+
//# sourceMappingURL=indexer-B20bPHL-.mjs.map
|