@disco_trooper/apple-notes-mcp 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +104 -24
- package/package.json +10 -8
- package/src/config/claude.test.ts +47 -0
- package/src/config/claude.ts +106 -0
- package/src/config/constants.ts +11 -2
- package/src/config/paths.test.ts +40 -0
- package/src/config/paths.ts +86 -0
- package/src/db/arrow-fix.test.ts +101 -0
- package/src/db/lancedb.test.ts +209 -2
- package/src/db/lancedb.ts +345 -7
- package/src/embeddings/cache.test.ts +150 -0
- package/src/embeddings/cache.ts +204 -0
- package/src/embeddings/index.ts +21 -2
- package/src/embeddings/local.ts +61 -10
- package/src/embeddings/openrouter.ts +233 -11
- package/src/graph/export.test.ts +81 -0
- package/src/graph/export.ts +163 -0
- package/src/graph/extract.test.ts +90 -0
- package/src/graph/extract.ts +52 -0
- package/src/graph/queries.test.ts +156 -0
- package/src/graph/queries.ts +224 -0
- package/src/index.ts +249 -9
- package/src/notes/crud.test.ts +26 -2
- package/src/notes/crud.ts +43 -5
- package/src/notes/read.ts +83 -68
- package/src/search/chunk-indexer.test.ts +353 -0
- package/src/search/chunk-indexer.ts +207 -0
- package/src/search/chunk-search.test.ts +327 -0
- package/src/search/chunk-search.ts +298 -0
- package/src/search/indexer.ts +151 -109
- package/src/setup.ts +46 -67
- package/src/utils/chunker.test.ts +182 -0
- package/src/utils/chunker.ts +170 -0
- package/src/utils/content-filter.test.ts +225 -0
- package/src/utils/content-filter.ts +275 -0
- package/src/utils/runtime.test.ts +70 -0
- package/src/utils/runtime.ts +40 -0
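Most of this release is a new chunk-based semantic search pipeline: a sliding-window chunker, content filtering, an embedding cache, graph extraction, and LanceDB-backed chunk storage. Before the file diffs, here is a minimal orientation sketch of how the new pieces are driven. This is an illustration assembled from names that appear in the diffs below, not code shipped in the package; the import path and top-level await are assumptions.

```typescript
// Illustration only: driving the new 1.3.0 chunk pipeline.
// fullChunkIndex(): fetch notes -> chunk -> batch-embed -> store in LanceDB.
import { fullChunkIndex, hasChunkIndex } from "./search/chunk-indexer.js";

if (!(await hasChunkIndex())) {
  const stats = await fullChunkIndex();
  console.log(
    `Indexed ${stats.indexed}/${stats.totalChunks} chunks ` +
      `from ${stats.totalNotes} notes in ${stats.timeMs}ms`,
  );
}
```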
package/src/search/chunk-indexer.test.ts
@@ -0,0 +1,353 @@
+import { describe, it, expect, vi, beforeEach, type Mock } from "vitest";
+import type { NoteDetails } from "../notes/read.js";
+import type { ChunkRecord } from "../db/lancedb.js";
+
+// Mock dependencies before importing the module under test
+vi.mock("../embeddings/index.js", () => ({
+  getEmbeddingBatch: vi.fn(),
+}));
+
+vi.mock("../db/lancedb.js", () => ({
+  getChunkStore: vi.fn(() => ({
+    indexChunks: vi.fn(),
+    count: vi.fn().mockResolvedValue(0),
+  })),
+}));
+
+vi.mock("../notes/read.js", () => ({
+  getAllNotesWithContent: vi.fn(),
+}));
+
+vi.mock("../utils/debug.js", () => ({
+  createDebugLogger: vi.fn(() => vi.fn()),
+}));
+
+// Import after mocking
+import { chunkNote, fullChunkIndex, hasChunkIndex } from "./chunk-indexer.js";
+import { getEmbeddingBatch } from "../embeddings/index.js";
+import { getChunkStore } from "../db/lancedb.js";
+import { getAllNotesWithContent } from "../notes/read.js";
+
+describe("chunk-indexer", () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+  });
+
+  describe("chunkNote", () => {
+    // Note: content must be at least 50 chars to pass the content filter
+    it("creates chunks for a note with content", () => {
+      const content = "This is a test note content that is long enough to pass the minimum content length requirement for indexing.";
+      const note: NoteDetails = {
+        id: "note-123",
+        title: "Test Note",
+        folder: "Work",
+        content,
+        htmlContent: `<p>${content}</p>`,
+        created: "2024-01-01T00:00:00.000Z",
+        modified: "2024-01-02T00:00:00.000Z",
+      };
+
+      const chunks = chunkNote(note);
+
+      expect(chunks).toHaveLength(1);
+      expect(chunks[0]).toMatchObject({
+        chunk_id: "note-123_chunk_0",
+        note_id: "note-123",
+        note_title: "Test Note",
+        folder: "Work",
+        chunk_index: 0,
+        total_chunks: 1,
+        content,
+        created: "2024-01-01T00:00:00.000Z",
+        modified: "2024-01-02T00:00:00.000Z",
+      });
+      // Vector should be empty - not generated yet
+      expect(chunks[0].vector).toEqual([]);
+      // indexed_at should be empty - set during batch processing
+      expect(chunks[0].indexed_at).toBe("");
+      // Tags and outlinks should be extracted
+      expect(chunks[0].tags).toEqual([]);
+      expect(chunks[0].outlinks).toEqual([]);
+    });
+
+    it("returns single chunk for notes under chunk size", () => {
+      const note: NoteDetails = {
+        id: "short-note",
+        title: "Short",
+        folder: "Notes",
+        content: "This is a shorter note but still has enough content to pass the minimum length filter requirement.",
+        htmlContent: "",
+        created: "2024-01-01T00:00:00.000Z",
+        modified: "2024-01-01T00:00:00.000Z",
+      };
+
+      const chunks = chunkNote(note);
+
+      expect(chunks).toHaveLength(1);
+      expect(chunks[0].chunk_index).toBe(0);
+      expect(chunks[0].total_chunks).toBe(1);
+    });
+
+    it("returns empty array for empty notes", () => {
+      const note: NoteDetails = {
+        id: "empty-note",
+        title: "Empty",
+        folder: "Notes",
+        content: "",
+        htmlContent: "",
+        created: "2024-01-01T00:00:00.000Z",
+        modified: "2024-01-01T00:00:00.000Z",
+      };
+
+      const chunks = chunkNote(note);
+
+      expect(chunks).toHaveLength(0);
+    });
+
+    it("returns empty array for whitespace-only notes", () => {
+      const note: NoteDetails = {
+        id: "whitespace-note",
+        title: "Whitespace",
+        folder: "Notes",
+        content: " \n\n ",
+        htmlContent: "",
+        created: "2024-01-01T00:00:00.000Z",
+        modified: "2024-01-01T00:00:00.000Z",
+      };
+
+      const chunks = chunkNote(note);
+
+      expect(chunks).toHaveLength(0);
+    });
+
+    it("extracts tags from note content", () => {
+      const note: NoteDetails = {
+        id: "tagged-note",
+        title: "Tagged Note",
+        folder: "Work",
+        content: "This note has #important and #work tags. It also contains enough text to pass the minimum content length requirement.",
+        htmlContent: "",
+        created: "2024-01-01T00:00:00.000Z",
+        modified: "2024-01-01T00:00:00.000Z",
+      };
+
+      const chunks = chunkNote(note);
+
+      expect(chunks).toHaveLength(1);
+      expect(chunks[0].tags).toContain("important");
+      expect(chunks[0].tags).toContain("work");
+    });
+
+    it("extracts outlinks from note content", () => {
+      const note: NoteDetails = {
+        id: "linked-note",
+        title: "Linked Note",
+        folder: "Work",
+        content: "This links to [[Other Note]] and [[Another Note]]. This is additional content to meet the minimum length requirement.",
+        htmlContent: "",
+        created: "2024-01-01T00:00:00.000Z",
+        modified: "2024-01-01T00:00:00.000Z",
+      };
+
+      const chunks = chunkNote(note);
+
+      expect(chunks).toHaveLength(1);
+      expect(chunks[0].outlinks).toContain("Other Note");
+      expect(chunks[0].outlinks).toContain("Another Note");
+    });
+
+    it("creates multiple chunks for long notes", () => {
+      // Create a long note that will produce multiple chunks
+      const longContent = "This is paragraph one. ".repeat(50) + "\n\n" +
+        "This is paragraph two. ".repeat(50);
+      const note: NoteDetails = {
+        id: "long-note",
+        title: "Long Note",
+        folder: "Work",
+        content: longContent,
+        htmlContent: "",
+        created: "2024-01-01T00:00:00.000Z",
+        modified: "2024-01-01T00:00:00.000Z",
+      };
+
+      const chunks = chunkNote(note);
+
+      expect(chunks.length).toBeGreaterThan(1);
+      // Verify chunk IDs are unique
+      const chunkIds = chunks.map(c => c.chunk_id);
+      expect(new Set(chunkIds).size).toBe(chunks.length);
+      // Verify indices are correct
+      chunks.forEach((chunk, i) => {
+        expect(chunk.chunk_index).toBe(i);
+        expect(chunk.total_chunks).toBe(chunks.length);
+        expect(chunk.chunk_id).toBe(`long-note_chunk_${i}`);
+      });
+    });
+  });
+
+  describe("fullChunkIndex", () => {
+    it("processes notes and creates chunks with embeddings", async () => {
+      // Note: content must be at least 50 chars to pass content filter
+      const mockNotes: NoteDetails[] = [
+        {
+          id: "note-1",
+          title: "Note 1",
+          folder: "Work",
+          content: "This is the first note content with enough text to pass the minimum length filter requirement.",
+          htmlContent: "",
+          created: "2024-01-01T00:00:00.000Z",
+          modified: "2024-01-02T00:00:00.000Z",
+        },
+        {
+          id: "note-2",
+          title: "Note 2",
+          folder: "Personal",
+          content: "This is the second note content with enough text to pass the minimum length filter requirement.",
+          htmlContent: "",
+          created: "2024-01-01T00:00:00.000Z",
+          modified: "2024-01-02T00:00:00.000Z",
+        },
+      ];
+
+      const mockVectors = [
+        [0.1, 0.2, 0.3],
+        [0.4, 0.5, 0.6],
+      ];
+
+      const mockIndexChunks = vi.fn();
+      (getAllNotesWithContent as Mock).mockResolvedValue(mockNotes);
+      (getEmbeddingBatch as Mock).mockResolvedValue(mockVectors);
+      (getChunkStore as Mock).mockReturnValue({
+        indexChunks: mockIndexChunks,
+        count: vi.fn().mockResolvedValue(0),
+      });
+
+      const result = await fullChunkIndex();
+
+      // Verify all notes were fetched
+      expect(getAllNotesWithContent).toHaveBeenCalledOnce();
+
+      // Verify embeddings were generated for chunks
+      expect(getEmbeddingBatch).toHaveBeenCalledOnce();
+      const embeddingTexts = (getEmbeddingBatch as Mock).mock.calls[0][0];
+      expect(embeddingTexts).toHaveLength(2);
+
+      // Verify chunks were stored with vectors
+      expect(mockIndexChunks).toHaveBeenCalledOnce();
+      const storedChunks = mockIndexChunks.mock.calls[0][0] as ChunkRecord[];
+      expect(storedChunks).toHaveLength(2);
+      expect(storedChunks[0].vector).toEqual([0.1, 0.2, 0.3]);
+      expect(storedChunks[1].vector).toEqual([0.4, 0.5, 0.6]);
+
+      // Verify indexed_at is set
+      storedChunks.forEach(chunk => {
+        expect(chunk.indexed_at).toBeTruthy();
+        // Should be valid ISO date
+        expect(new Date(chunk.indexed_at).toISOString()).toBe(chunk.indexed_at);
+      });
+
+      // Verify result
+      expect(result).toMatchObject({
+        totalNotes: 2,
+        totalChunks: 2,
+        indexed: 2,
+      });
+      expect(result.timeMs).toBeGreaterThanOrEqual(0);
+    });
+
+    it("handles empty note list", async () => {
+      (getAllNotesWithContent as Mock).mockResolvedValue([]);
+      const mockIndexChunks = vi.fn();
+      (getChunkStore as Mock).mockReturnValue({
+        indexChunks: mockIndexChunks,
+        count: vi.fn().mockResolvedValue(0),
+      });
+
+      const result = await fullChunkIndex();
+
+      expect(result).toMatchObject({
+        totalNotes: 0,
+        totalChunks: 0,
+        indexed: 0,
+      });
+      // Should not call embedding or indexing for empty list
+      expect(getEmbeddingBatch).not.toHaveBeenCalled();
+      expect(mockIndexChunks).not.toHaveBeenCalled();
+    });
+
+    it("skips empty notes in chunking", async () => {
+      // Note: content must be at least 50 chars to pass content filter
+      const mockNotes: NoteDetails[] = [
+        {
+          id: "note-1",
+          title: "Note 1",
+          folder: "Work",
+          content: "This is valid note content with enough characters to pass the minimum length requirement for indexing.",
+          htmlContent: "",
+          created: "2024-01-01T00:00:00.000Z",
+          modified: "2024-01-02T00:00:00.000Z",
+        },
+        {
+          id: "note-2",
+          title: "Empty Note",
+          folder: "Work",
+          content: "", // Empty!
+          htmlContent: "",
+          created: "2024-01-01T00:00:00.000Z",
+          modified: "2024-01-02T00:00:00.000Z",
+        },
+      ];
+
+      const mockIndexChunks = vi.fn();
+      (getAllNotesWithContent as Mock).mockResolvedValue(mockNotes);
+      (getEmbeddingBatch as Mock).mockResolvedValue([[0.1, 0.2, 0.3]]);
+      (getChunkStore as Mock).mockReturnValue({
+        indexChunks: mockIndexChunks,
+        count: vi.fn().mockResolvedValue(0),
+      });
+
+      const result = await fullChunkIndex();
+
+      // Only 1 chunk should be created (empty note skipped)
+      expect(result.totalNotes).toBe(2);
+      expect(result.totalChunks).toBe(1);
+      expect(result.indexed).toBe(1);
+
+      const storedChunks = mockIndexChunks.mock.calls[0][0] as ChunkRecord[];
+      expect(storedChunks).toHaveLength(1);
+      expect(storedChunks[0].note_title).toBe("Note 1");
+    });
+  });
+
+  describe("hasChunkIndex", () => {
+    it("returns true when chunk index exists", async () => {
+      (getChunkStore as Mock).mockReturnValue({
+        count: vi.fn().mockResolvedValue(10),
+      });
+
+      const result = await hasChunkIndex();
+
+      expect(result).toBe(true);
+    });
+
+    it("returns false when chunk index is empty", async () => {
+      (getChunkStore as Mock).mockReturnValue({
+        count: vi.fn().mockResolvedValue(0),
+      });
+
+      const result = await hasChunkIndex();
+
+      expect(result).toBe(false);
+    });
+
+    it("returns false when chunk store throws (table not found)", async () => {
+      (getChunkStore as Mock).mockReturnValue({
+        count: vi.fn().mockRejectedValue(new Error("Chunk index not found")),
+      });
+
+      const result = await hasChunkIndex();
+
+      expect(result).toBe(false);
+    });
+  });
+});
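The tests above treat `chunkText` from `src/utils/chunker.ts` as a black box; that file is not shown in this section. Purely as a stand-in consistent with the call sites in this diff (a `{ chunkSize, overlap }` options object and chunks carrying `index`, `totalChunks`, and `content`), a character-based sliding-window chunker could look like the sketch below. The package's actual chunker may split on paragraph or sentence boundaries instead.

```typescript
// Hypothetical sliding-window chunker; NOT the package's implementation.
interface TextChunk {
  index: number;
  totalChunks: number;
  content: string;
}

function chunkTextSketch(
  text: string,
  opts: { chunkSize: number; overlap: number },
): TextChunk[] {
  const trimmed = text.trim();
  if (trimmed.length === 0) return []; // empty/whitespace input yields no chunks
  // Each window advances by chunkSize - overlap, so consecutive chunks
  // share `overlap` characters of context.
  const step = Math.max(1, opts.chunkSize - opts.overlap);
  const contents: string[] = [];
  for (let start = 0; start < trimmed.length; start += step) {
    contents.push(trimmed.slice(start, start + opts.chunkSize));
    if (start + opts.chunkSize >= trimmed.length) break;
  }
  return contents.map((content, index) => ({
    index,
    totalChunks: contents.length,
    content,
  }));
}
```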
package/src/search/chunk-indexer.ts
@@ -0,0 +1,207 @@
+/**
+ * Chunk indexer for Parent Document Retriever pattern.
+ * Splits notes into overlapping chunks, generates embeddings, and stores in LanceDB.
+ */
+
+import { getEmbeddingBatch } from "../embeddings/index.js";
+import { getChunkStore, type ChunkRecord } from "../db/lancedb.js";
+import { getAllNotesWithContent, type NoteDetails } from "../notes/read.js";
+import { chunkText } from "../utils/chunker.js";
+import { extractMetadata } from "../graph/extract.js";
+import { DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP } from "../config/constants.js";
+import { createDebugLogger } from "../utils/debug.js";
+import { filterContent, shouldIndexContent } from "../utils/content-filter.js";
+
+// Debug logging
+const debug = createDebugLogger("CHUNK-INDEXER");
+
+/**
+ * Result of a chunk indexing operation.
+ */
+export interface ChunkIndexResult {
+  /** Total number of notes processed */
+  totalNotes: number;
+  /** Total number of chunks created */
+  totalChunks: number;
+  /** Number of chunks indexed (with embeddings) */
+  indexed: number;
+  /** Time taken in milliseconds */
+  timeMs: number;
+}
+
+/** Chunk record for internal processing - explicit types to avoid index signature issues */
+interface InternalChunkRecord {
+  chunk_id: string;
+  note_id: string;
+  note_title: string;
+  folder: string;
+  chunk_index: number;
+  total_chunks: number;
+  content: string;
+  vector: number[];
+  created: string;
+  modified: string;
+  indexed_at: string;
+  tags: string[];
+  outlinks: string[];
+}
+
+/**
+ * Convert a note into chunk records WITHOUT vectors.
+ * Vectors are added later during batch embedding generation.
+ *
+ * Filters out Base64/binary content before chunking to improve search quality.
+ *
+ * @param note - The note to chunk
+ * @returns Array of ChunkRecord with empty vectors
+ */
+export function chunkNote(note: NoteDetails): InternalChunkRecord[] {
+  // Quick check - skip notes with mostly encoded content
+  if (!shouldIndexContent(note.content)) {
+    debug(`Note "${note.title}" skipped: contains mostly encoded/binary content`);
+    return [];
+  }
+
+  // Filter content to remove Base64 blocks and redact secrets
+  const filterResult = filterContent(note.content);
+
+  if (filterResult.action === "skip") {
+    debug(`Note "${note.title}" skipped: ${filterResult.reasons.join(", ")}`);
+    return [];
+  }
+
+  const contentToChunk = filterResult.cleanedContent || note.content;
+
+  if (filterResult.action === "filter") {
+    debug(`Note "${note.title}" filtered: ${filterResult.reasons.join(", ")}`);
+  }
+
+  // Extract metadata from the ORIGINAL content (tags/links should be preserved)
+  const { tags, outlinks } = extractMetadata(note.content);
+
+  // Chunk the filtered content
+  const chunks = chunkText(contentToChunk, {
+    chunkSize: DEFAULT_CHUNK_SIZE,
+    overlap: DEFAULT_CHUNK_OVERLAP,
+  });
+
+  // Return empty array for empty notes (chunkText handles this)
+  if (chunks.length === 0) {
+    debug(`Note "${note.title}" has no content to chunk`);
+    return [];
+  }
+
+  debug(`Note "${note.title}" chunked into ${chunks.length} chunks`);
+
+  // Convert to ChunkRecord format
+  return chunks.map((chunk) => ({
+    chunk_id: `${note.id}_chunk_${chunk.index}`,
+    note_id: note.id,
+    note_title: note.title,
+    folder: note.folder,
+    chunk_index: chunk.index,
+    total_chunks: chunk.totalChunks,
+    content: chunk.content,
+    vector: [], // Empty - to be filled during embedding generation
+    created: note.created,
+    modified: note.modified,
+    indexed_at: "", // Empty - to be set during batch processing
+    tags,
+    outlinks,
+  }));
+}
+
+/**
+ * Perform a full chunk index of all notes.
+ *
+ * Phases:
+ * 1. Fetch all notes via getAllNotesWithContent
+ * 2. Chunk all notes using chunkNote
+ * 3. Generate embeddings in batch using getEmbeddingBatch
+ * 4. Combine chunks with vectors and set indexed_at
+ * 5. Store via getChunkStore().indexChunks()
+ *
+ * @returns ChunkIndexResult with stats
+ */
+export async function fullChunkIndex(): Promise<ChunkIndexResult> {
+  const startTime = Date.now();
+
+  // Phase 1: Fetch all notes
+  debug("Phase 1: Fetching all notes...");
+  const notes = await getAllNotesWithContent();
+  debug(`Fetched ${notes.length} notes`);
+
+  if (notes.length === 0) {
+    return {
+      totalNotes: 0,
+      totalChunks: 0,
+      indexed: 0,
+      timeMs: Date.now() - startTime,
+    };
+  }
+
+  // Phase 2: Chunk all notes
+  debug("Phase 2: Chunking all notes...");
+  const allChunks: InternalChunkRecord[] = [];
+  for (const note of notes) {
+    const noteChunks = chunkNote(note);
+    allChunks.push(...noteChunks);
+  }
+  debug(`Created ${allChunks.length} chunks from ${notes.length} notes`);
+
+  if (allChunks.length === 0) {
+    return {
+      totalNotes: notes.length,
+      totalChunks: 0,
+      indexed: 0,
+      timeMs: Date.now() - startTime,
+    };
+  }
+
+  // Phase 3: Generate embeddings in batch
+  debug("Phase 3: Generating embeddings...");
+  const chunkTexts: string[] = allChunks.map((chunk) => chunk.content);
+  const vectors = await getEmbeddingBatch(chunkTexts);
+  debug(`Generated ${vectors.length} embeddings`);
+
+  // Phase 4: Combine chunks with vectors and set indexed_at
+  debug("Phase 4: Combining chunks with vectors...");
+  const indexedAt = new Date().toISOString();
+  const completeChunks: ChunkRecord[] = allChunks.map((chunk, i) => ({
+    ...chunk,
+    vector: vectors[i],
+    indexed_at: indexedAt,
+  }));
+
+  // Phase 5: Store in LanceDB
+  debug("Phase 5: Storing chunks...");
+  const chunkStore = getChunkStore();
+  await chunkStore.indexChunks(completeChunks);
+  debug(`Stored ${completeChunks.length} chunks`);
+
+  const timeMs = Date.now() - startTime;
+  debug(`Chunk indexing completed in ${timeMs}ms`);
+
+  return {
+    totalNotes: notes.length,
+    totalChunks: allChunks.length,
+    indexed: completeChunks.length,
+    timeMs,
+  };
+}
+
+/**
+ * Check if a chunk index exists.
+ *
+ * @returns true if chunk index has records, false otherwise
+ */
+export async function hasChunkIndex(): Promise<boolean> {
+  try {
+    const chunkStore = getChunkStore();
+    const count = await chunkStore.count();
+    return count > 0;
+  } catch {
+    // Table doesn't exist or error - no index
+    return false;
+  }
+}
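Note the two-phase design exercised by the tests at the top of this diff: `chunkNote` is side-effect free and returns records with `vector: []` and `indexed_at: ""`, so `fullChunkIndex` can embed every chunk across all notes in a single `getEmbeddingBatch` call rather than one call per note. A quick illustration of the intermediate shape; the note literal follows the test fixtures, and the ~50-character minimum comes from the test comments:

```typescript
import { chunkNote } from "./search/chunk-indexer.js";

const records = chunkNote({
  id: "note-123",
  title: "Test Note",
  folder: "Work",
  content:
    "This is a test note content that is long enough to pass the minimum content length requirement for indexing.",
  htmlContent: "",
  created: "2024-01-01T00:00:00.000Z",
  modified: "2024-01-02T00:00:00.000Z",
});

// records[0].chunk_id   === "note-123_chunk_0"
// records[0].vector     -> []  (filled later by the single batched embedding call)
// records[0].indexed_at -> ""  (stamped once the batch is stored in LanceDB)
```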