@disco_trooper/apple-notes-mcp 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,353 @@
+ import { describe, it, expect, vi, beforeEach, type Mock } from "vitest";
+ import type { NoteDetails } from "../notes/read.js";
+ import type { ChunkRecord } from "../db/lancedb.js";
+
+ // Mock dependencies before importing the module under test
+ vi.mock("../embeddings/index.js", () => ({
+   getEmbeddingBatch: vi.fn(),
+ }));
+
+ vi.mock("../db/lancedb.js", () => ({
+   getChunkStore: vi.fn(() => ({
+     indexChunks: vi.fn(),
+     count: vi.fn().mockResolvedValue(0),
+   })),
+ }));
+
+ vi.mock("../notes/read.js", () => ({
+   getAllNotesWithContent: vi.fn(),
+ }));
+
+ vi.mock("../utils/debug.js", () => ({
+   createDebugLogger: vi.fn(() => vi.fn()),
+ }));
+
+ // Import after mocking
+ import { chunkNote, fullChunkIndex, hasChunkIndex } from "./chunk-indexer.js";
+ import { getEmbeddingBatch } from "../embeddings/index.js";
+ import { getChunkStore } from "../db/lancedb.js";
+ import { getAllNotesWithContent } from "../notes/read.js";
+
+ describe("chunk-indexer", () => {
+   beforeEach(() => {
+     vi.clearAllMocks();
+   });
+
+   describe("chunkNote", () => {
+     // Note: content must be at least 50 chars to pass the content filter
+     it("creates chunks for a note with content", () => {
+       const content = "This is a test note content that is long enough to pass the minimum content length requirement for indexing.";
+       const note: NoteDetails = {
+         id: "note-123",
+         title: "Test Note",
+         folder: "Work",
+         content,
+         htmlContent: `<p>${content}</p>`,
+         created: "2024-01-01T00:00:00.000Z",
+         modified: "2024-01-02T00:00:00.000Z",
+       };
+
+       const chunks = chunkNote(note);
+
+       expect(chunks).toHaveLength(1);
+       expect(chunks[0]).toMatchObject({
+         chunk_id: "note-123_chunk_0",
+         note_id: "note-123",
+         note_title: "Test Note",
+         folder: "Work",
+         chunk_index: 0,
+         total_chunks: 1,
+         content,
+         created: "2024-01-01T00:00:00.000Z",
+         modified: "2024-01-02T00:00:00.000Z",
+       });
+       // Vector should be empty - not generated yet
+       expect(chunks[0].vector).toEqual([]);
+       // indexed_at should be empty - set during batch processing
+       expect(chunks[0].indexed_at).toBe("");
+       // Tags and outlinks are extracted; this content contains none
+       expect(chunks[0].tags).toEqual([]);
+       expect(chunks[0].outlinks).toEqual([]);
+     });
+
+     it("returns single chunk for notes under chunk size", () => {
+       const note: NoteDetails = {
+         id: "short-note",
+         title: "Short",
+         folder: "Notes",
+         content: "This is a shorter note but still has enough content to pass the minimum length filter requirement.",
+         htmlContent: "",
+         created: "2024-01-01T00:00:00.000Z",
+         modified: "2024-01-01T00:00:00.000Z",
+       };
+
+       const chunks = chunkNote(note);
+
+       expect(chunks).toHaveLength(1);
+       expect(chunks[0].chunk_index).toBe(0);
+       expect(chunks[0].total_chunks).toBe(1);
+     });
+
+     it("returns empty array for empty notes", () => {
+       const note: NoteDetails = {
+         id: "empty-note",
+         title: "Empty",
+         folder: "Notes",
+         content: "",
+         htmlContent: "",
+         created: "2024-01-01T00:00:00.000Z",
+         modified: "2024-01-01T00:00:00.000Z",
+       };
+
+       const chunks = chunkNote(note);
+
+       expect(chunks).toHaveLength(0);
+     });
+
+     it("returns empty array for whitespace-only notes", () => {
+       const note: NoteDetails = {
+         id: "whitespace-note",
+         title: "Whitespace",
+         folder: "Notes",
+         content: " \n\n ",
+         htmlContent: "",
+         created: "2024-01-01T00:00:00.000Z",
+         modified: "2024-01-01T00:00:00.000Z",
+       };
+
+       const chunks = chunkNote(note);
+
+       expect(chunks).toHaveLength(0);
+     });
+
+     it("extracts tags from note content", () => {
+       const note: NoteDetails = {
+         id: "tagged-note",
+         title: "Tagged Note",
+         folder: "Work",
+         content: "This note has #important and #work tags. It also contains enough text to pass the minimum content length requirement.",
+         htmlContent: "",
+         created: "2024-01-01T00:00:00.000Z",
+         modified: "2024-01-01T00:00:00.000Z",
+       };
+
+       const chunks = chunkNote(note);
+
+       expect(chunks).toHaveLength(1);
+       expect(chunks[0].tags).toContain("important");
+       expect(chunks[0].tags).toContain("work");
+     });
+
+     it("extracts outlinks from note content", () => {
+       const note: NoteDetails = {
+         id: "linked-note",
+         title: "Linked Note",
+         folder: "Work",
+         content: "This links to [[Other Note]] and [[Another Note]]. This is additional content to meet the minimum length requirement.",
+         htmlContent: "",
+         created: "2024-01-01T00:00:00.000Z",
+         modified: "2024-01-01T00:00:00.000Z",
+       };
+
+       const chunks = chunkNote(note);
+
+       expect(chunks).toHaveLength(1);
+       expect(chunks[0].outlinks).toContain("Other Note");
+       expect(chunks[0].outlinks).toContain("Another Note");
+     });
+
+     it("creates multiple chunks for long notes", () => {
+       // Create a long note that will produce multiple chunks
+       const longContent = "This is paragraph one. ".repeat(50) + "\n\n" +
+         "This is paragraph two. ".repeat(50);
+       const note: NoteDetails = {
+         id: "long-note",
+         title: "Long Note",
+         folder: "Work",
+         content: longContent,
+         htmlContent: "",
+         created: "2024-01-01T00:00:00.000Z",
+         modified: "2024-01-01T00:00:00.000Z",
+       };
+
+       const chunks = chunkNote(note);
+
+       expect(chunks.length).toBeGreaterThan(1);
+       // Verify chunk IDs are unique
+       const chunkIds = chunks.map(c => c.chunk_id);
+       expect(new Set(chunkIds).size).toBe(chunks.length);
+       // Verify indices are correct
+       chunks.forEach((chunk, i) => {
+         expect(chunk.chunk_index).toBe(i);
+         expect(chunk.total_chunks).toBe(chunks.length);
+         expect(chunk.chunk_id).toBe(`long-note_chunk_${i}`);
+       });
+     });
+   });
+
+   describe("fullChunkIndex", () => {
+     it("processes notes and creates chunks with embeddings", async () => {
+       // Note: content must be at least 50 chars to pass content filter
+       const mockNotes: NoteDetails[] = [
+         {
+           id: "note-1",
+           title: "Note 1",
+           folder: "Work",
+           content: "This is the first note content with enough text to pass the minimum length filter requirement.",
+           htmlContent: "",
+           created: "2024-01-01T00:00:00.000Z",
+           modified: "2024-01-02T00:00:00.000Z",
+         },
+         {
+           id: "note-2",
+           title: "Note 2",
+           folder: "Personal",
+           content: "This is the second note content with enough text to pass the minimum length filter requirement.",
+           htmlContent: "",
+           created: "2024-01-01T00:00:00.000Z",
+           modified: "2024-01-02T00:00:00.000Z",
+         },
+       ];
+
+       const mockVectors = [
+         [0.1, 0.2, 0.3],
+         [0.4, 0.5, 0.6],
+       ];
+
+       const mockIndexChunks = vi.fn();
+       (getAllNotesWithContent as Mock).mockResolvedValue(mockNotes);
+       (getEmbeddingBatch as Mock).mockResolvedValue(mockVectors);
+       (getChunkStore as Mock).mockReturnValue({
+         indexChunks: mockIndexChunks,
+         count: vi.fn().mockResolvedValue(0),
+       });
+
+       const result = await fullChunkIndex();
+
+       // Verify all notes were fetched
+       expect(getAllNotesWithContent).toHaveBeenCalledOnce();
+
+       // Verify embeddings were generated for chunks
+       expect(getEmbeddingBatch).toHaveBeenCalledOnce();
+       const embeddingTexts = (getEmbeddingBatch as Mock).mock.calls[0][0];
+       expect(embeddingTexts).toHaveLength(2);
+
+       // Verify chunks were stored with vectors
+       expect(mockIndexChunks).toHaveBeenCalledOnce();
+       const storedChunks = mockIndexChunks.mock.calls[0][0] as ChunkRecord[];
+       expect(storedChunks).toHaveLength(2);
+       expect(storedChunks[0].vector).toEqual([0.1, 0.2, 0.3]);
+       expect(storedChunks[1].vector).toEqual([0.4, 0.5, 0.6]);
+
+       // Verify indexed_at is set
+       storedChunks.forEach(chunk => {
+         expect(chunk.indexed_at).toBeTruthy();
+         // Should be valid ISO date
+         expect(new Date(chunk.indexed_at).toISOString()).toBe(chunk.indexed_at);
+       });
+
+       // Verify result
+       expect(result).toMatchObject({
+         totalNotes: 2,
+         totalChunks: 2,
+         indexed: 2,
+       });
+       expect(result.timeMs).toBeGreaterThanOrEqual(0);
+     });
+
+     it("handles empty note list", async () => {
+       (getAllNotesWithContent as Mock).mockResolvedValue([]);
+       const mockIndexChunks = vi.fn();
+       (getChunkStore as Mock).mockReturnValue({
+         indexChunks: mockIndexChunks,
+         count: vi.fn().mockResolvedValue(0),
+       });
+
+       const result = await fullChunkIndex();
+
+       expect(result).toMatchObject({
+         totalNotes: 0,
+         totalChunks: 0,
+         indexed: 0,
+       });
+       // Should not call embedding or indexing for empty list
+       expect(getEmbeddingBatch).not.toHaveBeenCalled();
+       expect(mockIndexChunks).not.toHaveBeenCalled();
+     });
+
+     it("skips empty notes in chunking", async () => {
+       // Note: content must be at least 50 chars to pass content filter
+       const mockNotes: NoteDetails[] = [
+         {
+           id: "note-1",
+           title: "Note 1",
+           folder: "Work",
+           content: "This is valid note content with enough characters to pass the minimum length requirement for indexing.",
+           htmlContent: "",
+           created: "2024-01-01T00:00:00.000Z",
+           modified: "2024-01-02T00:00:00.000Z",
+         },
+         {
+           id: "note-2",
+           title: "Empty Note",
+           folder: "Work",
+           content: "", // Empty!
+           htmlContent: "",
+           created: "2024-01-01T00:00:00.000Z",
+           modified: "2024-01-02T00:00:00.000Z",
+         },
+       ];
+
+       const mockIndexChunks = vi.fn();
+       (getAllNotesWithContent as Mock).mockResolvedValue(mockNotes);
+       (getEmbeddingBatch as Mock).mockResolvedValue([[0.1, 0.2, 0.3]]);
+       (getChunkStore as Mock).mockReturnValue({
+         indexChunks: mockIndexChunks,
+         count: vi.fn().mockResolvedValue(0),
+       });
+
+       const result = await fullChunkIndex();
+
+       // Only 1 chunk should be created (empty note skipped)
+       expect(result.totalNotes).toBe(2);
+       expect(result.totalChunks).toBe(1);
+       expect(result.indexed).toBe(1);
+
+       const storedChunks = mockIndexChunks.mock.calls[0][0] as ChunkRecord[];
+       expect(storedChunks).toHaveLength(1);
+       expect(storedChunks[0].note_title).toBe("Note 1");
+     });
+   });
+
+   describe("hasChunkIndex", () => {
+     it("returns true when chunk index exists", async () => {
+       (getChunkStore as Mock).mockReturnValue({
+         count: vi.fn().mockResolvedValue(10),
+       });
+
+       const result = await hasChunkIndex();
+
+       expect(result).toBe(true);
+     });
+
+     it("returns false when chunk index is empty", async () => {
+       (getChunkStore as Mock).mockReturnValue({
+         count: vi.fn().mockResolvedValue(0),
+       });
+
+       const result = await hasChunkIndex();
+
+       expect(result).toBe(false);
+     });
+
+     it("returns false when chunk store throws (table not found)", async () => {
+       (getChunkStore as Mock).mockReturnValue({
+         count: vi.fn().mockRejectedValue(new Error("Chunk index not found")),
+       });
+
+       const result = await hasChunkIndex();
+
+       expect(result).toBe(false);
+     });
+   });
+ });
@@ -0,0 +1,207 @@
+ /**
+  * Chunk indexer for Parent Document Retriever pattern.
+  * Splits notes into overlapping chunks, generates embeddings, and stores in LanceDB.
+  */
+
+ import { getEmbeddingBatch } from "../embeddings/index.js";
+ import { getChunkStore, type ChunkRecord } from "../db/lancedb.js";
+ import { getAllNotesWithContent, type NoteDetails } from "../notes/read.js";
+ import { chunkText } from "../utils/chunker.js";
+ import { extractMetadata } from "../graph/extract.js";
+ import { DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP } from "../config/constants.js";
+ import { createDebugLogger } from "../utils/debug.js";
+ import { filterContent, shouldIndexContent } from "../utils/content-filter.js";
+
+ // Debug logging
+ const debug = createDebugLogger("CHUNK-INDEXER");
+
+ /**
+  * Result of a chunk indexing operation.
+  */
+ export interface ChunkIndexResult {
+   /** Total number of notes processed */
+   totalNotes: number;
+   /** Total number of chunks created */
+   totalChunks: number;
+   /** Number of chunks indexed (with embeddings) */
+   indexed: number;
+   /** Time taken in milliseconds */
+   timeMs: number;
+ }
+
+ /** Chunk record for internal processing - explicit types to avoid index signature issues */
+ interface InternalChunkRecord {
+   chunk_id: string;
+   note_id: string;
+   note_title: string;
+   folder: string;
+   chunk_index: number;
+   total_chunks: number;
+   content: string;
+   vector: number[];
+   created: string;
+   modified: string;
+   indexed_at: string;
+   tags: string[];
+   outlinks: string[];
+ }
+
+ /**
+  * Convert a note into chunk records WITHOUT vectors.
+  * Vectors are added later during batch embedding generation.
+  *
+  * Filters out Base64/binary content before chunking to improve search quality.
+  *
+  * @param note - The note to chunk
+  * @returns Array of InternalChunkRecord with empty vectors
+  */
+ export function chunkNote(note: NoteDetails): InternalChunkRecord[] {
+   // Quick check - skip notes with mostly encoded content
+   if (!shouldIndexContent(note.content)) {
+     debug(`Note "${note.title}" skipped: contains mostly encoded/binary content`);
+     return [];
+   }
+
+   // Filter content to remove Base64 blocks and redact secrets
+   const filterResult = filterContent(note.content);
+
+   if (filterResult.action === "skip") {
+     debug(`Note "${note.title}" skipped: ${filterResult.reasons.join(", ")}`);
+     return [];
+   }
+
+   const contentToChunk = filterResult.cleanedContent || note.content;
+
+   if (filterResult.action === "filter") {
+     debug(`Note "${note.title}" filtered: ${filterResult.reasons.join(", ")}`);
+   }
+
+   // Extract metadata from the ORIGINAL content (tags/links should be preserved)
+   const { tags, outlinks } = extractMetadata(note.content);
+
+   // Chunk the filtered content
+   const chunks = chunkText(contentToChunk, {
+     chunkSize: DEFAULT_CHUNK_SIZE,
+     overlap: DEFAULT_CHUNK_OVERLAP,
+   });
+
+   // Return empty array for empty notes (chunkText handles this)
+   if (chunks.length === 0) {
+     debug(`Note "${note.title}" has no content to chunk`);
+     return [];
+   }
+
+   debug(`Note "${note.title}" chunked into ${chunks.length} chunks`);
+
+   // Convert to ChunkRecord format
+   return chunks.map((chunk) => ({
+     chunk_id: `${note.id}_chunk_${chunk.index}`,
+     note_id: note.id,
+     note_title: note.title,
+     folder: note.folder,
+     chunk_index: chunk.index,
+     total_chunks: chunk.totalChunks,
+     content: chunk.content,
+     vector: [], // Empty - to be filled during embedding generation
+     created: note.created,
+     modified: note.modified,
+     indexed_at: "", // Empty - to be set during batch processing
+     tags,
+     outlinks,
+   }));
+ }
+
+ /**
+  * Perform a full chunk index of all notes.
+  *
+  * Phases:
+  * 1. Fetch all notes via getAllNotesWithContent
+  * 2. Chunk all notes using chunkNote
+  * 3. Generate embeddings in batch using getEmbeddingBatch
+  * 4. Combine chunks with vectors and set indexed_at
+  * 5. Store via getChunkStore().indexChunks()
+  *
+  * @returns ChunkIndexResult with stats
+  */
+ export async function fullChunkIndex(): Promise<ChunkIndexResult> {
+   const startTime = Date.now();
+
+   // Phase 1: Fetch all notes
+   debug("Phase 1: Fetching all notes...");
+   const notes = await getAllNotesWithContent();
+   debug(`Fetched ${notes.length} notes`);
+
+   if (notes.length === 0) {
+     return {
+       totalNotes: 0,
+       totalChunks: 0,
+       indexed: 0,
+       timeMs: Date.now() - startTime,
+     };
+   }
+
+   // Phase 2: Chunk all notes
+   debug("Phase 2: Chunking all notes...");
+   const allChunks: InternalChunkRecord[] = [];
+   for (const note of notes) {
+     const noteChunks = chunkNote(note);
+     allChunks.push(...noteChunks);
+   }
+   debug(`Created ${allChunks.length} chunks from ${notes.length} notes`);
+
+   if (allChunks.length === 0) {
+     return {
+       totalNotes: notes.length,
+       totalChunks: 0,
+       indexed: 0,
+       timeMs: Date.now() - startTime,
+     };
+   }
+
+   // Phase 3: Generate embeddings in batch
+   debug("Phase 3: Generating embeddings...");
+   const chunkTexts: string[] = allChunks.map((chunk) => chunk.content);
+   const vectors = await getEmbeddingBatch(chunkTexts);
+   debug(`Generated ${vectors.length} embeddings`);
+
+   // Phase 4: Combine chunks with vectors and set indexed_at
+   debug("Phase 4: Combining chunks with vectors...");
+   const indexedAt = new Date().toISOString();
+   const completeChunks: ChunkRecord[] = allChunks.map((chunk, i) => ({
+     ...chunk,
+     vector: vectors[i],
+     indexed_at: indexedAt,
+   }));
+
+   // Phase 5: Store in LanceDB
+   debug("Phase 5: Storing chunks...");
+   const chunkStore = getChunkStore();
+   await chunkStore.indexChunks(completeChunks);
+   debug(`Stored ${completeChunks.length} chunks`);
+
+   const timeMs = Date.now() - startTime;
+   debug(`Chunk indexing completed in ${timeMs}ms`);
+
+   return {
+     totalNotes: notes.length,
+     totalChunks: allChunks.length,
+     indexed: completeChunks.length,
+     timeMs,
+   };
+ }
+
+ /**
+  * Check if a chunk index exists.
+  *
+  * @returns true if chunk index has records, false otherwise
+  */
+ export async function hasChunkIndex(): Promise<boolean> {
+   try {
+     const chunkStore = getChunkStore();
+     const count = await chunkStore.count();
+     return count > 0;
+   } catch {
+     // Table doesn't exist or error - no index
+     return false;
+   }
+ }
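
For orientation, a minimal consumer sketch (not part of the diff above): a caller could gate a full rebuild on hasChunkIndex() before serving semantic search. Only fullChunkIndex and hasChunkIndex come from the module shown in this diff; the ensureChunkIndex helper name and its import path are hypothetical.

import { fullChunkIndex, hasChunkIndex } from "./chunk-indexer.js";

// Hypothetical startup hook: build the chunk index once if it is missing.
export async function ensureChunkIndex(): Promise<void> {
  if (await hasChunkIndex()) {
    return; // index already has records; nothing to do
  }
  const result = await fullChunkIndex();
  // Log to stderr; an MCP stdio server typically reserves stdout for protocol traffic.
  console.error(
    `Indexed ${result.indexed}/${result.totalChunks} chunks from ` +
      `${result.totalNotes} notes in ${result.timeMs}ms`,
  );
}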