@disco_trooper/apple-notes-mcp 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. package/README.md +104 -24
  2. package/package.json +11 -12
  3. package/src/config/claude.test.ts +47 -0
  4. package/src/config/claude.ts +106 -0
  5. package/src/config/constants.ts +11 -2
  6. package/src/config/paths.test.ts +40 -0
  7. package/src/config/paths.ts +86 -0
  8. package/src/db/arrow-fix.test.ts +101 -0
  9. package/src/db/lancedb.test.ts +254 -2
  10. package/src/db/lancedb.ts +385 -38
  11. package/src/embeddings/cache.test.ts +150 -0
  12. package/src/embeddings/cache.ts +204 -0
  13. package/src/embeddings/index.ts +22 -4
  14. package/src/embeddings/local.ts +57 -17
  15. package/src/embeddings/openrouter.ts +233 -11
  16. package/src/errors/index.test.ts +64 -0
  17. package/src/errors/index.ts +62 -0
  18. package/src/graph/export.test.ts +81 -0
  19. package/src/graph/export.ts +163 -0
  20. package/src/graph/extract.test.ts +90 -0
  21. package/src/graph/extract.ts +52 -0
  22. package/src/graph/queries.test.ts +156 -0
  23. package/src/graph/queries.ts +224 -0
  24. package/src/index.ts +309 -23
  25. package/src/notes/conversion.ts +62 -0
  26. package/src/notes/crud.test.ts +41 -8
  27. package/src/notes/crud.ts +75 -64
  28. package/src/notes/read.test.ts +58 -3
  29. package/src/notes/read.ts +142 -210
  30. package/src/notes/resolve.ts +174 -0
  31. package/src/notes/tables.ts +69 -40
  32. package/src/search/chunk-indexer.test.ts +353 -0
  33. package/src/search/chunk-indexer.ts +207 -0
  34. package/src/search/chunk-search.test.ts +327 -0
  35. package/src/search/chunk-search.ts +298 -0
  36. package/src/search/index.ts +4 -6
  37. package/src/search/indexer.ts +164 -109
  38. package/src/setup.ts +46 -67
  39. package/src/types/index.ts +4 -0
  40. package/src/utils/chunker.test.ts +182 -0
  41. package/src/utils/chunker.ts +170 -0
  42. package/src/utils/content-filter.test.ts +225 -0
  43. package/src/utils/content-filter.ts +275 -0
  44. package/src/utils/debug.ts +0 -2
  45. package/src/utils/runtime.test.ts +70 -0
  46. package/src/utils/runtime.ts +40 -0
  47. package/src/utils/text.test.ts +32 -0
  48. package/CLAUDE.md +0 -56
  49. package/src/server.ts +0 -427
@@ -65,6 +65,68 @@ export function parseTable(html: string): TableData {
65
65
  return result;
66
66
  }
67
67
 
68
/**
 * Locate the `<tr>` element at a given zero-based position in table HTML.
 *
 * @param tableHtml - HTML that contains the table rows
 * @param rowIndex - Zero-based position of the wanted row
 * @returns The regex match for the whole `<tr>…</tr>` element together with
 *          its inner HTML, or `null` when the table has no row at that position
 */
function findRowHtml(tableHtml: string, rowIndex: number): { match: RegExpMatchArray; content: string } | null {
  const rowPattern = /<tr[^>]*>([\s\S]*?)<\/tr>/gi;

  // Walk the rows in document order, counting as we go.
  for (
    let position = 0, found = rowPattern.exec(tableHtml);
    found !== null;
    position++, found = rowPattern.exec(tableHtml)
  ) {
    if (position === rowIndex) {
      return { match: found, content: found[1] };
    }
  }

  return null;
}
84
+
85
/**
 * Escape HTML special characters to prevent injection.
 *
 * `&` is replaced first so that the entities produced by the later
 * replacements are not escaped a second time.
 *
 * @param text - Raw text to embed in HTML
 * @returns The text with `&`, `<`, `>` and `"` replaced by entities
 */
function escapeHtml(text: string): string {
  return text
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;");
}

/**
 * Update a specific cell within a row's HTML.
 *
 * The cell is rewritten in place at the position where the regex matched it.
 * A textual search-and-replace is deliberately avoided: it would hit the first
 * cell with identical markup (e.g. two cells holding the same text) rather than
 * the requested column, and `String.prototype.replace` would expand `$&`, `$$`
 * and similar patterns found in the new value.
 *
 * @param rowContent - Inner HTML of the `<tr>` element
 * @param columnIndex - Zero-based index of the cell to update
 * @param value - New plain-text value; it is HTML-escaped before insertion
 * @param isBold - When true the value is wrapped in `<b>…</b>`
 * @returns The row HTML with the cell content replaced, or the row unchanged
 *          when it has no cell at `columnIndex`
 */
function updateCellInRow(rowContent: string, columnIndex: number, value: string, isBold: boolean): string {
  // A cell is <td …> … <div …>CONTENT</div> … </td>; groups 1 and 3 are the
  // markup around the content and are preserved verbatim.
  const cellRegex = /(<td[^>]*>[\s\S]*?<div[^>]*>)([\s\S]*?)(<\/div>[\s\S]*?<\/td>)/gi;

  // Escape HTML to prevent injection
  const escapedValue = escapeHtml(value);
  const newContent = isBold ? `<b>${escapedValue}</b>` : escapedValue;

  let currentCol = 0;
  let cellMatch: RegExpExecArray | null;

  while ((cellMatch = cellRegex.exec(rowContent)) !== null) {
    if (currentCol === columnIndex) {
      const start = cellMatch.index;
      const end = start + cellMatch[0].length;
      // Splice by offset so exactly this cell is rewritten and the value is
      // inserted literally.
      return rowContent.slice(0, start) + cellMatch[1] + newContent + cellMatch[3] + rowContent.slice(end);
    }
    currentCol++;
  }

  return rowContent;
}
129
+
68
130
  /**
69
131
  * Update a specific cell in an Apple Notes table HTML.
70
132
  *
@@ -85,48 +147,15 @@ export function updateTableCell(html: string, row: number, column: number, value
85
147
  throw new Error(`Column ${column} out of bounds (row has ${parsed.rows[row].length} columns)`);
86
148
  }
87
149
 
88
- // Find and replace the specific cell
89
- let currentRow = 0;
90
- let result = html;
91
-
92
- const rowRegex = /<tr[^>]*>([\s\S]*?)<\/tr>/gi;
93
- let rowMatch;
94
-
95
- while ((rowMatch = rowRegex.exec(html)) !== null) {
96
- if (currentRow === row) {
97
- const rowContent = rowMatch[1];
98
- let currentCol = 0;
99
- let newRowContent = rowContent;
100
-
101
- const cellRegex = /(<td[^>]*>[\s\S]*?<div[^>]*>)([\s\S]*?)(<\/div>[\s\S]*?<\/td>)/gi;
102
- let cellMatch;
103
- const replacements: Array<{original: string; replacement: string}> = [];
104
-
105
- while ((cellMatch = cellRegex.exec(rowContent)) !== null) {
106
- if (currentCol === column) {
107
- const prefix = cellMatch[1];
108
- const suffix = cellMatch[3];
109
- const isBold = parsed.formatting[row][column].bold;
110
- const newContent = isBold ? `<b>${value}</b>` : value;
111
- replacements.push({
112
- original: cellMatch[0],
113
- replacement: `${prefix}${newContent}${suffix}`
114
- });
115
- }
116
- currentCol++;
117
- }
118
-
119
- for (const r of replacements) {
120
- newRowContent = newRowContent.replace(r.original, r.replacement);
121
- }
122
-
123
- result = result.replace(rowMatch[0], `<tr>${newRowContent}</tr>`);
124
- break;
125
- }
126
- currentRow++;
150
+ const rowData = findRowHtml(html, row);
151
+ if (!rowData) {
152
+ throw new Error(`Could not find row ${row} in table HTML`);
127
153
  }
128
154
 
129
- return result;
155
+ const isBold = parsed.formatting[row][column].bold;
156
+ const updatedRowContent = updateCellInRow(rowData.content, column, value, isBold);
157
+
158
+ return html.replace(rowData.match[0], `<tr>${updatedRowContent}</tr>`);
130
159
  }
131
160
 
132
161
  /**
@@ -0,0 +1,353 @@
1
+ import { describe, it, expect, vi, beforeEach, type Mock } from "vitest";
2
+ import type { NoteDetails } from "../notes/read.js";
3
+ import type { ChunkRecord } from "../db/lancedb.js";
4
+
5
// Replace every collaborator of the module under test with a mock before it is imported.
vi.mock("../embeddings/index.js", () => {
  return { getEmbeddingBatch: vi.fn() };
});

vi.mock("../db/lancedb.js", () => {
  // Default store: indexing is a no-op and the index reports zero records.
  const getChunkStore = vi.fn(() => {
    return {
      indexChunks: vi.fn(),
      count: vi.fn().mockResolvedValue(0),
    };
  });
  return { getChunkStore };
});

vi.mock("../notes/read.js", () => {
  return { getAllNotesWithContent: vi.fn() };
});

vi.mock("../utils/debug.js", () => {
  // createDebugLogger returns a logger function; the mock logger does nothing.
  return { createDebugLogger: vi.fn(() => vi.fn()) };
});
24
+
25
+ // Import after mocking
26
+ import { chunkNote, fullChunkIndex, hasChunkIndex } from "./chunk-indexer.js";
27
+ import { getEmbeddingBatch } from "../embeddings/index.js";
28
+ import { getChunkStore } from "../db/lancedb.js";
29
+ import { getAllNotesWithContent } from "../notes/read.js";
30
+
31
+ describe("chunk-indexer", () => {
32
+ beforeEach(() => {
33
+ vi.clearAllMocks();
34
+ });
35
+
36
describe("chunkNote", () => {
  const DAY_ONE = "2024-01-01T00:00:00.000Z";
  const DAY_TWO = "2024-01-02T00:00:00.000Z";

  // Builds a NoteDetails fixture; each test overrides only the fields it cares about.
  // Note: content must be at least 50 chars to pass the content filter.
  const buildNote = (overrides: Partial<NoteDetails>): NoteDetails => ({
    id: "note",
    title: "Note",
    folder: "Notes",
    content: "",
    htmlContent: "",
    created: DAY_ONE,
    modified: DAY_ONE,
    ...overrides,
  });

  it("creates chunks for a note with content", () => {
    const content = "This is a test note content that is long enough to pass the minimum content length requirement for indexing.";
    const chunks = chunkNote(
      buildNote({
        id: "note-123",
        title: "Test Note",
        folder: "Work",
        content,
        htmlContent: `<p>${content}</p>`,
        modified: DAY_TWO,
      })
    );

    expect(chunks).toHaveLength(1);
    const [only] = chunks;
    expect(only).toMatchObject({
      chunk_id: "note-123_chunk_0",
      note_id: "note-123",
      note_title: "Test Note",
      folder: "Work",
      chunk_index: 0,
      total_chunks: 1,
      content,
      created: DAY_ONE,
      modified: DAY_TWO,
    });
    // The vector is not generated at this stage.
    expect(only.vector).toEqual([]);
    // indexed_at is only stamped during batch processing.
    expect(only.indexed_at).toBe("");
    // This content carries no tags or links, so both lists are empty.
    expect(only.tags).toEqual([]);
    expect(only.outlinks).toEqual([]);
  });

  it("returns single chunk for notes under chunk size", () => {
    const chunks = chunkNote(
      buildNote({
        id: "short-note",
        title: "Short",
        content: "This is a shorter note but still has enough content to pass the minimum length filter requirement.",
      })
    );

    expect(chunks).toHaveLength(1);
    expect(chunks[0].chunk_index).toBe(0);
    expect(chunks[0].total_chunks).toBe(1);
  });

  it("returns empty array for empty notes", () => {
    const chunks = chunkNote(buildNote({ id: "empty-note", title: "Empty", content: "" }));

    expect(chunks).toHaveLength(0);
  });

  it("returns empty array for whitespace-only notes", () => {
    const chunks = chunkNote(
      buildNote({ id: "whitespace-note", title: "Whitespace", content: " \n\n " })
    );

    expect(chunks).toHaveLength(0);
  });

  it("extracts tags from note content", () => {
    const chunks = chunkNote(
      buildNote({
        id: "tagged-note",
        title: "Tagged Note",
        folder: "Work",
        content: "This note has #important and #work tags. It also contains enough text to pass the minimum content length requirement.",
      })
    );

    expect(chunks).toHaveLength(1);
    expect(chunks[0].tags).toEqual(expect.arrayContaining(["important", "work"]));
  });

  it("extracts outlinks from note content", () => {
    const chunks = chunkNote(
      buildNote({
        id: "linked-note",
        title: "Linked Note",
        folder: "Work",
        content: "This links to [[Other Note]] and [[Another Note]]. This is additional content to meet the minimum length requirement.",
      })
    );

    expect(chunks).toHaveLength(1);
    expect(chunks[0].outlinks).toEqual(expect.arrayContaining(["Other Note", "Another Note"]));
  });

  it("creates multiple chunks for long notes", () => {
    // Two long paragraphs, enough text to be split into several chunks.
    const paragraphs = ["This is paragraph one. ".repeat(50), "This is paragraph two. ".repeat(50)];
    const chunks = chunkNote(
      buildNote({
        id: "long-note",
        title: "Long Note",
        folder: "Work",
        content: paragraphs.join("\n\n"),
      })
    );

    expect(chunks.length).toBeGreaterThan(1);

    // Chunk IDs must be unique.
    const distinctIds = new Set(chunks.map((c) => c.chunk_id));
    expect(distinctIds.size).toBe(chunks.length);

    // Indices are sequential and every chunk knows the total.
    for (const [position, chunk] of chunks.entries()) {
      expect(chunk.chunk_index).toBe(position);
      expect(chunk.total_chunks).toBe(chunks.length);
      expect(chunk.chunk_id).toBe(`long-note_chunk_${position}`);
    }
  });
});
187
+
188
describe("fullChunkIndex", () => {
  // Builds a NoteDetails fixture with the timestamps shared by every test in this group.
  // Note: content must be at least 50 chars to pass the content filter.
  const buildNote = (id: string, title: string, folder: string, content: string): NoteDetails => ({
    id,
    title,
    folder,
    content,
    htmlContent: "",
    created: "2024-01-01T00:00:00.000Z",
    modified: "2024-01-02T00:00:00.000Z",
  });

  // Installs a chunk store whose indexChunks spy is returned for inspection.
  const installChunkStore = () => {
    const indexChunksSpy = vi.fn();
    (getChunkStore as Mock).mockReturnValue({
      indexChunks: indexChunksSpy,
      count: vi.fn().mockResolvedValue(0),
    });
    return indexChunksSpy;
  };

  it("processes notes and creates chunks with embeddings", async () => {
    const notes = [
      buildNote(
        "note-1",
        "Note 1",
        "Work",
        "This is the first note content with enough text to pass the minimum length filter requirement."
      ),
      buildNote(
        "note-2",
        "Note 2",
        "Personal",
        "This is the second note content with enough text to pass the minimum length filter requirement."
      ),
    ];
    const vectors = [
      [0.1, 0.2, 0.3],
      [0.4, 0.5, 0.6],
    ];

    const indexChunksSpy = installChunkStore();
    (getAllNotesWithContent as Mock).mockResolvedValue(notes);
    (getEmbeddingBatch as Mock).mockResolvedValue(vectors);

    const result = await fullChunkIndex();

    // All notes were fetched in a single call.
    expect(getAllNotesWithContent).toHaveBeenCalledOnce();

    // One embedding request covering both chunks.
    expect(getEmbeddingBatch).toHaveBeenCalledOnce();
    const [embeddedTexts] = (getEmbeddingBatch as Mock).mock.calls[0];
    expect(embeddedTexts).toHaveLength(2);

    // Chunks were stored with their vectors attached, in order.
    expect(indexChunksSpy).toHaveBeenCalledOnce();
    const stored = indexChunksSpy.mock.calls[0][0] as ChunkRecord[];
    expect(stored).toHaveLength(2);
    expect(stored[0].vector).toEqual([0.1, 0.2, 0.3]);
    expect(stored[1].vector).toEqual([0.4, 0.5, 0.6]);

    // indexed_at is set and round-trips as an ISO timestamp.
    for (const chunk of stored) {
      expect(chunk.indexed_at).toBeTruthy();
      expect(new Date(chunk.indexed_at).toISOString()).toBe(chunk.indexed_at);
    }

    // Summary statistics.
    expect(result).toMatchObject({
      totalNotes: 2,
      totalChunks: 2,
      indexed: 2,
    });
    expect(result.timeMs).toBeGreaterThanOrEqual(0);
  });

  it("handles empty note list", async () => {
    (getAllNotesWithContent as Mock).mockResolvedValue([]);
    const indexChunksSpy = installChunkStore();

    const result = await fullChunkIndex();

    expect(result).toMatchObject({
      totalNotes: 0,
      totalChunks: 0,
      indexed: 0,
    });
    // Nothing to embed or store when there are no notes.
    expect(getEmbeddingBatch).not.toHaveBeenCalled();
    expect(indexChunksSpy).not.toHaveBeenCalled();
  });

  it("skips empty notes in chunking", async () => {
    const notes = [
      buildNote(
        "note-1",
        "Note 1",
        "Work",
        "This is valid note content with enough characters to pass the minimum length requirement for indexing."
      ),
      // Second note has no content at all.
      buildNote("note-2", "Empty Note", "Work", ""),
    ];

    const indexChunksSpy = installChunkStore();
    (getAllNotesWithContent as Mock).mockResolvedValue(notes);
    (getEmbeddingBatch as Mock).mockResolvedValue([[0.1, 0.2, 0.3]]);

    const result = await fullChunkIndex();

    // Both notes are counted, but only the non-empty one yields a chunk.
    expect(result.totalNotes).toBe(2);
    expect(result.totalChunks).toBe(1);
    expect(result.indexed).toBe(1);

    const stored = indexChunksSpy.mock.calls[0][0] as ChunkRecord[];
    expect(stored).toHaveLength(1);
    expect(stored[0].note_title).toBe("Note 1");
  });
});
321
+
322
describe("hasChunkIndex", () => {
  // Points getChunkStore at a store whose count() is the supplied mock.
  const useCount = (count: ReturnType<typeof vi.fn>) => {
    (getChunkStore as Mock).mockReturnValue({ count });
  };

  it("returns true when chunk index exists", async () => {
    useCount(vi.fn().mockResolvedValue(10));

    const exists = await hasChunkIndex();

    expect(exists).toBe(true);
  });

  it("returns false when chunk index is empty", async () => {
    useCount(vi.fn().mockResolvedValue(0));

    const exists = await hasChunkIndex();

    expect(exists).toBe(false);
  });

  it("returns false when chunk store throws (table not found)", async () => {
    useCount(vi.fn().mockRejectedValue(new Error("Chunk index not found")));

    const exists = await hasChunkIndex();

    expect(exists).toBe(false);
  });
});
353
+ });
@@ -0,0 +1,207 @@
1
+ /**
2
+ * Chunk indexer for Parent Document Retriever pattern.
3
+ * Splits notes into overlapping chunks, generates embeddings, and stores in LanceDB.
4
+ */
5
+
6
+ import { getEmbeddingBatch } from "../embeddings/index.js";
7
+ import { getChunkStore, type ChunkRecord } from "../db/lancedb.js";
8
+ import { getAllNotesWithContent, type NoteDetails } from "../notes/read.js";
9
+ import { chunkText } from "../utils/chunker.js";
10
+ import { extractMetadata } from "../graph/extract.js";
11
+ import { DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP } from "../config/constants.js";
12
+ import { createDebugLogger } from "../utils/debug.js";
13
+ import { filterContent, shouldIndexContent } from "../utils/content-filter.js";
14
+
15
+ // Debug logging
16
+ const debug = createDebugLogger("CHUNK-INDEXER");
17
+
18
/**
 * Summary statistics returned by a chunk indexing run.
 */
export interface ChunkIndexResult {
  /**
   * How many notes were fetched and considered.
   */
  totalNotes: number;
  /**
   * How many chunks those notes produced.
   */
  totalChunks: number;
  /**
   * How many chunks were stored together with an embedding.
   */
  indexed: number;
  /**
   * Wall-clock duration of the run, in milliseconds.
   */
  timeMs: number;
}
31
+
32
/**
 * Chunk record used while indexing.
 *
 * Every field is declared explicitly (no index signature) to avoid index
 * signature issues.
 */
interface InternalChunkRecord {
  /** Unique chunk identifier. */
  chunk_id: string;
  /** Identifier of the note the chunk was cut from. */
  note_id: string;
  /** Title of the source note. */
  note_title: string;
  /** Folder of the source note. */
  folder: string;
  /** Position of this chunk within its note. */
  chunk_index: number;
  /** Number of chunks the note was split into. */
  total_chunks: number;
  /** Text of the chunk. */
  content: string;
  /** Embedding vector for the chunk. */
  vector: number[];
  /** Creation timestamp of the source note. */
  created: string;
  /** Modification timestamp of the source note. */
  modified: string;
  /** Timestamp of when the chunk was indexed. */
  indexed_at: string;
  /** Tags associated with the chunk. */
  tags: string[];
  /** Outgoing links associated with the chunk. */
  outlinks: string[];
}
48
+
49
/**
 * Split one note into chunk records that do not yet carry embeddings.
 *
 * The note text is passed through the content filter before chunking, while
 * tags and outlinks are extracted from the unfiltered text. `vector` and
 * `indexed_at` are left empty for the caller to fill in.
 *
 * @param note - The note to chunk
 * @returns One record per chunk, or an empty array when the note is skipped
 *          by the content filter or yields no chunks
 */
export function chunkNote(note: NoteDetails): InternalChunkRecord[] {
  const { id, title, folder, content, created, modified } = note;

  // Cheap pre-check before running the full filter.
  if (!shouldIndexContent(content)) {
    debug(`Note "${title}" skipped: contains mostly encoded/binary content`);
    return [];
  }

  const filtered = filterContent(content);
  switch (filtered.action) {
    case "skip":
      debug(`Note "${title}" skipped: ${filtered.reasons.join(", ")}`);
      return [];
    case "filter":
      debug(`Note "${title}" filtered: ${filtered.reasons.join(", ")}`);
      break;
  }

  // NOTE(review): `||` also falls back to the unfiltered text when
  // cleanedContent is an empty string — confirm against filterContent's
  // contract whether that is intended.
  const textToSplit = filtered.cleanedContent || content;

  // Metadata comes from the ORIGINAL text so tags/links are preserved.
  const metadata = extractMetadata(content);

  const pieces = chunkText(textToSplit, {
    chunkSize: DEFAULT_CHUNK_SIZE,
    overlap: DEFAULT_CHUNK_OVERLAP,
  });

  if (pieces.length === 0) {
    debug(`Note "${title}" has no content to chunk`);
    return [];
  }

  debug(`Note "${title}" chunked into ${pieces.length} chunks`);

  const records: InternalChunkRecord[] = [];
  for (const piece of pieces) {
    records.push({
      chunk_id: `${id}_chunk_${piece.index}`,
      note_id: id,
      note_title: title,
      folder,
      chunk_index: piece.index,
      total_chunks: piece.totalChunks,
      content: piece.content,
      vector: [], // filled in during embedding generation
      created,
      modified,
      indexed_at: "", // set during batch processing
      tags: metadata.tags,
      outlinks: metadata.outlinks,
    });
  }
  return records;
}
113
+
114
/**
 * Perform a full chunk index of all notes.
 *
 * Phases:
 * 1. Fetch all notes via getAllNotesWithContent
 * 2. Chunk all notes using chunkNote
 * 3. Generate embeddings in batch using getEmbeddingBatch
 * 4. Combine chunks with vectors and set indexed_at
 * 5. Store via getChunkStore().indexChunks()
 *
 * Returns early, without embedding or storing anything, when there are no
 * notes or when no note produces a chunk.
 *
 * @returns ChunkIndexResult with stats
 * @throws Error when the number of embeddings returned does not match the
 *         number of chunks, so records are never stored without a vector
 */
export async function fullChunkIndex(): Promise<ChunkIndexResult> {
  const startTime = Date.now();

  // Phase 1: Fetch all notes
  debug("Phase 1: Fetching all notes...");
  const notes = await getAllNotesWithContent();
  debug(`Fetched ${notes.length} notes`);

  if (notes.length === 0) {
    return {
      totalNotes: 0,
      totalChunks: 0,
      indexed: 0,
      timeMs: Date.now() - startTime,
    };
  }

  // Phase 2: Chunk all notes
  debug("Phase 2: Chunking all notes...");
  const allChunks: InternalChunkRecord[] = [];
  for (const note of notes) {
    const noteChunks = chunkNote(note);
    allChunks.push(...noteChunks);
  }
  debug(`Created ${allChunks.length} chunks from ${notes.length} notes`);

  if (allChunks.length === 0) {
    return {
      totalNotes: notes.length,
      totalChunks: 0,
      indexed: 0,
      timeMs: Date.now() - startTime,
    };
  }

  // Phase 3: Generate embeddings in batch
  debug("Phase 3: Generating embeddings...");
  const chunkTexts: string[] = allChunks.map((chunk) => chunk.content);
  const vectors = await getEmbeddingBatch(chunkTexts);
  debug(`Generated ${vectors.length} embeddings`);

  // Vectors are paired with chunks by position below; a short or long result
  // would otherwise store chunks with missing or misaligned vectors.
  if (vectors.length !== allChunks.length) {
    throw new Error(
      `Embedding count mismatch: expected ${allChunks.length} vectors, got ${vectors.length}`
    );
  }

  // Phase 4: Combine chunks with vectors and set indexed_at
  debug("Phase 4: Combining chunks with vectors...");
  // One timestamp for the whole run so every chunk shares the same indexed_at.
  const indexedAt = new Date().toISOString();
  const completeChunks: ChunkRecord[] = allChunks.map((chunk, i) => ({
    ...chunk,
    vector: vectors[i],
    indexed_at: indexedAt,
  }));

  // Phase 5: Store in LanceDB
  debug("Phase 5: Storing chunks...");
  const chunkStore = getChunkStore();
  await chunkStore.indexChunks(completeChunks);
  debug(`Stored ${completeChunks.length} chunks`);

  const timeMs = Date.now() - startTime;
  debug(`Chunk indexing completed in ${timeMs}ms`);

  return {
    totalNotes: notes.length,
    totalChunks: allChunks.length,
    indexed: completeChunks.length,
    timeMs,
  };
}
192
+
193
/**
 * Report whether the chunk index currently holds any records.
 *
 * Any failure while obtaining the store or counting its records is treated
 * as "no index" rather than being propagated.
 *
 * @returns true when the chunk store reports a positive record count,
 *          false when it is empty or the lookup throws
 */
export async function hasChunkIndex(): Promise<boolean> {
  try {
    const recordCount = await getChunkStore().count();
    return recordCount > 0;
  } catch {
    // Table doesn't exist or error - no index
    return false;
  }
}