@disco_trooper/apple-notes-mcp 1.2.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +136 -24
- package/package.json +13 -9
- package/src/config/claude.test.ts +47 -0
- package/src/config/claude.ts +106 -0
- package/src/config/constants.ts +11 -2
- package/src/config/paths.test.ts +40 -0
- package/src/config/paths.ts +86 -0
- package/src/db/arrow-fix.test.ts +101 -0
- package/src/db/lancedb.test.ts +209 -2
- package/src/db/lancedb.ts +373 -7
- package/src/embeddings/cache.test.ts +150 -0
- package/src/embeddings/cache.ts +204 -0
- package/src/embeddings/index.ts +21 -2
- package/src/embeddings/local.ts +61 -10
- package/src/embeddings/openrouter.ts +233 -11
- package/src/graph/export.test.ts +81 -0
- package/src/graph/export.ts +163 -0
- package/src/graph/extract.test.ts +90 -0
- package/src/graph/extract.ts +52 -0
- package/src/graph/queries.test.ts +156 -0
- package/src/graph/queries.ts +224 -0
- package/src/index.ts +376 -10
- package/src/notes/crud.test.ts +148 -3
- package/src/notes/crud.ts +250 -5
- package/src/notes/read.ts +83 -68
- package/src/search/chunk-indexer.test.ts +353 -0
- package/src/search/chunk-indexer.ts +254 -0
- package/src/search/chunk-search.test.ts +327 -0
- package/src/search/chunk-search.ts +298 -0
- package/src/search/indexer.ts +151 -109
- package/src/search/refresh.test.ts +173 -0
- package/src/search/refresh.ts +151 -0
- package/src/setup.ts +46 -67
- package/src/utils/chunker.test.ts +182 -0
- package/src/utils/chunker.ts +170 -0
- package/src/utils/content-filter.test.ts +225 -0
- package/src/utils/content-filter.ts +275 -0
- package/src/utils/runtime.test.ts +70 -0
- package/src/utils/runtime.ts +40 -0
package/src/setup.ts
CHANGED
|
@@ -12,14 +12,23 @@
|
|
|
12
12
|
|
|
13
13
|
import * as p from "@clack/prompts";
|
|
14
14
|
import * as fs from "node:fs";
|
|
15
|
-
import
|
|
15
|
+
import {
|
|
16
|
+
getEnvPath,
|
|
17
|
+
ensureConfigDir,
|
|
18
|
+
hasLegacyConfig,
|
|
19
|
+
getLegacyEnvPath,
|
|
20
|
+
hasConfig,
|
|
21
|
+
isNpmInstall,
|
|
22
|
+
} from "./config/paths.js";
|
|
23
|
+
import {
|
|
24
|
+
getClaudeConfigEntry,
|
|
25
|
+
writeClaudeConfig,
|
|
26
|
+
getExistingInstallMethod,
|
|
27
|
+
} from "./config/claude.js";
|
|
28
|
+
import { checkBunRuntime } from "./utils/runtime.js";
|
|
29
|
+
|
|
16
30
|
// Paths
|
|
17
|
-
const
|
|
18
|
-
const ENV_FILE = path.join(PROJECT_DIR, "..", ".env");
|
|
19
|
-
const CLAUDE_CONFIG_PATH = path.join(
|
|
20
|
-
process.env.HOME || "~",
|
|
21
|
-
".claude.json"
|
|
22
|
-
);
|
|
31
|
+
const ENV_FILE = getEnvPath();
|
|
23
32
|
|
|
24
33
|
interface Config {
|
|
25
34
|
provider: "local" | "openrouter";
|
|
@@ -69,6 +78,7 @@ function readExistingEnv(): Record<string, string> {
|
|
|
69
78
|
* Write configuration to .env file
|
|
70
79
|
*/
|
|
71
80
|
function writeEnvFile(config: Config): void {
|
|
81
|
+
ensureConfigDir();
|
|
72
82
|
const lines: string[] = [
|
|
73
83
|
"# apple-notes-mcp configuration",
|
|
74
84
|
"# Generated by setup wizard",
|
|
@@ -114,80 +124,29 @@ function writeEnvFile(config: Config): void {
|
|
|
114
124
|
fs.writeFileSync(ENV_FILE, lines.join("\n") + "\n");
|
|
115
125
|
}
|
|
116
126
|
|
|
117
|
-
/**
|
|
118
|
-
* Read Claude Code config if it exists
|
|
119
|
-
*/
|
|
120
|
-
function readClaudeConfig(): Record<string, unknown> | null {
|
|
121
|
-
if (!fs.existsSync(CLAUDE_CONFIG_PATH)) {
|
|
122
|
-
return null;
|
|
123
|
-
}
|
|
124
|
-
|
|
125
|
-
try {
|
|
126
|
-
const content = fs.readFileSync(CLAUDE_CONFIG_PATH, "utf-8");
|
|
127
|
-
return JSON.parse(content);
|
|
128
|
-
} catch (error) {
|
|
129
|
-
// Config doesn't exist or is invalid JSON
|
|
130
|
-
if (process.env.DEBUG === "true") {
|
|
131
|
-
console.error("[SETUP] Could not read Claude config:", error);
|
|
132
|
-
}
|
|
133
|
-
return null;
|
|
134
|
-
}
|
|
135
|
-
}
|
|
136
|
-
|
|
137
127
|
/**
|
|
138
128
|
* Add MCP server to Claude Code config
|
|
139
129
|
*/
|
|
140
130
|
function addToClaudeConfig(): boolean {
|
|
141
|
-
const
|
|
142
|
-
const serverEntry = {
|
|
143
|
-
command: "bun",
|
|
144
|
-
args: ["run", path.join(projectPath, "src", "index.ts")],
|
|
145
|
-
env: {},
|
|
146
|
-
};
|
|
131
|
+
const entry = getClaudeConfigEntry();
|
|
147
132
|
|
|
148
|
-
|
|
133
|
+
// Check for install method change
|
|
134
|
+
const existingMethod = getExistingInstallMethod();
|
|
135
|
+
const currentMethod = isNpmInstall() ? "npm" : "source";
|
|
149
136
|
|
|
150
|
-
if (
|
|
151
|
-
|
|
152
|
-
config = {
|
|
153
|
-
mcpServers: {
|
|
154
|
-
"apple-notes": serverEntry,
|
|
155
|
-
},
|
|
156
|
-
};
|
|
157
|
-
} else {
|
|
158
|
-
// Add to existing config
|
|
159
|
-
const mcpServers = (config.mcpServers || {}) as Record<string, unknown>;
|
|
160
|
-
mcpServers["apple-notes"] = serverEntry;
|
|
161
|
-
config.mcpServers = mcpServers;
|
|
137
|
+
if (existingMethod && existingMethod !== currentMethod) {
|
|
138
|
+
p.log.info(`Updating Claude config from ${existingMethod} to ${currentMethod} installation`);
|
|
162
139
|
}
|
|
163
140
|
|
|
164
|
-
|
|
165
|
-
fs.writeFileSync(CLAUDE_CONFIG_PATH, JSON.stringify(config, null, 2) + "\n");
|
|
166
|
-
return true;
|
|
167
|
-
} catch (error) {
|
|
168
|
-
if (process.env.DEBUG === "true") {
|
|
169
|
-
console.error("[SETUP] Failed to write Claude config:", error);
|
|
170
|
-
}
|
|
171
|
-
return false;
|
|
172
|
-
}
|
|
141
|
+
return writeClaudeConfig(entry);
|
|
173
142
|
}
|
|
174
143
|
|
|
175
144
|
/**
|
|
176
145
|
* Generate config snippet for manual setup
|
|
177
146
|
*/
|
|
178
147
|
function getConfigSnippet(): string {
|
|
179
|
-
const
|
|
180
|
-
return JSON.stringify(
|
|
181
|
-
{
|
|
182
|
-
"apple-notes": {
|
|
183
|
-
command: "bun",
|
|
184
|
-
args: ["run", path.join(projectPath, "src", "index.ts")],
|
|
185
|
-
env: {},
|
|
186
|
-
},
|
|
187
|
-
},
|
|
188
|
-
null,
|
|
189
|
-
2
|
|
190
|
-
);
|
|
148
|
+
const entry = getClaudeConfigEntry();
|
|
149
|
+
return JSON.stringify({ "apple-notes": entry }, null, 2);
|
|
191
150
|
}
|
|
192
151
|
|
|
193
152
|
/**
|
|
@@ -217,6 +176,7 @@ async function downloadLocalModel(): Promise<void> {
|
|
|
217
176
|
* Main setup wizard
|
|
218
177
|
*/
|
|
219
178
|
async function main(): Promise<void> {
|
|
179
|
+
checkBunRuntime();
|
|
220
180
|
console.clear();
|
|
221
181
|
|
|
222
182
|
p.intro("apple-notes-mcp Setup Wizard");
|
|
@@ -233,6 +193,25 @@ async function main(): Promise<void> {
|
|
|
233
193
|
);
|
|
234
194
|
}
|
|
235
195
|
|
|
196
|
+
// Check for legacy config migration
|
|
197
|
+
if (hasLegacyConfig() && !hasConfig()) {
|
|
198
|
+
const migrate = await p.confirm({
|
|
199
|
+
message: "Found config in project directory. Migrate to ~/.apple-notes-mcp/?",
|
|
200
|
+
initialValue: true,
|
|
201
|
+
});
|
|
202
|
+
|
|
203
|
+
if (p.isCancel(migrate)) {
|
|
204
|
+
p.cancel("Setup cancelled.");
|
|
205
|
+
process.exit(0);
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
if (migrate) {
|
|
209
|
+
ensureConfigDir();
|
|
210
|
+
fs.copyFileSync(getLegacyEnvPath(), getEnvPath());
|
|
211
|
+
p.log.success("Config migrated to ~/.apple-notes-mcp/.env");
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
|
|
236
215
|
// Provider selection
|
|
237
216
|
const provider = await p.select({
|
|
238
217
|
message: "Which embedding provider would you like to use?",
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
import { describe, expect, it } from "vitest";
import {
  chunkText,
  type ChunkOptions,
  DEFAULT_CHUNK_OPTIONS,
  SEPARATORS,
  findSplitPoint,
} from "./chunker.js";

/**
 * Unit tests for the text chunker: exported constants, boundary detection
 * in findSplitPoint, and chunk layout produced by chunkText.
 */
describe("chunker", () => {
  describe("exports", () => {
    it("exports SEPARATORS array with correct order", () => {
      expect(SEPARATORS).toEqual([
        "\n\n",
        "\n",
        ". ",
        "! ",
        "? ",
        "; ",
        ", ",
        " ",
        "",
      ]);
    });

    it("exports DEFAULT_CHUNK_OPTIONS with correct values", () => {
      expect(DEFAULT_CHUNK_OPTIONS).toEqual({
        chunkSize: 500,
        overlap: 100,
      });
    });
  });

  describe("findSplitPoint", () => {
    it("finds paragraph boundary near target", () => {
      const text = "First paragraph.\n\nSecond paragraph.";
      const target = 20;
      const result = findSplitPoint(text, target);
      // The "\n\n" starts at index 16; the split point is just after it.
      expect(result).toBe(18);
    });

    it("falls back to sentence boundary", () => {
      const text = "First sentence. Second sentence.";
      const target = 18;
      const result = findSplitPoint(text, target);
      // The ". " occupies indices 14-15; the split point is just after it.
      expect(result).toBe(16);
    });

    it("falls back to word boundary", () => {
      const text = "oneword anotherword";
      const target = 10;
      const result = findSplitPoint(text, target);
      // The space is at index 7; the split point is just after it.
      expect(result).toBe(8);
    });

    it("returns target when no separator found", () => {
      const text = "noseparatorshere";
      const target = 8;
      const result = findSplitPoint(text, target);
      expect(result).toBe(8);
    });
  });

  describe("chunkText", () => {
    it("returns single chunk for short text", () => {
      const text = "Short text";
      const options: ChunkOptions = { chunkSize: 100, overlap: 20 };

      const result = chunkText(text, options);

      expect(result).toHaveLength(1);
      expect(result[0]).toEqual({
        content: "Short text",
        index: 0,
        totalChunks: 1,
        startPos: 0,
        endPos: 10,
      });
    });

    it("creates multiple chunks for long text", () => {
      const text = "Word ".repeat(50).trim(); // 249 chars
      const options: ChunkOptions = { chunkSize: 50, overlap: 10 };

      const result = chunkText(text, options);

      expect(result.length).toBeGreaterThan(1);
      // Each chunk should have content
      result.forEach((chunk) => {
        expect(chunk.content.length).toBeGreaterThan(0);
        expect(chunk.content.length).toBeLessThanOrEqual(options.chunkSize);
      });
    });

    it("includes overlap between chunks", () => {
      const text = "First part. Second part. Third part. Fourth part.";
      const options: ChunkOptions = { chunkSize: 25, overlap: 10 };

      const result = chunkText(text, options);

      // Check that chunks overlap - endPos of chunk N should be > startPos of chunk N+1
      for (let i = 0; i < result.length - 1; i++) {
        const currentChunk = result[i];
        const nextChunk = result[i + 1];
        // Overlap means next chunk starts before current chunk ends
        expect(nextChunk.startPos).toBeLessThan(currentChunk.endPos);
      }
    });

    it("respects paragraph boundaries when splitting", () => {
      const text = "First paragraph here.\n\nSecond paragraph here.\n\nThird paragraph.";
      const options: ChunkOptions = { chunkSize: 30, overlap: 5 };

      const result = chunkText(text, options);

      // The final chunk always ends at text.length, so it proves nothing about
      // boundary detection. Only non-final chunks can show that a split landed
      // on a paragraph break.
      const hasParaBoundary = result
        .slice(0, -1)
        .some((chunk) => text.slice(chunk.startPos, chunk.endPos).endsWith("\n\n"));
      expect(hasParaBoundary).toBe(true);
    });

    it("sets correct totalChunks on all chunks", () => {
      const text = "A ".repeat(100).trim(); // Create text that will be chunked
      const options: ChunkOptions = { chunkSize: 20, overlap: 5 };

      const result = chunkText(text, options);

      const expectedTotal = result.length;
      result.forEach((chunk, idx) => {
        expect(chunk.totalChunks).toBe(expectedTotal);
        expect(chunk.index).toBe(idx);
      });
    });

    it("handles empty text", () => {
      const result = chunkText("", { chunkSize: 100, overlap: 20 });

      expect(result).toHaveLength(0);
    });

    it("handles whitespace-only text", () => {
      const result = chunkText(" \n\n ", { chunkSize: 100, overlap: 20 });

      expect(result).toHaveLength(0);
    });

    it("uses default options when not provided", () => {
      const text = "Test";
      const result = chunkText(text);

      expect(result).toHaveLength(1);
      expect(result[0].content).toBe("Test");
    });

    it("covers all original text with chunks", () => {
      const text = "The quick brown fox jumps over the lazy dog. Pack my box with five dozen liquor jugs.";
      const options: ChunkOptions = { chunkSize: 30, overlap: 10 };

      const result = chunkText(text, options);

      // Verify chunks cover the entire text
      expect(result[0].startPos).toBe(0);
      expect(result[result.length - 1].endPos).toBe(text.length);

      // Verify each chunk's content matches its position in original text
      for (const chunk of result) {
        expect(chunk.content).toBe(text.slice(chunk.startPos, chunk.endPos));
      }

      // Verify chunks are contiguous (no gaps)
      for (let i = 0; i < result.length - 1; i++) {
        // Next chunk should start before or at current chunk's end (overlap)
        expect(result[i + 1].startPos).toBeLessThanOrEqual(result[i].endPos);
      }
    });
  });
});
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
/**
 * Text chunker with recursive character splitting that respects natural boundaries.
 * Prioritizes splitting at: paragraphs > sentences > words > characters
 */

/**
 * Separators in priority order - prefer splitting at larger boundaries first.
 * The trailing empty string stands for the character-level fallback;
 * findSplitPoint skips it and returns the raw target position instead.
 */
export const SEPARATORS = [
  "\n\n", // Paragraph
  "\n", // Line
  ". ", // Sentence (period)
  "! ", // Sentence (exclamation)
  "? ", // Sentence (question)
  "; ", // Clause
  ", ", // Phrase
  " ", // Word
  "", // Character (fallback)
] as const;

export interface ChunkOptions {
  /**
   * Target size of each chunk in characters. A chunk may run up to 20% past
   * this value when the nearest natural boundary lies just after the target.
   */
  chunkSize: number;
  /** Number of characters to overlap between chunks */
  overlap: number;
}

export interface ChunkResult {
  /** The text content of this chunk */
  content: string;
  /** Zero-based index of this chunk */
  index: number;
  /** Total number of chunks */
  totalChunks: number;
  /** Start position in original text */
  startPos: number;
  /** End position in original text (exclusive) */
  endPos: number;
}

export const DEFAULT_CHUNK_OPTIONS: ChunkOptions = {
  chunkSize: 500,
  overlap: 100,
};

/**
 * Find the best split point near the target position.
 *
 * Looks at a window of up to 50 characters on either side of `target`
 * (narrower when `target` is small) and tries each separator in priority
 * order. The first separator that occurs inside the window wins; among its
 * occurrences the one whose end is closest to `target` is chosen.
 *
 * @param text - The full text to search in
 * @param target - The target position to split near
 * @returns The position just after the chosen separator, or `target` itself
 *   when no separator occurs inside the window
 */
export function findSplitPoint(text: string, target: number): number {
  // Search window: look backwards and forwards from target
  const searchWindow = Math.min(50, Math.floor(target / 2));
  const searchStart = Math.max(0, target - searchWindow);
  const searchEnd = Math.min(text.length, target + searchWindow);
  const searchText = text.slice(searchStart, searchEnd);

  // Try each separator in priority order
  for (const sep of SEPARATORS) {
    if (sep === "") continue; // Character fallback is handled by the final return

    // Scan every occurrence in the window and keep the one ending closest to target
    let bestPos = -1;
    let bestDistance = Infinity;

    let idx = 0;
    while ((idx = searchText.indexOf(sep, idx)) !== -1) {
      const absolutePos = searchStart + idx + sep.length;
      const distance = Math.abs(absolutePos - target);

      if (distance < bestDistance) {
        bestDistance = distance;
        bestPos = absolutePos;
      }
      idx += 1;
    }

    // A higher-priority separator beats any closer lower-priority one
    if (bestPos !== -1) {
      return bestPos;
    }
  }

  // No separator found, return target as-is
  return target;
}

/**
 * Split text into overlapping chunks that respect natural boundaries.
 *
 * Empty or whitespace-only input yields no chunks. Text that fits within
 * `chunkSize` is returned as a single chunk. Otherwise chunks are cut near
 * every `chunkSize` characters at the best boundary reported by
 * findSplitPoint, and consecutive chunks always touch or overlap so that no
 * part of the input is left out.
 *
 * @param text - The text to chunk
 * @param options - Chunk size and overlap options
 * @returns Array of chunk results, in order, with `content` equal to
 *   `text.slice(startPos, endPos)` for each entry
 */
export function chunkText(
  text: string,
  options: ChunkOptions = DEFAULT_CHUNK_OPTIONS
): ChunkResult[] {
  const { chunkSize, overlap } = options;

  // Handle empty or whitespace-only text
  if (text.trim().length === 0) {
    return [];
  }

  // If text fits in a single chunk, return it
  if (text.length <= chunkSize) {
    return [
      {
        content: text,
        index: 0,
        totalChunks: 1,
        startPos: 0,
        endPos: text.length,
      },
    ];
  }

  const chunks: ChunkResult[] = [];
  let startPos = 0;
  // Nominal distance between chunk starts; at least 1 so the loop always advances
  const minStep = Math.max(1, chunkSize - overlap);

  while (startPos < text.length) {
    // Calculate target end position
    let endPos = Math.min(startPos + chunkSize, text.length);

    // If not at the end, find a good split point
    if (endPos < text.length) {
      const splitPoint = findSplitPoint(text, endPos);
      // Only use split point if it creates a reasonably sized chunk:
      // not tiny (more than half a step) and not more than 20% oversized
      if (
        splitPoint > startPos + minStep / 2 &&
        splitPoint - startPos <= chunkSize * 1.2
      ) {
        endPos = splitPoint;
      }
    }

    chunks.push({
      content: text.slice(startPos, endPos),
      index: chunks.length,
      totalChunks: 0, // Will be set after all chunks are created
      startPos,
      endPos,
    });

    // If we've reached the end, stop
    if (endPos >= text.length) {
      break;
    }

    // Advance by the nominal step, but never past the end of the chunk just
    // emitted: when the split point was pulled back to an earlier boundary,
    // a fixed step would skip the text between endPos and the next start.
    // The max() keeps the loop progressing for degenerate options.
    const nextStart = Math.min(startPos + minStep, endPos);
    startPos = Math.max(nextStart, startPos + 1);
  }

  // Set totalChunks on all chunks
  const totalChunks = chunks.length;
  for (const chunk of chunks) {
    chunk.totalChunks = totalChunks;
  }

  return chunks;
}
|