@copilotkit/pathfinder 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +20 -0
- package/.superpowers/brainstorm/47098-1775507869/content/homepage-mockup.html +324 -0
- package/.superpowers/brainstorm/47098-1775507869/state/server-stopped +1 -0
- package/.superpowers/brainstorm/47098-1775507869/state/server.log +13 -0
- package/.superpowers/brainstorm/47098-1775507869/state/server.pid +1 -0
- package/.superpowers/brainstorm/82141-1775511032/content/migration-v2.html +340 -0
- package/.superpowers/brainstorm/82141-1775511032/content/migration.html +340 -0
- package/.superpowers/brainstorm/82141-1775511032/state/server-stopped +1 -0
- package/.superpowers/brainstorm/82141-1775511032/state/server.log +4 -0
- package/.superpowers/brainstorm/82141-1775511032/state/server.pid +1 -0
- package/CHANGELOG.md +26 -0
- package/LICENSE +21 -0
- package/README.md +284 -0
- package/dist/config.d.ts +32 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +180 -0
- package/dist/config.js.map +1 -0
- package/dist/db/client.d.ts +22 -0
- package/dist/db/client.d.ts.map +1 -0
- package/dist/db/client.js +134 -0
- package/dist/db/client.js.map +1 -0
- package/dist/db/queries.d.ts +51 -0
- package/dist/db/queries.d.ts.map +1 -0
- package/dist/db/queries.js +271 -0
- package/dist/db/queries.js.map +1 -0
- package/dist/db/schema.d.ts +11 -0
- package/dist/db/schema.d.ts.map +1 -0
- package/dist/db/schema.js +63 -0
- package/dist/db/schema.js.map +1 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +366 -0
- package/dist/index.js.map +1 -0
- package/dist/indexing/chunking/code.d.ts +17 -0
- package/dist/indexing/chunking/code.d.ts.map +1 -0
- package/dist/indexing/chunking/code.js +277 -0
- package/dist/indexing/chunking/code.js.map +1 -0
- package/dist/indexing/chunking/index.d.ts +6 -0
- package/dist/indexing/chunking/index.d.ts.map +1 -0
- package/dist/indexing/chunking/index.js +19 -0
- package/dist/indexing/chunking/index.js.map +1 -0
- package/dist/indexing/chunking/markdown.d.ts +16 -0
- package/dist/indexing/chunking/markdown.d.ts.map +1 -0
- package/dist/indexing/chunking/markdown.js +283 -0
- package/dist/indexing/chunking/markdown.js.map +1 -0
- package/dist/indexing/chunking/raw-text.d.ts +11 -0
- package/dist/indexing/chunking/raw-text.d.ts.map +1 -0
- package/dist/indexing/chunking/raw-text.js +59 -0
- package/dist/indexing/chunking/raw-text.js.map +1 -0
- package/dist/indexing/embeddings.d.ts +10 -0
- package/dist/indexing/embeddings.d.ts.map +1 -0
- package/dist/indexing/embeddings.js +78 -0
- package/dist/indexing/embeddings.js.map +1 -0
- package/dist/indexing/orchestrator.d.ts +69 -0
- package/dist/indexing/orchestrator.d.ts.map +1 -0
- package/dist/indexing/orchestrator.js +387 -0
- package/dist/indexing/orchestrator.js.map +1 -0
- package/dist/indexing/source-indexer.d.ts +68 -0
- package/dist/indexing/source-indexer.d.ts.map +1 -0
- package/dist/indexing/source-indexer.js +379 -0
- package/dist/indexing/source-indexer.js.map +1 -0
- package/dist/indexing/url-derivation.d.ts +7 -0
- package/dist/indexing/url-derivation.d.ts.map +1 -0
- package/dist/indexing/url-derivation.js +31 -0
- package/dist/indexing/url-derivation.js.map +1 -0
- package/dist/mcp/server.d.ts +10 -0
- package/dist/mcp/server.d.ts.map +1 -0
- package/dist/mcp/server.js +67 -0
- package/dist/mcp/server.js.map +1 -0
- package/dist/mcp/tools/bash-fs.d.ts +19 -0
- package/dist/mcp/tools/bash-fs.d.ts.map +1 -0
- package/dist/mcp/tools/bash-fs.js +134 -0
- package/dist/mcp/tools/bash-fs.js.map +1 -0
- package/dist/mcp/tools/bash-grep.d.ts +29 -0
- package/dist/mcp/tools/bash-grep.d.ts.map +1 -0
- package/dist/mcp/tools/bash-grep.js +153 -0
- package/dist/mcp/tools/bash-grep.js.map +1 -0
- package/dist/mcp/tools/bash-related.d.ts +14 -0
- package/dist/mcp/tools/bash-related.d.ts.map +1 -0
- package/dist/mcp/tools/bash-related.js +54 -0
- package/dist/mcp/tools/bash-related.js.map +1 -0
- package/dist/mcp/tools/bash-session.d.ts +23 -0
- package/dist/mcp/tools/bash-session.d.ts.map +1 -0
- package/dist/mcp/tools/bash-session.js +60 -0
- package/dist/mcp/tools/bash-session.js.map +1 -0
- package/dist/mcp/tools/bash-telemetry.d.ts +26 -0
- package/dist/mcp/tools/bash-telemetry.d.ts.map +1 -0
- package/dist/mcp/tools/bash-telemetry.js +53 -0
- package/dist/mcp/tools/bash-telemetry.js.map +1 -0
- package/dist/mcp/tools/bash-virtual-files.d.ts +3 -0
- package/dist/mcp/tools/bash-virtual-files.d.ts.map +1 -0
- package/dist/mcp/tools/bash-virtual-files.js +65 -0
- package/dist/mcp/tools/bash-virtual-files.js.map +1 -0
- package/dist/mcp/tools/bash.d.ts +25 -0
- package/dist/mcp/tools/bash.d.ts.map +1 -0
- package/dist/mcp/tools/bash.js +140 -0
- package/dist/mcp/tools/bash.js.map +1 -0
- package/dist/mcp/tools/collect.d.ts +13 -0
- package/dist/mcp/tools/collect.d.ts.map +1 -0
- package/dist/mcp/tools/collect.js +56 -0
- package/dist/mcp/tools/collect.js.map +1 -0
- package/dist/mcp/tools/search.d.ts +5 -0
- package/dist/mcp/tools/search.d.ts.map +1 -0
- package/dist/mcp/tools/search.js +68 -0
- package/dist/mcp/tools/search.js.map +1 -0
- package/dist/types.d.ts +1237 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +163 -0
- package/dist/types.js.map +1 -0
- package/dist/webhooks/github.d.ts +12 -0
- package/dist/webhooks/github.d.ts.map +1 -0
- package/dist/webhooks/github.js +117 -0
- package/dist/webhooks/github.js.map +1 -0
- package/package.json +48 -0
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
// Line-based code splitter
|
|
2
|
+
const DEFAULT_TARGET_LINES = 80;
|
|
3
|
+
const DEFAULT_OVERLAP_LINES = 10;
|
|
4
|
+
/**
 * Map a file path's extension to a language name.
 *
 * Only the final path segment is inspected, so a dot inside a directory name
 * is never mistaken for an extension. Known extensions map to a canonical
 * language name, unknown extensions are returned lower-cased as-is, and a
 * file name without any dot yields 'text'.
 *
 * @param filePath - Path (or bare file name) of the source file
 * @returns The language name; never an empty string
 */
function detectLanguage(filePath) {
    // Look at the basename only: "pkg.v2/Makefile" has no extension.
    const baseName = filePath.split(/[\\/]/).pop() || '';
    const dotIndex = baseName.lastIndexOf('.');
    const ext = dotIndex === -1 ? '' : baseName.slice(dotIndex + 1).toLowerCase();
    // A Map (rather than a plain object) so that extensions such as
    // "constructor" or "toString" cannot resolve to Object.prototype members.
    const languageMap = new Map([
        ['ts', 'typescript'],
        ['tsx', 'typescript'],
        ['js', 'javascript'],
        ['jsx', 'javascript'],
        ['mjs', 'javascript'],
        ['cjs', 'javascript'],
        ['py', 'python'],
        ['rb', 'ruby'],
        ['go', 'go'],
        ['rs', 'rust'],
        ['java', 'java'],
        ['kt', 'kotlin'],
        ['swift', 'swift'],
        ['c', 'c'],
        ['cpp', 'cpp'],
        ['h', 'c'],
        ['hpp', 'cpp'],
        ['cs', 'csharp'],
        ['md', 'markdown'],
        ['mdx', 'markdown'],
        ['json', 'json'],
        ['yaml', 'yaml'],
        ['yml', 'yaml'],
        ['toml', 'toml'],
        ['sql', 'sql'],
        ['sh', 'shell'],
        ['bash', 'shell'],
        ['zsh', 'shell'],
        ['css', 'css'],
        ['scss', 'scss'],
        ['html', 'html'],
        ['xml', 'xml'],
    ]);
    return languageMap.get(ext) ?? (ext || 'text');
}
|
|
45
|
+
/**
 * Report whether the character at index `pos` of `line` is backslash-escaped.
 *
 * Walks left from `pos` over the unbroken run of backslashes that directly
 * precedes it; the character is escaped exactly when that run has odd length.
 */
function isEscaped(line, pos) {
    let cursor = pos - 1;
    while (cursor >= 0 && line[cursor] === '\\') {
        cursor -= 1;
    }
    // `cursor` now rests on the first non-backslash (or -1), so the run
    // length is the distance walked.
    const runLength = pos - 1 - cursor;
    return runLength % 2 === 1;
}
|
|
56
|
+
/**
 * Remove single-/double-quoted string literals and any trailing `//` comment
 * from one line, so later block-comment and template-string detection only
 * sees real syntax.
 *
 * The opening quote and the literal's contents are dropped; the closing quote
 * character itself is kept in the output. Backticks are not treated as quotes.
 */
function stripStringsAndLineComments(line) {
    const kept = [];
    // Quote character of the literal currently open, or null when outside one.
    let openQuote = null;
    for (let idx = 0; idx < line.length; idx += 1) {
        const current = line[idx];
        if (openQuote === null && current === '/' && line[idx + 1] === '/') {
            // Everything from here on is a single-line comment.
            break;
        }
        const isQuoteChar = current === "'" || current === '"';
        if (isQuoteChar && !isEscaped(line, idx)) {
            if (openQuote === null) {
                openQuote = current;
            }
            else if (openQuote === current) {
                openQuote = null;
            }
            // A quote of the other kind inside a literal is ordinary text.
        }
        if (openQuote === null) {
            kept.push(current);
        }
    }
    return kept.join('');
}
|
|
81
|
+
/**
 * Advance the multi-line construct tracker across one source line.
 *
 * This is a line-level heuristic, not a parser. The state carries:
 *  - `inBlockComment`: inside a C-style block comment or a Python
 *    triple-quoted string
 *  - `inTemplateString`: inside a JS template literal
 *  - `tripleQuote` (optional): the delimiter that opened the current Python
 *    triple-quoted string. Callers that only read the two booleans are
 *    unaffected by this extra field.
 *
 * @param line - The raw source line
 * @param state - Tracker state before `line`; it is copied, never mutated
 * @returns The tracker state after `line`
 */
function trackBlockState(line, state) {
    const newState = { ...state };
    if (newState.inBlockComment) {
        if (newState.tripleQuote) {
            // Inside a Python triple-quoted string: only an odd number of its
            // own delimiter on this line closes it. A `*/` never does.
            const delimiterCount = line.split(newState.tripleQuote).length - 1;
            if (delimiterCount % 2 === 1) {
                newState.inBlockComment = false;
                newState.tripleQuote = null;
            }
        }
        else if (line.includes('*/')) {
            // Search the raw line: quotes and `//` are plain text inside a
            // comment, so stripping them first could hide the terminator
            // (e.g. ` * don't do this */`).
            newState.inBlockComment = false;
        }
        return newState;
    }
    const stripped = stripStringsAndLineComments(line);
    if (newState.inTemplateString) {
        // Count unescaped backticks; an odd count closes the template literal.
        const backticks = (stripped.match(/(?<!\\)`/g) || []).length;
        if (backticks % 2 === 1) {
            newState.inTemplateString = false;
        }
        return newState;
    }
    // A block comment opens only when it does not also close on this line.
    if (stripped.includes('/*') && !stripped.includes('*/')) {
        newState.inBlockComment = true;
        newState.tripleQuote = null;
    }
    else {
        // Only look for template strings if we did not just enter a comment.
        const backticks = (stripped.match(/(?<!\\)`/g) || []).length;
        if (backticks % 2 === 1) {
            newState.inTemplateString = true;
        }
    }
    // Python triple-quoted strings: checked on the raw line because the
    // stripping above is JS-oriented.
    if (!newState.inBlockComment && !newState.inTemplateString) {
        const hasOddCount = (delimiter) => (line.split(delimiter).length - 1) % 2 === 1;
        const opened = ['"""', "'''"].find(hasOddCount);
        if (opened) {
            newState.inBlockComment = true; // flag is shared with block comments
            newState.tripleQuote = opened; // remembered so the same delimiter can close it
        }
    }
    return newState;
}
|
|
121
|
+
/**
 * Collect the indices of lines where a chunk may safely begin.
 *
 * A line qualifies when it is blank and, after feeding it to the block-state
 * tracker, we are inside neither a block comment nor a template string. This
 * covers every blank line, including the second line of a blank-line pair.
 *
 * @param lines - The file's lines
 * @returns Set of 0-based line indices, inserted in ascending order
 */
function findSplitPoints(lines) {
    const safePoints = new Set();
    let tracker = { inBlockComment: false, inTemplateString: false };
    lines.forEach((text, index) => {
        tracker = trackBlockState(text, tracker);
        const insideConstruct = tracker.inBlockComment || tracker.inTemplateString;
        if (!insideConstruct && text.trim() === '') {
            safePoints.add(index);
        }
    });
    return safePoints;
}
|
|
145
|
+
/**
 * Render a run of lines as a chunk: a `// File:` breadcrumb followed by each
 * line prefixed with its right-aligned line number.
 *
 * @param lines - The lines belonging to the chunk
 * @param startLine - 1-based number of the first line
 * @param filePath - Path shown in the breadcrumb
 * @returns The formatted chunk text
 */
function formatChunk(lines, startLine, filePath) {
    // Pad every number to the width of the largest one in this chunk.
    const gutterWidth = String(startLine + lines.length - 1).length;
    const rows = [];
    for (let offset = 0; offset < lines.length; offset += 1) {
        const label = String(startLine + offset).padStart(gutterWidth, ' ');
        rows.push(`${label} | ${lines[offset]}`);
    }
    const header = `// File: ${filePath}`;
    return header + '\n' + rows.join('\n');
}
|
|
158
|
+
/**
 * Partition `lines` into contiguous index ranges of roughly `targetLines`.
 *
 * Strategy, in order of preference:
 *  1. split at blank-line pairs (double newlines) that are safe split points;
 *  2. if that leaves a range larger than 1.5x the target, split at any safe
 *     blank line instead, and cut any range that is still oversized
 *     mechanically every `targetLines` lines;
 *  3. with no safe blank lines at all, cut mechanically every `targetLines`.
 *
 * @param lines - The file's lines
 * @param targetLines - Desired number of lines per range
 * @returns Ranges `{ start, end }` (0-based, inclusive) covering every line in order
 */
function splitAtBoundaries(lines, targetLines) {
    const lastIndex = lines.length - 1;
    if (lines.length <= targetLines) {
        return [{ start: 0, end: lastIndex }];
    }
    // A zero/negative/NaN target must not stall the mechanical loop below.
    const step = targetLines >= 1 ? Math.floor(targetLines) : 1;
    const isOversized = (range) => (range.end - range.start + 1) > targetLines * 1.5;
    // Cut [from, to] into fixed-size pieces on plain line boundaries.
    const sliceMechanically = (from, to) => {
        const pieces = [];
        for (let i = from; i <= to; i += step) {
            pieces.push({ start: i, end: Math.min(i + step - 1, to) });
        }
        return pieces;
    };
    // Turn ascending candidate indices into ranges that each start at a chosen point.
    const rangesFrom = (candidates) => {
        const ranges = [];
        let rangeStart = 0;
        for (const point of selectSplitPoints(candidates, lines.length, targetLines)) {
            if (point > rangeStart) {
                ranges.push({ start: rangeStart, end: point - 1 });
                rangeStart = point;
            }
        }
        ranges.push({ start: rangeStart, end: lastIndex });
        return ranges;
    };
    const safePoints = findSplitPoints(lines);
    const blankLines = Array.from(safePoints).sort((a, b) => a - b);
    // Prefer double-newline boundaries first.
    const doubleNewlines = blankLines.filter((i) => i > 0 && lines[i].trim() === '' && lines[i - 1].trim() === '');
    if (doubleNewlines.length > 0) {
        const ranges = rangesFrom(doubleNewlines);
        if (!ranges.some(isOversized)) {
            return ranges;
        }
    }
    // Fall back to single blank-line boundaries.
    if (blankLines.length > 0) {
        // Sparse blank lines can still leave a huge range (e.g. one long
        // function); cut those mechanically rather than emitting them whole.
        return rangesFrom(blankLines).flatMap((range) => isOversized(range) ? sliceMechanically(range.start, range.end) : [range]);
    }
    // No good split points; split mechanically on line boundaries.
    return sliceMechanically(0, lastIndex);
}
|
|
215
|
+
/**
 * Greedily choose split points from ascending `candidates`: a candidate is
 * taken when it lies at least `targetLines` past the previously taken point
 * (or past line 0 for the first one).
 *
 * @param candidates - Candidate line indices, in ascending order
 * @param _totalLines - Unused
 * @param targetLines - Minimum distance between consecutive split points
 * @returns The chosen candidates, in their original order
 */
function selectSplitPoints(candidates, _totalLines, targetLines) {
    let anchor = 0;
    return [...candidates].filter((candidate) => {
        const farEnough = candidate - anchor >= targetLines;
        if (farEnough) {
            anchor = candidate;
        }
        return farEnough;
    });
}
|
|
231
|
+
/**
 * Split code content into embedding-friendly chunks with line numbers.
 *
 * Each chunk after the first is extended backwards to re-include up to
 * `overlap_lines` trailing lines of the previous range.
 *
 * @param content - The full source file content (LF or CRLF line endings)
 * @param filePath - Path to the source file; used for the breadcrumb and language detection
 * @param config - Source config; `chunk.target_lines` and `chunk.overlap_lines`
 *   are read when present, otherwise the module defaults (80 and 10) apply
 * @returns Array of chunks `{ content, startLine, endLine, language, chunkIndex }`
 *   with 1-based line numbers; empty when `content` is empty or whitespace-only
 */
export function chunkCode(content, filePath, config) {
    if (!content || !content.trim()) {
        return [];
    }
    // `config?.` so a caller that omits config gets the defaults instead of a TypeError.
    const targetLines = config?.chunk?.target_lines ?? DEFAULT_TARGET_LINES;
    const overlapLines = config?.chunk?.overlap_lines ?? DEFAULT_OVERLAP_LINES;
    const language = detectLanguage(filePath);
    // Accept CRLF as well as LF so chunk text does not carry stray '\r'.
    const lines = content.split(/\r?\n/);
    // Remove trailing empty line if file ends with newline.
    if (lines[lines.length - 1] === '') {
        lines.pop();
    }
    const ranges = splitAtBoundaries(lines, targetLines);
    return ranges.map((range, index) => {
        let start = range.start;
        if (index > 0 && overlapLines > 0) {
            // Reach back into the previous range, but never before its start.
            const previous = ranges[index - 1];
            const overlapStart = Math.max(previous.end - overlapLines + 1, previous.start);
            start = Math.min(start, overlapStart);
        }
        const startLine = start + 1; // 1-indexed
        const endLine = range.end + 1; // 1-indexed
        return {
            content: formatChunk(lines.slice(start, range.end + 1), startLine, filePath),
            startLine,
            endLine,
            language,
            chunkIndex: index,
        };
    });
}
|
|
277
|
+
//# sourceMappingURL=code.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"code.js","sourceRoot":"","sources":["../../../src/indexing/chunking/code.ts"],"names":[],"mappings":"AAAA,2BAA2B;AAY3B,MAAM,oBAAoB,GAAG,EAAE,CAAC;AAChC,MAAM,qBAAqB,GAAG,EAAE,CAAC;AAEjC;;GAEG;AACH,SAAS,cAAc,CAAC,QAAgB;IACpC,MAAM,GAAG,GAAG,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;IAC3D,MAAM,WAAW,GAA2B;QACxC,EAAE,EAAE,YAAY;QAChB,GAAG,EAAE,YAAY;QACjB,EAAE,EAAE,YAAY;QAChB,GAAG,EAAE,YAAY;QACjB,GAAG,EAAE,YAAY;QACjB,GAAG,EAAE,YAAY;QACjB,EAAE,EAAE,QAAQ;QACZ,EAAE,EAAE,MAAM;QACV,EAAE,EAAE,IAAI;QACR,EAAE,EAAE,MAAM;QACV,IAAI,EAAE,MAAM;QACZ,EAAE,EAAE,QAAQ;QACZ,KAAK,EAAE,OAAO;QACd,CAAC,EAAE,GAAG;QACN,GAAG,EAAE,KAAK;QACV,CAAC,EAAE,GAAG;QACN,GAAG,EAAE,KAAK;QACV,EAAE,EAAE,QAAQ;QACZ,EAAE,EAAE,UAAU;QACd,GAAG,EAAE,UAAU;QACf,IAAI,EAAE,MAAM;QACZ,IAAI,EAAE,MAAM;QACZ,GAAG,EAAE,MAAM;QACX,IAAI,EAAE,MAAM;QACZ,GAAG,EAAE,KAAK;QACV,EAAE,EAAE,OAAO;QACX,IAAI,EAAE,OAAO;QACb,GAAG,EAAE,OAAO;QACZ,GAAG,EAAE,KAAK;QACV,IAAI,EAAE,MAAM;QACZ,IAAI,EAAE,MAAM;QACZ,GAAG,EAAE,KAAK;KACb,CAAC;IAEF,OAAO,WAAW,CAAC,GAAG,CAAC,IAAI,GAAG,IAAI,MAAM,CAAC;AAC7C,CAAC;AAWD;;;GAGG;AACH,SAAS,SAAS,CAAC,IAAY,EAAE,GAAW;IACxC,IAAI,WAAW,GAAG,CAAC,CAAC;IACpB,KAAK,IAAI,CAAC,GAAG,GAAG,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC;QACpD,WAAW,EAAE,CAAC;IAClB,CAAC;IACD,OAAO,WAAW,GAAG,CAAC,KAAK,CAAC,CAAC;AACjC,CAAC;AAED;;;GAGG;AACH,SAAS,2BAA2B,CAAC,IAAY;IAC7C,IAAI,MAAM,GAAG,EAAE,CAAC;IAChB,IAAI,QAAQ,GAAG,KAAK,CAAC;IACrB,IAAI,QAAQ,GAAG,KAAK,CAAC;IAErB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACnC,MAAM,EAAE,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;QAEnB,IAAI,CAAC,QAAQ,IAAI,CAAC,QAAQ,IAAI,EAAE,KAAK,GAAG,IAAI,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,GAAG,EAAE,CAAC;YAC9D,MAAM,CAAC,wCAAwC;QACnD,CAAC;QAED,IAAI,CAAC,QAAQ,IAAI,EAAE,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC,CAAC,EAAE,CAAC;YACjD,QAAQ,GAAG,CAAC,QAAQ,CAAC;QACzB,CAAC;aAAM,IAAI,CAAC,QAAQ,IAAI,EAAE,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,CAAC,CAAC,EAAE,CAAC;YACxD,QA
AQ,GAAG,CAAC,QAAQ,CAAC;QACzB,CAAC;QAED,IAAI,CAAC,QAAQ,IAAI,CAAC,QAAQ,EAAE,CAAC;YACzB,MAAM,IAAI,EAAE,CAAC;QACjB,CAAC;IACL,CAAC;IAED,OAAO,MAAM,CAAC;AAClB,CAAC;AAED,SAAS,eAAe,CAAC,IAAY,EAAE,KAAiB;IACpD,MAAM,QAAQ,GAAG,EAAE,GAAG,KAAK,EAAE,CAAC;IAC9B,MAAM,QAAQ,GAAG,2BAA2B,CAAC,IAAI,CAAC,CAAC;IAEnD,IAAI,QAAQ,CAAC,cAAc,EAAE,CAAC;QAC1B,IAAI,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;YAC1B,QAAQ,CAAC,cAAc,GAAG,KAAK,CAAC;QACpC,CAAC;QACD,OAAO,QAAQ,CAAC;IACpB,CAAC;IAED,IAAI,QAAQ,CAAC,gBAAgB,EAAE,CAAC;QAC5B,4BAA4B;QAC5B,MAAM,SAAS,GAAG,CAAC,QAAQ,CAAC,KAAK,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC;QAC7D,IAAI,SAAS,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;YACtB,QAAQ,CAAC,gBAAgB,GAAG,KAAK,CAAC;QACtC,CAAC;QACD,OAAO,QAAQ,CAAC;IACpB,CAAC;IAED,0DAA0D;IAC1D,IAAI,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QACtD,QAAQ,CAAC,cAAc,GAAG,IAAI,CAAC;IACnC,CAAC;SAAM,IAAI,CAAC,QAAQ,CAAC,cAAc,EAAE,CAAC;QAClC,sEAAsE;QACtE,MAAM,SAAS,GAAG,CAAC,QAAQ,CAAC,KAAK,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC;QAC7D,IAAI,SAAS,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;YACtB,QAAQ,CAAC,gBAAgB,GAAG,IAAI,CAAC;QACrC,CAAC;IACL,CAAC;IAED,oFAAoF;IACpF,IAAI,CAAC,QAAQ,CAAC,cAAc,IAAI,CAAC,QAAQ,CAAC,gBAAgB,EAAE,CAAC;QACzD,IAAI,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;YAC/C,MAAM,YAAY,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC;YACvD,MAAM,YAAY,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC;YACvD,IAAI,CAAC,YAAY,GAAG,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,YAAY,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC;gBACvD,QAAQ,CAAC,cAAc,GAAG,IAAI,CAAC,CAAC,mCAAmC;YACvE,CAAC;QACL,CAAC;IACL,CAAC;IAED,OAAO,QAAQ,CAAC;AACpB,CAAC;AAED;;;GAGG;AACH,SAAS,eAAe,CAAC,KAAe;IACpC,MAAM,UAAU,GAAG,IAAI,GAAG,EAAU,CAAC;IACrC,IAAI,KAAK,GAAe,EAAE,cAAc,EAAE,KAAK,EAAE,gBAAgB,EAAE,KAAK,EAAE,CAAC;IAE3E,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACpC,MAAM,SAAS,GAAG,EAAE,GAAG,KAAK,EAAE,CAAC;QAC/B,KAAK,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,KAAK,CAAC,CAA
C;QAEzC,kDAAkD;QAClD,IAAI,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,KAAK,EAAE,IAAI,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE,CAAC;YAChE,IAAI,CAAC,KAAK,CAAC,cAAc,IAAI,CAAC,KAAK,CAAC,gBAAgB;gBAChD,CAAC,SAAS,CAAC,cAAc,IAAI,CAAC,SAAS,CAAC,gBAAgB,EAAE,CAAC;gBAC3D,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;YACtB,CAAC;QACL,CAAC;QAED,iDAAiD;QACjD,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,cAAc,IAAI,CAAC,KAAK,CAAC,gBAAgB,EAAE,CAAC;YAC7E,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;QACtB,CAAC;IACL,CAAC;IAED,OAAO,UAAU,CAAC;AACtB,CAAC;AAED;;GAEG;AACH,SAAS,WAAW,CAAC,KAAe,EAAE,SAAiB,EAAE,QAAgB;IACrE,MAAM,UAAU,GAAG,YAAY,QAAQ,EAAE,CAAC;IAC1C,MAAM,UAAU,GAAG,SAAS,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC;IAChD,MAAM,QAAQ,GAAG,MAAM,CAAC,UAAU,CAAC,CAAC,MAAM,CAAC;IAE3C,MAAM,QAAQ,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE;QACnC,MAAM,OAAO,GAAG,MAAM,CAAC,SAAS,GAAG,CAAC,CAAC,CAAC,QAAQ,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;QAC9D,OAAO,GAAG,OAAO,MAAM,IAAI,EAAE,CAAC;IAClC,CAAC,CAAC,CAAC;IAEH,OAAO,UAAU,GAAG,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AACnD,CAAC;AAED;;GAEG;AACH,SAAS,iBAAiB,CAAC,KAAe,EAAE,WAAmB;IAC3D,IAAI,KAAK,CAAC,MAAM,IAAI,WAAW,EAAE,CAAC;QAC9B,OAAO,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,GAAG,EAAE,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,CAAC;IACjD,CAAC;IAED,MAAM,UAAU,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;IAC1C,MAAM,MAAM,GAA0C,EAAE,CAAC;IACzD,IAAI,UAAU,GAAG,CAAC,CAAC;IAEnB,yCAAyC;IACzC,MAAM,cAAc,GAAa,EAAE,CAAC;IACpC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACpC,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,KAAK,EAAE,IAAI,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,KAAK,EAAE,IAAI,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;YAC5E,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC3B,CAAC;IACL,CAAC;IAED,6CAA6C;IAC7C,IAAI,cAAc,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5B,MAAM,WAAW,GAAG,iBAAiB,CAAC,cAAc,EAAE,KAAK,CAAC,MAAM,EAAE,WAAW,CAAC,CAAC;QACjF,KAAK,MAAM,KAAK,IAAI,WAAW,EAAE,CAAC;YAC9B,IAAI,KAAK,GAAG,UAAU,EAAE,CAAC;gBACrB,MAAM,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,UA
AU,EAAE,GAAG,EAAE,KAAK,GAAG,CAAC,EAAE,CAAC,CAAC;gBACnD,UAAU,GAAG,KAAK,CAAC;YACvB,CAAC;QACL,CAAC;QACD,MAAM,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,UAAU,EAAE,GAAG,EAAE,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,CAAC;QAE1D,wCAAwC;QACxC,MAAM,eAAe,GAAG,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,WAAW,GAAG,GAAG,CAAC,CAAC;QACpF,IAAI,CAAC,eAAe;YAAE,OAAO,MAAM,CAAC;IACxC,CAAC;IAED,4CAA4C;IAC5C,MAAM,UAAU,GAAG,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAChE,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACxB,MAAM,aAAa,GAA0C,EAAE,CAAC;QAChE,UAAU,GAAG,CAAC,CAAC;QAEf,MAAM,WAAW,GAAG,iBAAiB,CAAC,UAAU,EAAE,KAAK,CAAC,MAAM,EAAE,WAAW,CAAC,CAAC;QAC7E,KAAK,MAAM,KAAK,IAAI,WAAW,EAAE,CAAC;YAC9B,IAAI,KAAK,GAAG,UAAU,EAAE,CAAC;gBACrB,aAAa,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,UAAU,EAAE,GAAG,EAAE,KAAK,GAAG,CAAC,EAAE,CAAC,CAAC;gBAC1D,UAAU,GAAG,KAAK,CAAC;YACvB,CAAC;QACL,CAAC;QACD,aAAa,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,UAAU,EAAE,GAAG,EAAE,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,CAAC;QACjE,OAAO,aAAa,CAAC;IACzB,CAAC;IAED,8DAA8D;IAC9D,MAAM,gBAAgB,GAA0C,EAAE,CAAC;IACnE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,IAAI,WAAW,EAAE,CAAC;QACjD,gBAAgB,CAAC,IAAI,CAAC;YAClB,KAAK,EAAE,CAAC;YACR,GAAG,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,WAAW,GAAG,CAAC,EAAE,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC;SACvD,CAAC,CAAC;IACP,CAAC;IACD,OAAO,gBAAgB,CAAC;AAC5B,CAAC;AAED;;;GAGG;AACH,SAAS,iBAAiB,CAAC,UAAoB,EAAE,WAAmB,EAAE,WAAmB;IACrF,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;QACjC,MAAM,QAAQ,GAAG,SAAS,GAAG,SAAS,CAAC;QACvC,IAAI,QAAQ,IAAI,WAAW,EAAE,CAAC;YAC1B,QAAQ,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACzB,SAAS,GAAG,SAAS,CAAC;QAC1B,CAAC;IACL,CAAC;IAED,OAAO,QAAQ,CAAC;AACpB,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,SAAS,CAAC,OAAe,EAAE,QAAgB,EAAE,MAAoB;IAC7E,IAAI,CAAC,OAAO,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;QAC9B,OAAO,EAAE,CAAC;IACd,CAAC;IAED,MAAM,WAAW,GAAG,MAAM,CAAC,KAAK,EAAE,YAAY,IAAI,oBAAoB,CAAC;IAC
vE,MAAM,YAAY,GAAG,MAAM,CAAC,KAAK,EAAE,aAAa,IAAI,qBAAqB,CAAC;IAE1E,MAAM,QAAQ,GAAG,cAAc,CAAC,QAAQ,CAAC,CAAC;IAC1C,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAElC,uDAAuD;IACvD,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC;QACrD,KAAK,CAAC,GAAG,EAAE,CAAC;IAChB,CAAC;IAED,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACrB,OAAO,EAAE,CAAC;IACd,CAAC;IAED,oBAAoB;IACpB,MAAM,MAAM,GAAG,iBAAiB,CAAC,KAAK,EAAE,WAAW,CAAC,CAAC;IAErD,iCAAiC;IACjC,MAAM,MAAM,GAAkB,EAAE,CAAC;IAEjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,IAAI,EAAE,KAAK,EAAE,GAAG,EAAE,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;QAE/B,oCAAoC;QACpC,IAAI,CAAC,GAAG,CAAC,IAAI,YAAY,GAAG,CAAC,EAAE,CAAC;YAC5B,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,GAAG,YAAY,GAAG,CAAC,EAAE,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;YACzF,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,YAAY,CAAC,CAAC;QAC1C,CAAC;QAED,MAAM,UAAU,GAAG,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,GAAG,CAAC,CAAC,CAAC;QAC/C,MAAM,SAAS,GAAG,KAAK,GAAG,CAAC,CAAC,CAAC,YAAY;QACzC,MAAM,OAAO,GAAG,GAAG,GAAG,CAAC,CAAC,CAAK,YAAY;QAEzC,MAAM,CAAC,IAAI,CAAC;YACR,OAAO,EAAE,WAAW,CAAC,UAAU,EAAE,SAAS,EAAE,QAAQ,CAAC;YACrD,SAAS;YACT,OAAO;YACP,QAAQ;YACR,UAAU,EAAE,MAAM,CAAC,MAAM;SAC5B,CAAC,CAAC;IACP,CAAC;IAED,OAAO,MAAM,CAAC;AAClB,CAAC"}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import { type ChunkOutput, type SourceConfig } from '../../types.js';
|
|
2
|
+
type ChunkerFn = (content: string, filePath: string, config: SourceConfig) => ChunkOutput[];
|
|
3
|
+
export declare function registerChunker(type: string, fn: ChunkerFn): void;
|
|
4
|
+
export declare function getChunker(type: string): ChunkerFn;
|
|
5
|
+
export {};
|
|
6
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/indexing/chunking/index.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,KAAK,WAAW,EAAE,KAAK,YAAY,EAAE,MAAM,gBAAgB,CAAC;AAErE,KAAK,SAAS,GAAG,CAAC,OAAO,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,EAAE,YAAY,KAAK,WAAW,EAAE,CAAC;AAI5F,wBAAgB,eAAe,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,EAAE,SAAS,GAAG,IAAI,CAEjE;AAED,wBAAgB,UAAU,CAAC,IAAI,EAAE,MAAM,GAAG,SAAS,CAIlD"}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
// Chunker registry — maps source.type to a chunking function.
|
|
2
|
+
const registry = new Map();
/**
 * Register `fn` as the chunker for source type `type`, replacing any chunker
 * already registered under that type.
 */
export function registerChunker(type, fn) {
    registry.set(type, fn);
}
/**
 * Look up the chunker registered for `type`.
 *
 * @throws Error when no chunker is registered; the message lists the
 *   registered types.
 */
export function getChunker(type) {
    const chunker = registry.get(type);
    if (chunker) {
        return chunker;
    }
    const available = Array.from(registry.keys()).join(', ');
    throw new Error(`Unknown chunker type: "${type}". Available: ${available}`);
}
|
|
12
|
+
// Register built-ins on import
|
|
13
|
+
import { chunkMarkdown } from './markdown.js';
|
|
14
|
+
import { chunkCode } from './code.js';
|
|
15
|
+
import { chunkRawText } from './raw-text.js';
|
|
16
|
+
registerChunker('markdown', chunkMarkdown);
|
|
17
|
+
registerChunker('code', chunkCode);
|
|
18
|
+
registerChunker('raw-text', chunkRawText);
|
|
19
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/indexing/chunking/index.ts"],"names":[],"mappings":"AAAA,8DAA8D;AAM9D,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAqB,CAAC;AAE9C,MAAM,UAAU,eAAe,CAAC,IAAY,EAAE,EAAa;IACvD,QAAQ,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;AAC3B,CAAC;AAED,MAAM,UAAU,UAAU,CAAC,IAAY;IACnC,MAAM,EAAE,GAAG,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;IAC9B,IAAI,CAAC,EAAE;QAAE,MAAM,IAAI,KAAK,CAAC,0BAA0B,IAAI,iBAAiB,CAAC,GAAG,QAAQ,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAC3G,OAAO,EAAE,CAAC;AACd,CAAC;AAED,+BAA+B;AAC/B,OAAO,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AAC9C,OAAO,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AACtC,OAAO,EAAE,YAAY,EAAE,MAAM,eAAe,CAAC;AAE7C,eAAe,CAAC,UAAU,EAAE,aAAa,CAAC,CAAC;AAC3C,eAAe,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;AACnC,eAAe,CAAC,UAAU,EAAE,YAAY,CAAC,CAAC"}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { type ChunkOutput, type SourceConfig } from '../../types.js';
|
|
2
|
+
export interface MarkdownChunk {
|
|
3
|
+
content: string;
|
|
4
|
+
title: string;
|
|
5
|
+
headingPath: string[];
|
|
6
|
+
chunkIndex: number;
|
|
7
|
+
}
|
|
8
|
+
/**
|
|
9
|
+
* Split markdown/MDX content into embedding-friendly chunks.
|
|
10
|
+
*
|
|
11
|
+
* @param content - The full markdown/MDX file content
|
|
12
|
+
* @param filePath - Path to the source file (used for metadata)
|
|
13
|
+
* @returns Array of ChunkOutput objects
|
|
14
|
+
*/
|
|
15
|
+
export declare function chunkMarkdown(content: string, filePath: string, config: SourceConfig): ChunkOutput[];
|
|
16
|
+
//# sourceMappingURL=markdown.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"markdown.d.ts","sourceRoot":"","sources":["../../../src/indexing/chunking/markdown.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,KAAK,WAAW,EAAE,KAAK,YAAY,EAAE,MAAM,gBAAgB,CAAC;AAErE,MAAM,WAAW,aAAa;IAC1B,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;CACtB;AAsRD;;;;;;GAMG;AACH,wBAAgB,aAAa,CAAC,OAAO,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,EAAE,YAAY,GAAG,WAAW,EAAE,CAuDpG"}
|
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
// Recursive markdown/MDX splitter
|
|
2
|
+
const DEFAULT_TARGET_TOKENS = 600;
|
|
3
|
+
const DEFAULT_OVERLAP_TOKENS = 50;
|
|
4
|
+
/**
 * Parse YAML frontmatter from markdown content.
 *
 * Frontmatter is recognised only at the very start of `content`, delimited by
 * `---` lines; an empty block (`---` directly followed by `---`) is accepted.
 * Only a `title:` key is read, and its value must be on the same line.
 *
 * @param content - Full markdown/MDX text
 * @returns `{ title, body }`: `title` is the trimmed title value with one
 *   optional surrounding quote removed on each side, or null when absent;
 *   `body` is `content` with the frontmatter block removed
 */
function parseFrontmatter(content) {
    // The inner group is optional so that empty frontmatter is stripped too.
    const match = content.match(/^---\r?\n(?:([\s\S]*?)\r?\n)?---\r?\n?/);
    if (!match) {
        return { title: null, body: content };
    }
    const frontmatter = match[1] ?? '';
    const body = content.slice(match[0].length);
    // `[ \t]*` rather than `\s*`: whitespace after `title:` must not cross a
    // newline, otherwise an empty `title:` captures the next key's line.
    const titleMatch = frontmatter.match(/^title:[ \t]*["']?(.+?)["']?[ \t]*$/m);
    return {
        title: titleMatch ? titleMatch[1].trim() : null,
        body,
    };
}
|
|
20
|
+
/**
 * Extract the first ATX heading (`#` through `######`) from content, for use
 * as a fallback title.
 *
 * @param content - Markdown text
 * @returns The trimmed heading text, or null when no heading line is found
 */
function extractFirstHeading(content) {
    // `[ \t]+` rather than `\s+`: the separator must stay on the heading's
    // own line, otherwise a bare `#` line adopts the following line as text.
    const match = content.match(/^#{1,6}[ \t]+(.+)$/m);
    return match ? match[1].trim() : null;
}
|
|
27
|
+
/**
 * Remove MDX-only syntax from content: `import ... from '...'` statements,
 * self-closing component tags, and the open/close tags of capitalised
 * components (their inner content is kept).
 *
 * @param content - MDX text
 * @returns The stripped text, with runs of 3+ newlines collapsed to two and
 *   surrounding whitespace trimmed
 */
function stripMdx(content) {
    const importStatement = /^import\s+[\s\S]*?from\s+['"][^'"]+['"];?\s*$/gm;
    const selfClosingTag = /<[A-Z][A-Za-z0-9]*(?:\s+[^>]*)?\s*\/>/g;
    const pairedTag = /<([A-Z][A-Za-z0-9]*)(?:\s+[^>]*)?>([^]*?)<\/\1>/g;
    let text = content.replace(importStatement, '').replace(selfClosingTag, '');
    // One pass unwraps one level of tags, so repeat until nothing changes in
    // order to handle nested components.
    for (;;) {
        const unwrapped = text.replace(pairedTag, '$2');
        if (unwrapped === text) {
            break;
        }
        text = unwrapped;
    }
    // Stripping leaves gaps behind; collapse them.
    return text.replace(/\n{3,}/g, '\n\n').trim();
}
|
|
47
|
+
function segmentCodeBlocks(content) {
|
|
48
|
+
const segments = [];
|
|
49
|
+
const codeBlockRegex = /^(`{3,})[^\n]*\n(?:[\s\S]*?\n)?\1\s*$/gm;
|
|
50
|
+
let lastIndex = 0;
|
|
51
|
+
let match;
|
|
52
|
+
while ((match = codeBlockRegex.exec(content)) !== null) {
|
|
53
|
+
if (match.index > lastIndex) {
|
|
54
|
+
segments.push({ text: content.slice(lastIndex, match.index), isCodeBlock: false });
|
|
55
|
+
}
|
|
56
|
+
segments.push({ text: match[0], isCodeBlock: true });
|
|
57
|
+
lastIndex = match.index + match[0].length;
|
|
58
|
+
}
|
|
59
|
+
if (lastIndex < content.length) {
|
|
60
|
+
segments.push({ text: content.slice(lastIndex), isCodeBlock: false });
|
|
61
|
+
}
|
|
62
|
+
return segments;
|
|
63
|
+
}
|
|
64
|
+
/**
 * Split text on a delimiter while keeping fenced code blocks intact.
 *
 * Prose segments are split on `delimiter`; code-block segments are appended
 * whole to whichever piece is currently being accumulated. When `delimiter`
 * is a string starting with `#`, it is put back at the front of every piece
 * that follows a split point.
 *
 * @param content - Text to split
 * @param delimiter - String or RegExp passed to `String.prototype.split`
 * @returns The pieces, excluding any that are empty or whitespace-only.
 */
function splitPreservingCodeBlocks(content, delimiter) {
    const reattachDelimiter = typeof delimiter === 'string' && delimiter.startsWith('#');
    const pieces = [];
    let buffer = '';
    for (const { text, isCodeBlock } of segmentCodeBlocks(content)) {
        if (isCodeBlock) {
            buffer += text;
            continue;
        }
        const fragments = text.split(delimiter);
        // The first fragment continues whatever was accumulated so far.
        buffer += fragments[0];
        // Every later fragment closes the previous piece and opens a new one.
        for (const fragment of fragments.slice(1)) {
            pieces.push(buffer);
            buffer = reattachDelimiter ? delimiter + fragment : fragment;
        }
    }
    if (buffer) {
        pieces.push(buffer);
    }
    return pieces.filter(piece => piece.trim().length > 0);
}
|
|
103
|
+
/**
 * Compute the heading hierarchy in effect at a given position in the content.
 *
 * Every line that starts at or before `position` is considered, so a heading
 * that begins exactly at `position` (a chunk that opens with its own heading)
 * is part of the result. Lines inside fenced code blocks are skipped, which
 * keeps `# comment` lines in code samples out of the path.
 *
 * @param fullContent - The complete markdown text
 * @param position - Character offset into `fullContent`
 * @returns Heading texts from the outermost to the innermost open heading;
 *   empty when no heading applies.
 */
function getHeadingPathAtPosition(fullContent, position) {
    const headings = [];
    // Backtick run that opened the fence we are currently inside, if any.
    let openFence = null;
    let lineStart = 0;
    for (const rawLine of fullContent.split('\n')) {
        if (lineStart > position)
            break;
        lineStart += rawLine.length + 1;
        const line = rawLine.endsWith('\r') ? rawLine.slice(0, -1) : rawLine;
        if (openFence !== null) {
            // A fence is closed by a line holding the same backtick run.
            if (line.trimEnd() === openFence)
                openFence = null;
            continue;
        }
        const fence = /^(`{3,})[^`]*$/.exec(line);
        if (fence) {
            openFence = fence[1];
            continue;
        }
        const heading = /^(#{1,6})[ \t]+(.+)$/.exec(line);
        if (!heading)
            continue;
        const level = heading[1].length;
        const text = heading[2].trim();
        if (!text)
            continue;
        // Remove headings at same or deeper level (new section at this level)
        while (headings.length > 0 && headings[headings.length - 1].level >= level) {
            headings.pop();
        }
        headings.push({ level, text });
    }
    return headings.map(h => h.text);
}
|
|
122
|
+
/**
 * Cut text into sections at headings of one specific level.
 *
 * The split happens immediately before each line that starts with `level`
 * hash characters followed by a space, so every section after the first
 * begins with its own heading line. Headings inside fenced code blocks are
 * never used as split points.
 *
 * @param content - Markdown text to split
 * @param level - Heading level to split on (number of `#` characters)
 * @returns The sections in order, excluding whitespace-only ones.
 */
function splitOnHeading(content, level) {
    const marker = `${'#'.repeat(level)} `;
    // Zero-width lookahead: the heading marker stays with the section it opens.
    const boundary = new RegExp(`(?=^${marker})`, 'gm');
    const sections = [];
    let buffer = '';
    for (const { text, isCodeBlock } of segmentCodeBlocks(content)) {
        if (isCodeBlock) {
            buffer += text;
            continue;
        }
        const fragments = text.split(boundary);
        buffer += fragments[0];
        for (const fragment of fragments.slice(1)) {
            if (buffer.trim())
                sections.push(buffer);
            buffer = fragment;
        }
    }
    if (buffer.trim())
        sections.push(buffer);
    return sections;
}
|
|
155
|
+
/**
 * Recursively split content to fit within target chunk size.
 * Priority: h2 -> h3 -> paragraph -> line
 *
 * Each level is tried only when the coarser ones produced a single part.
 * Line-level packing re-joins lines with a single `\n`, exactly as they
 * appeared in the input, so the resulting chunks remain verbatim substrings
 * of the content.
 *
 * @param content - Text to split
 * @param targetChars - Maximum desired chunk length in characters
 * @param depth - Coarsest split level still allowed: 0 = h2, 1 = h3,
 *   2 = paragraph, 3 = line
 * @returns Chunks in document order. A chunk can still exceed `targetChars`
 *   when it is a single line that cannot be split further.
 */
function recursiveSplit(content, targetChars, depth = 0) {
    if (content.length <= targetChars) {
        return [content];
    }
    if (depth === 0) {
        // Split on ## headings
        const sections = splitOnHeading(content, 2);
        if (sections.length > 1) {
            return sections.flatMap(p => recursiveSplit(p, targetChars, 1));
        }
    }
    if (depth <= 1) {
        // Split on ### headings
        const subsections = splitOnHeading(content, 3);
        if (subsections.length > 1) {
            return subsections.flatMap(p => recursiveSplit(p, targetChars, 2));
        }
    }
    if (depth <= 2) {
        // Split on paragraph boundaries
        const paragraphs = splitPreservingCodeBlocks(content, /\n\n+/);
        if (paragraphs.length > 1) {
            return mergeSmallParts(paragraphs, targetChars).flatMap(p => recursiveSplit(p, targetChars, 3));
        }
    }
    // Split on line boundaries
    const lines = content.split('\n');
    if (lines.length === 1) {
        // Content is a single very long line; return as-is
        return [content];
    }
    // Greedily pack consecutive lines, joined by the single newline that
    // separated them originally (a paragraph separator here would double the
    // line breaks and alter the text).
    const packed = [];
    let current = lines[0];
    for (let i = 1; i < lines.length; i++) {
        const line = lines[i];
        if (current.length + 1 + line.length > targetChars) {
            packed.push(current);
            current = line;
        }
        else {
            current += '\n' + line;
        }
    }
    packed.push(current);
    return packed.filter(chunk => chunk.trim().length > 0);
}
|
|
193
|
+
/**
 * Greedily combine neighbouring parts into larger pieces.
 *
 * Parts are appended to the piece being built for as long as the combined
 * length stays within `targetSize`. Parts are joined with a blank line
 * (`\n\n`) unless the piece already ends with a newline, in which case they
 * are joined directly.
 *
 * @param parts - Text fragments in document order
 * @param targetSize - Maximum combined length in characters
 * @returns The merged pieces; a trailing whitespace-only piece is dropped.
 */
function mergeSmallParts(parts, targetSize) {
    const merged = [];
    let current = '';
    for (const part of parts) {
        if (!current) {
            // Nothing accumulated yet: the part starts a new piece.
            current = part;
            continue;
        }
        const separator = current.endsWith('\n') ? '' : '\n\n';
        const combinedLength = current.length + separator.length + part.length;
        if (combinedLength > targetSize) {
            merged.push(current);
            current = part;
            continue;
        }
        current = current + separator + part;
    }
    if (current.trim()) {
        merged.push(current);
    }
    return merged;
}
|
|
214
|
+
/**
 * Apply overlap between consecutive chunks.
 *
 * Every chunk after the first is prefixed with the tail of the chunk before
 * it. The tail is cut at a clean break point: the last newline inside it or,
 * when there is none and the tail starts in the middle of a word, the first
 * space. A newline is inserted between the overlap and the chunk when neither
 * side supplies whitespace, so the overlap text is not fused with the chunk's
 * first line.
 *
 * @param chunks - Chunks in document order
 * @param overlapChars - Maximum number of trailing characters to carry over
 * @returns A new array of overlapped chunks, or `chunks` itself when there is
 *   at most one chunk or `overlapChars` is not positive.
 */
function applyOverlap(chunks, overlapChars) {
    if (chunks.length <= 1 || overlapChars <= 0)
        return chunks;
    const result = [chunks[0]];
    for (let i = 1; i < chunks.length; i++) {
        const prevChunk = chunks[i - 1];
        const tail = prevChunk.slice(-overlapChars);
        let overlap = tail;
        // Find a clean break point (newline or space) in the overlap
        const newlineAt = tail.lastIndexOf('\n');
        if (newlineAt > 0) {
            overlap = tail.slice(newlineAt);
        }
        else if (tail.length < prevChunk.length &&
            !/\s/.test(prevChunk.charAt(prevChunk.length - tail.length - 1))) {
            // The tail was cut out of the middle of a word: drop the partial
            // word when there is a space to resume from.
            const spaceAt = tail.indexOf(' ');
            if (spaceAt >= 0) {
                overlap = tail.slice(spaceAt + 1);
            }
        }
        const needsSeparator = overlap.length > 0 && !/\s$/.test(overlap) && !/^\s/.test(chunks[i]);
        result.push(overlap + (needsSeparator ? '\n' : '') + chunks[i]);
    }
    return result;
}
|
|
231
|
+
/**
 * Split markdown/MDX content into embedding-friendly chunks.
 *
 * Pipeline: strip frontmatter, strip MDX syntax, split recursively to the
 * target size, add overlap between neighbours, then attach a heading path to
 * every chunk by locating its non-overlapped text in the cleaned body.
 *
 * @param content - The full markdown/MDX file content
 * @param filePath - Path to the source file; its last `/`-separated segment
 *   is the title of last resort
 * @param config - Source configuration. `config.chunk.target_tokens` and
 *   `config.chunk.overlap_tokens` override the defaults; both are converted
 *   to a character budget by multiplying by 4.
 * @returns Array of MarkdownChunk objects (`content`, `title`, `headingPath`,
 *   `chunkIndex`); empty when the content is blank or nothing remains after
 *   stripping.
 */
export function chunkMarkdown(content, filePath, config) {
    if (!content || !content.trim()) {
        return [];
    }
    // `config?.` so that a missing config falls back to the defaults instead
    // of throwing.
    const targetChars = (config?.chunk?.target_tokens ?? DEFAULT_TARGET_TOKENS) * 4;
    const overlapChars = (config?.chunk?.overlap_tokens ?? DEFAULT_OVERLAP_TOKENS) * 4;
    // Parse frontmatter
    const { title: fmTitle, body } = parseFrontmatter(content);
    // Strip MDX syntax
    const cleanBody = stripMdx(body);
    if (!cleanBody.trim()) {
        return [];
    }
    // Determine title: frontmatter, then first heading, then file name
    const title = fmTitle || extractFirstHeading(cleanBody) || filePath.split('/').pop() || filePath;
    // Recursively split the content
    const rawChunks = recursiveSplit(cleanBody, targetChars);
    // Apply overlap
    const overlappedChunks = applyOverlap(rawChunks, overlapChars);
    // Build heading paths by finding where each raw chunk starts in the original
    const chunks = [];
    let searchFrom = 0;
    for (let i = 0; i < overlappedChunks.length; i++) {
        const chunkText = overlappedChunks[i].trim();
        if (!chunkText)
            continue;
        // Find the position of this chunk's primary content in the clean body
        // Use the raw (non-overlapped) chunk to find position
        const rawText = rawChunks[i]?.trim() || chunkText;
        const pos = cleanBody.indexOf(rawText, searchFrom);
        const headingPath = pos >= 0
            ? getHeadingPathAtPosition(cleanBody, pos)
            : [];
        if (pos >= 0) {
            // Continue after the chunk just located: raw chunks do not
            // overlap, and resuming at `pos` would let an identical later
            // chunk resolve to this same position and inherit its path.
            searchFrom = pos + rawText.length;
        }
        chunks.push({
            content: chunkText,
            title,
            headingPath,
            chunkIndex: chunks.length,
        });
    }
    return chunks;
}
|
|
283
|
+
//# sourceMappingURL=markdown.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"markdown.js","sourceRoot":"","sources":["../../../src/indexing/chunking/markdown.ts"],"names":[],"mappings":"AAAA,kCAAkC;AAWlC,MAAM,qBAAqB,GAAG,GAAG,CAAC;AAClC,MAAM,sBAAsB,GAAG,EAAE,CAAC;AAElC;;;GAGG;AACH,SAAS,gBAAgB,CAAC,OAAe;IACrC,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,mCAAmC,CAAC,CAAC;IACjE,IAAI,CAAC,KAAK;QAAE,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC;IAElD,MAAM,WAAW,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;IAC7B,MAAM,IAAI,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;IAE5C,MAAM,UAAU,GAAG,WAAW,CAAC,KAAK,CAAC,gCAAgC,CAAC,CAAC;IACvE,OAAO;QACH,KAAK,EAAE,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,IAAI;QAC/C,IAAI;KACP,CAAC;AACN,CAAC;AAED;;GAEG;AACH,SAAS,mBAAmB,CAAC,OAAe;IACxC,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,kBAAkB,CAAC,CAAC;IAChD,OAAO,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;AAC1C,CAAC;AAED;;;GAGG;AACH,SAAS,QAAQ,CAAC,OAAe;IAC7B,kDAAkD;IAClD,IAAI,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC,iDAAiD,EAAE,EAAE,CAAC,CAAC;IAEpF,iDAAiD;IACjD,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,wCAAwC,EAAE,EAAE,CAAC,CAAC;IAEtE,6DAA6D;IAC7D,8DAA8D;IAC9D,IAAI,IAAI,GAAG,EAAE,CAAC;IACd,OAAO,IAAI,KAAK,MAAM,EAAE,CAAC;QACrB,IAAI,GAAG,MAAM,CAAC;QACd,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,kDAAkD,EAAE,IAAI,CAAC,CAAC;IACtF,CAAC;IAED,mDAAmD;IACnD,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;IAE3C,OAAO,MAAM,CAAC,IAAI,EAAE,CAAC;AACzB,CAAC;AAWD,SAAS,iBAAiB,CAAC,OAAe;IACtC,MAAM,QAAQ,GAAqB,EAAE,CAAC;IACtC,MAAM,cAAc,GAAG,yCAAyC,CAAC;IAEjE,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,IAAI,KAA6B,CAAC;IAElC,OAAO,CAAC,KAAK,GAAG,cAAc,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACrD,IAAI,KAAK,CAAC,KAAK,GAAG,SAAS,EAAE,CAAC;YAC1B,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,OAAO,CAAC,KAAK,CAAC,SAAS,EAAE,KAAK,CAAC,KAAK,CAAC,EAAE,WAAW,EAAE,KAAK,EAAE,CAAC,CAAC;QACvF,CAAC;QACD,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC,CAAC;QACrD,SAAS,GAAG,KAAK,CAAC,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;IAC9C,CAAC;IAED,IA
AI,SAAS,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;QAC7B,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,OAAO,CAAC,KAAK,CAAC,SAAS,CAAC,EAAE,WAAW,EAAE,KAAK,EAAE,CAAC,CAAC;IAC1E,CAAC;IAED,OAAO,QAAQ,CAAC;AACpB,CAAC;AAED;;GAEG;AACH,SAAS,yBAAyB,CAAC,OAAe,EAAE,SAA0B;IAC1E,MAAM,QAAQ,GAAG,iBAAiB,CAAC,OAAO,CAAC,CAAC;IAC5C,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,IAAI,OAAO,GAAG,EAAE,CAAC;IAEjB,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC7B,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC;YACtB,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;QAC5B,CAAC;aAAM,CAAC;YACJ,MAAM,QAAQ,GAAG,OAAO,SAAS,KAAK,QAAQ;gBAC1C,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC;gBAC/B,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;YAEpC,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACxB,OAAO,IAAI,QAAQ,CAAC,CAAC,CAAC,CAAC;YAC3B,CAAC;iBAAM,CAAC;gBACJ,mDAAmD;gBACnD,OAAO,IAAI,QAAQ,CAAC,CAAC,CAAC,CAAC;gBACvB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;oBACvC,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;oBACpB,mDAAmD;oBACnD,IAAI,OAAO,SAAS,KAAK,QAAQ,IAAI,SAAS,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;wBAC7D,OAAO,GAAG,SAAS,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC;oBACtC,CAAC;yBAAM,CAAC;wBACJ,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC;oBAC1B,CAAC;gBACL,CAAC;YACL,CAAC;QACL,CAAC;IACL,CAAC;IACD,IAAI,OAAO,EAAE,CAAC;QACV,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IACxB,CAAC;IAED,OAAO,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;AAClD,CAAC;AAQD;;GAEG;AACH,SAAS,wBAAwB,CAAC,WAAmB,EAAE,QAAgB;IACnE,MAAM,aAAa,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;IACrD,MAAM,YAAY,GAAG,qBAAqB,CAAC;IAC3C,MAAM,QAAQ,GAAkB,EAAE,CAAC;IACnC,IAAI,KAA6B,CAAC;IAElC,OAAO,CAAC,KAAK,GAAG,YAAY,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACzD,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;QAC9B,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAE7B,sEAAsE;QACtE,OAAO,QAAQ,CAAC,MAAM,GAAG,CAAC,IAAI,QAAQ,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,KAAK,IAAI,KAAK,EAAE,CAAC;YACzE,QAAQ,CAAC,GAAG,EAAE,CAAC;QACnB,CAAC;QACD,QAAQ,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC
,CAAC;IACnC,CAAC;IAED,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;AACrC,CAAC;AAED;;;GAGG;AACH,SAAS,cAAc,CAAC,OAAe,EAAE,KAAa;IAClD,MAAM,MAAM,GAAG,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC;IACvC,MAAM,KAAK,GAAG,IAAI,MAAM,CAAC,OAAO,MAAM,CAAC,OAAO,CAAC,IAAI,EAAE,GAAG,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC;IAEpE,MAAM,QAAQ,GAAG,iBAAiB,CAAC,OAAO,CAAC,CAAC;IAC5C,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,IAAI,OAAO,GAAG,EAAE,CAAC;IAEjB,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC7B,IAAI,OAAO,CAAC,WAAW,EAAE,CAAC;YACtB,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;QAC5B,CAAC;aAAM,CAAC;YACJ,MAAM,QAAQ,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;YAC3C,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACxB,OAAO,IAAI,QAAQ,CAAC,CAAC,CAAC,CAAC;YAC3B,CAAC;iBAAM,CAAC;gBACJ,OAAO,IAAI,QAAQ,CAAC,CAAC,CAAC,CAAC;gBACvB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;oBACvC,IAAI,OAAO,CAAC,IAAI,EAAE;wBAAE,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;oBACxC,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC;gBAC1B,CAAC;YACL,CAAC;QACL,CAAC;IACL,CAAC;IACD,IAAI,OAAO,CAAC,IAAI,EAAE;QAAE,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IAExC,OAAO,KAAK,CAAC;AACjB,CAAC;AAED;;;GAGG;AACH,SAAS,cAAc,CAAC,OAAe,EAAE,WAAmB,EAAE,QAAgB,CAAC;IAC3E,IAAI,OAAO,CAAC,MAAM,IAAI,WAAW,EAAE,CAAC;QAChC,OAAO,CAAC,OAAO,CAAC,CAAC;IACrB,CAAC;IAED,IAAI,KAAe,CAAC;IAEpB,IAAI,KAAK,KAAK,CAAC,EAAE,CAAC;QACd,uBAAuB;QACvB,KAAK,GAAG,cAAc,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QACnC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACnB,OAAO,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,cAAc,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,CAAC,CAAC,CAAC;QACjE,CAAC;IACL,CAAC;IAED,IAAI,KAAK,IAAI,CAAC,EAAE,CAAC;QACb,wBAAwB;QACxB,KAAK,GAAG,cAAc,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QACnC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACnB,OAAO,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,cAAc,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,CAAC,CAAC,CAAC;QACjE,CAAC;IACL,CAAC;IAED,IAAI,KAAK,IAAI,CAAC,EAAE,CAAC;QACb,gCAAgC;QAChC,KAAK,GAAG,yBAAyB,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;QACpD,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACnB,OAAO,eAAe,CAAC,
KAAK,EAAE,WAAW,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,cAAc,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,CAAC,CAAC,CAAC;QAC/F,CAAC;IACL,CAAC;IAED,2BAA2B;IAC3B,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAClC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACnB,OAAO,eAAe,CAAC,KAAK,EAAE,WAAW,CAAC,CAAC;IAC/C,CAAC;IAED,mDAAmD;IACnD,OAAO,CAAC,OAAO,CAAC,CAAC;AACrB,CAAC;AAED;;GAEG;AACH,SAAS,eAAe,CAAC,KAAe,EAAE,UAAkB;IACxD,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,IAAI,OAAO,GAAG,EAAE,CAAC;IAEjB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACvB,MAAM,SAAS,GAAG,OAAO,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC;QACnE,IAAI,OAAO,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,SAAS,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,UAAU,EAAE,CAAC;YAC5E,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YACrB,OAAO,GAAG,IAAI,CAAC;QACnB,CAAC;aAAM,CAAC;YACJ,OAAO,GAAG,OAAO,CAAC,CAAC,CAAC,OAAO,GAAG,SAAS,GAAG,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC;QAC1D,CAAC;IACL,CAAC;IACD,IAAI,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;QACjB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IACzB,CAAC;IAED,OAAO,MAAM,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,SAAS,YAAY,CAAC,MAAgB,EAAE,YAAoB;IACxD,IAAI,MAAM,CAAC,MAAM,IAAI,CAAC,IAAI,YAAY,IAAI,CAAC;QAAE,OAAO,MAAM,CAAC;IAE3D,MAAM,MAAM,GAAa,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;IACrC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,SAAS,GAAG,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;QAChC,MAAM,WAAW,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,YAAY,CAAC,CAAC;QAEnD,6DAA6D;QAC7D,MAAM,UAAU,GAAG,WAAW,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;QACjD,MAAM,YAAY,GAAG,UAAU,GAAG,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC;QAElF,MAAM,CAAC,IAAI,CAAC,YAAY,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;IAC1C,CAAC;IAED,OAAO,MAAM,CAAC;AAClB,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,aAAa,CAAC,OAAe,EAAE,QAAgB,EAAE,MAAoB;IACjF,IAAI,CAAC,OAAO,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;QAC9B,OAAO,EAAE,CAAC;IACd,CAAC;IAED,MAAM,WAAW,GAAG,CAAC,MAAM,CAAC,KAAK,EAAE,aAAa,IAAI,qBAAqB,CAAC,GAAG,CAAC,CAAC;IAC/E,MAAM,YAAY,GAAG,CAAC,MAAM,CAAC,KAAK,EAAE,cAAc,IAAI,sB
AAsB,CAAC,GAAG,CAAC,CAAC;IAElF,oBAAoB;IACpB,MAAM,EAAE,KAAK,EAAE,OAAO,EAAE,IAAI,EAAE,GAAG,gBAAgB,CAAC,OAAO,CAAC,CAAC;IAE3D,mBAAmB;IACnB,MAAM,SAAS,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC;IAEjC,IAAI,CAAC,SAAS,CAAC,IAAI,EAAE,EAAE,CAAC;QACpB,OAAO,EAAE,CAAC;IACd,CAAC;IAED,kBAAkB;IAClB,MAAM,KAAK,GAAG,OAAO,IAAI,mBAAmB,CAAC,SAAS,CAAC,IAAI,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,IAAI,QAAQ,CAAC;IAEjG,gCAAgC;IAChC,MAAM,SAAS,GAAG,cAAc,CAAC,SAAS,EAAE,WAAW,CAAC,CAAC;IAEzD,gBAAgB;IAChB,MAAM,gBAAgB,GAAG,YAAY,CAAC,SAAS,EAAE,YAAY,CAAC,CAAC;IAE/D,6EAA6E;IAC7E,MAAM,MAAM,GAAkB,EAAE,CAAC;IACjC,IAAI,UAAU,GAAG,CAAC,CAAC;IAEnB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,gBAAgB,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC/C,MAAM,SAAS,GAAG,gBAAgB,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAC7C,IAAI,CAAC,SAAS;YAAE,SAAS;QAEzB,sEAAsE;QACtE,sDAAsD;QACtD,MAAM,OAAO,GAAG,SAAS,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,IAAI,SAAS,CAAC;QAClD,MAAM,GAAG,GAAG,SAAS,CAAC,OAAO,CAAC,OAAO,EAAE,UAAU,CAAC,CAAC;QACnD,MAAM,WAAW,GAAG,GAAG,IAAI,CAAC;YACxB,CAAC,CAAC,wBAAwB,CAAC,SAAS,EAAE,GAAG,CAAC;YAC1C,CAAC,CAAC,EAAE,CAAC;QACT,IAAI,GAAG,IAAI,CAAC,EAAE,CAAC;YACX,UAAU,GAAG,GAAG,CAAC;QACrB,CAAC;QAED,MAAM,CAAC,IAAI,CAAC;YACR,OAAO,EAAE,SAAS;YAClB,KAAK;YACL,WAAW;YACX,UAAU,EAAE,MAAM,CAAC,MAAM;SAC5B,CAAC,CAAC;IACP,CAAC;IAED,OAAO,MAAM,CAAC;AAClB,CAAC"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import { type ChunkOutput, type SourceConfig } from '../../types.js';
|
|
2
|
+
/**
|
|
3
|
+
* Split plain text into embedding-friendly chunks on paragraph boundaries.
|
|
4
|
+
*
|
|
5
|
+
* @param content - The full file content
|
|
6
|
+
* @param _filePath - Path to the source file (unused, kept for registry signature)
|
|
7
|
+
* @param config - Source configuration with chunk size parameters
|
|
8
|
+
* @returns Array of ChunkOutput objects
|
|
9
|
+
*/
|
|
10
|
+
export declare function chunkRawText(content: string, _filePath: string, config: SourceConfig): ChunkOutput[];
|
|
11
|
+
//# sourceMappingURL=raw-text.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"raw-text.d.ts","sourceRoot":"","sources":["../../../src/indexing/chunking/raw-text.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,KAAK,WAAW,EAAE,KAAK,YAAY,EAAE,MAAM,gBAAgB,CAAC;AAKrE;;;;;;;GAOG;AACH,wBAAgB,YAAY,CAAC,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,EAAE,YAAY,GAAG,WAAW,EAAE,CA0DpG"}
|