@rce-mcp/retrieval-core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +19 -0
- package/dist/.tsbuildinfo +1 -0
- package/dist/chunking.d.ts +50 -0
- package/dist/chunking.js +520 -0
- package/dist/index.d.ts +390 -0
- package/dist/index.js +3417 -0
- package/dist/remote-sync.d.ts +116 -0
- package/dist/remote-sync.js +476 -0
- package/package.json +33 -0
- package/scripts/poc-node-parser-host.cjs +101 -0
- package/scripts/poc-parser-availability-benchmark.ts +290 -0
- package/src/chunking.ts +641 -0
- package/src/index.ts +4338 -0
- package/src/remote-sync.ts +651 -0
- package/test/benchmark.thresholds.test.ts +752 -0
- package/test/chunking.language-aware.test.ts +279 -0
- package/test/chunking.parser-availability.poc.test.ts +60 -0
- package/test/embedding-provider.test.ts +121 -0
- package/test/enhance-confidence.test.ts +357 -0
- package/test/integration.test.ts +324 -0
- package/test/local-sqlite.integration.test.ts +258 -0
- package/test/remote-sync.integration.test.ts +177 -0
- package/tsconfig.build.json +17 -0
- package/tsconfig.json +4 -0
package/src/chunking.ts
ADDED
|
@@ -0,0 +1,641 @@
|
|
|
1
|
+
import Parser from "tree-sitter";
|
|
2
|
+
import Go from "tree-sitter-go";
|
|
3
|
+
import JavaScript from "tree-sitter-javascript";
|
|
4
|
+
import Python from "tree-sitter-python";
|
|
5
|
+
import TypeScript from "tree-sitter-typescript";
|
|
6
|
+
|
|
7
|
+
/** Chunking strategies: syntax-boundary-based ("language_aware") vs token-budget sliding window. */
export type ChunkingStrategy = "language_aware" | "sliding";

/** Why a language-aware chunking request ended up on the sliding path. */
export type ChunkingFallbackReason =
  | "unsupported_language"
  | "parser_unavailable"
  | "parse_error"
  | "parse_timeout_exceeded"
  | "empty_language_boundaries";

/** Tunables controlling how a file is split into chunks. */
export interface ChunkingConfig {
  strategy: ChunkingStrategy;
  // Only "sliding" is accepted as the fallback strategy.
  fallback_strategy: "sliding";
  // Token budget per chunk; chunks grow line by line until this is reached.
  target_chunk_tokens: number;
  // Requested token overlap between adjacent chunks (applied as a line rewind).
  chunk_overlap_tokens: number;
  // Hard cap on the number of chunks emitted for a single file.
  max_chunks_per_file: number;
  // Wall-clock budget for a tree-sitter parse before falling back to sliding.
  parse_timeout_ms: number;
  // Languages eligible for language-aware chunking (common aliases accepted).
  enabled_languages: string[];
}

/** A file to chunk; language may be given explicitly or inferred from the path. */
export interface ChunkingRawFile {
  path: string;
  content: string;
  language?: string;
}

/** One chunk: a 1-based inclusive line span plus the covered text. */
export interface ChunkingOutput {
  start_line: number;
  end_line: number;
  snippet: string;
}

/** Chunking outcome plus strategy/fallback telemetry. */
export interface ChunkingResult {
  chunks: ChunkingOutput[];
  // Strategy actually used; may differ from the configured one on fallback.
  strategy: ChunkingStrategy;
  fallback_reason?: ChunkingFallbackReason;
  // Present only when a tree-sitter parse completed.
  parse_latency_ms?: number;
  language_aware_attempt_latency_ms?: number;
  fallback_path_latency_ms?: number;
  // Canonical language name, when one was resolved.
  language?: string;
}

/** Grammar keys understood by the tree-sitter parser loader. */
export type ParserLanguage = "typescript" | "tsx" | "javascript" | "jsx" | "python" | "go";

type ParserAvailabilityStatus = "available" | "unavailable";

/** Memoized result of attempting to initialize a parser for one language. */
interface ParserAvailability {
  status: ParserAvailabilityStatus;
  // Set when status === "available".
  parser?: Parser;
  // Initialization error message when status === "unavailable".
  error?: string;
}

/** Per-language entry returned by getChunkingParserAvailabilitySnapshot. */
export interface ChunkingParserAvailabilitySnapshotEntry {
  language: string;
  status: ParserAvailabilityStatus;
  error?: string;
}
|
|
62
|
+
|
|
63
|
+
// AST node types treated as chunk boundaries, per grammar. TSX mirrors the
// TypeScript set and JSX mirrors the JavaScript set.
const DEFAULT_BOUNDARY_NODE_TYPES: Record<ParserLanguage, Set<string>> = {
  typescript: new Set([
    "function_declaration",
    "generator_function_declaration",
    "class_declaration",
    "interface_declaration",
    "enum_declaration",
    "type_alias_declaration"
  ]),
  tsx: new Set([
    "function_declaration",
    "generator_function_declaration",
    "class_declaration",
    "interface_declaration",
    "enum_declaration",
    "type_alias_declaration"
  ]),
  javascript: new Set(["function_declaration", "generator_function_declaration", "class_declaration"]),
  jsx: new Set(["function_declaration", "generator_function_declaration", "class_declaration"]),
  python: new Set(["function_definition", "class_definition"]),
  go: new Set(["function_declaration", "method_declaration", "type_declaration"])
};

// Module-level state: memoized parser init results (success or failure),
// init-attempt counters exposed for tests, and test-injected grammar loaders.
const parserAvailabilityCache = new Map<ParserLanguage, ParserAvailability>();
const parserInitAttempts = new Map<ParserLanguage, number>();
const parserLanguageLoaderOverrides = new Map<ParserLanguage, () => Parser.Language>();

// Canonical language name -> parser key. tsx/jsx collapse into these four.
const CANONICAL_TO_PARSER_LANGUAGE: Record<string, ParserLanguage> = {
  typescript: "typescript",
  javascript: "javascript",
  python: "python",
  go: "go"
};
|
|
95
|
+
|
|
96
|
+
function parserLanguageToCanonical(language: ParserLanguage): string {
|
|
97
|
+
if (language === "tsx") {
|
|
98
|
+
return "typescript";
|
|
99
|
+
}
|
|
100
|
+
if (language === "jsx") {
|
|
101
|
+
return "javascript";
|
|
102
|
+
}
|
|
103
|
+
return language;
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
function normalizeLanguageAlias(language: string): ParserLanguage | undefined {
|
|
107
|
+
const normalized = language.trim().toLowerCase();
|
|
108
|
+
if (normalized === "typescript" || normalized === "ts") {
|
|
109
|
+
return "typescript";
|
|
110
|
+
}
|
|
111
|
+
if (normalized === "tsx") {
|
|
112
|
+
return "tsx";
|
|
113
|
+
}
|
|
114
|
+
if (normalized === "javascript" || normalized === "js") {
|
|
115
|
+
return "javascript";
|
|
116
|
+
}
|
|
117
|
+
if (normalized === "jsx") {
|
|
118
|
+
return "jsx";
|
|
119
|
+
}
|
|
120
|
+
if (normalized === "python" || normalized === "py") {
|
|
121
|
+
return "python";
|
|
122
|
+
}
|
|
123
|
+
if (normalized === "go" || normalized === "golang") {
|
|
124
|
+
return "go";
|
|
125
|
+
}
|
|
126
|
+
return undefined;
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
function parserLanguageFromPath(path: string): ParserLanguage | undefined {
|
|
130
|
+
const normalized = path.toLowerCase();
|
|
131
|
+
if (normalized.endsWith(".tsx")) {
|
|
132
|
+
return "tsx";
|
|
133
|
+
}
|
|
134
|
+
if (normalized.endsWith(".ts")) {
|
|
135
|
+
return "typescript";
|
|
136
|
+
}
|
|
137
|
+
if (normalized.endsWith(".jsx")) {
|
|
138
|
+
return "jsx";
|
|
139
|
+
}
|
|
140
|
+
if (normalized.endsWith(".js") || normalized.endsWith(".mjs") || normalized.endsWith(".cjs")) {
|
|
141
|
+
return "javascript";
|
|
142
|
+
}
|
|
143
|
+
if (normalized.endsWith(".py")) {
|
|
144
|
+
return "python";
|
|
145
|
+
}
|
|
146
|
+
if (normalized.endsWith(".go")) {
|
|
147
|
+
return "go";
|
|
148
|
+
}
|
|
149
|
+
return undefined;
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
function resolveParserLanguage(file: ChunkingRawFile): ParserLanguage | undefined {
|
|
153
|
+
const explicit = file.language ? normalizeLanguageAlias(file.language) : undefined;
|
|
154
|
+
if (explicit) {
|
|
155
|
+
return explicit;
|
|
156
|
+
}
|
|
157
|
+
return parserLanguageFromPath(file.path);
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
function formatErrorMessage(error: unknown): string {
|
|
161
|
+
if (error instanceof Error && error.message) {
|
|
162
|
+
return error.message;
|
|
163
|
+
}
|
|
164
|
+
return String(error);
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
function resolveTreeSitterLanguageHandle(moduleValue: unknown): Parser.Language {
|
|
168
|
+
if (moduleValue && typeof moduleValue === "object" && "language" in moduleValue) {
|
|
169
|
+
return (moduleValue as { language: unknown }).language as Parser.Language;
|
|
170
|
+
}
|
|
171
|
+
return moduleValue as Parser.Language;
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
function loadParserLanguage(language: ParserLanguage): Parser.Language {
|
|
175
|
+
const overrideLoader = parserLanguageLoaderOverrides.get(language);
|
|
176
|
+
if (overrideLoader) {
|
|
177
|
+
return overrideLoader();
|
|
178
|
+
}
|
|
179
|
+
if (language === "typescript") {
|
|
180
|
+
return (TypeScript as unknown as { typescript: unknown }).typescript as Parser.Language;
|
|
181
|
+
}
|
|
182
|
+
if (language === "tsx") {
|
|
183
|
+
return (TypeScript as unknown as { tsx: unknown }).tsx as Parser.Language;
|
|
184
|
+
}
|
|
185
|
+
if (language === "javascript" || language === "jsx") {
|
|
186
|
+
return resolveTreeSitterLanguageHandle(JavaScript);
|
|
187
|
+
}
|
|
188
|
+
if (language === "python") {
|
|
189
|
+
return resolveTreeSitterLanguageHandle(Python);
|
|
190
|
+
}
|
|
191
|
+
return resolveTreeSitterLanguageHandle(Go);
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
function getParserAvailability(language: ParserLanguage): ParserAvailability {
|
|
195
|
+
const cached = parserAvailabilityCache.get(language);
|
|
196
|
+
if (cached) {
|
|
197
|
+
return cached;
|
|
198
|
+
}
|
|
199
|
+
parserInitAttempts.set(language, (parserInitAttempts.get(language) ?? 0) + 1);
|
|
200
|
+
try {
|
|
201
|
+
const parser = new Parser();
|
|
202
|
+
parser.setLanguage(loadParserLanguage(language));
|
|
203
|
+
const availability: ParserAvailability = {
|
|
204
|
+
status: "available",
|
|
205
|
+
parser
|
|
206
|
+
};
|
|
207
|
+
parserAvailabilityCache.set(language, availability);
|
|
208
|
+
return availability;
|
|
209
|
+
} catch (error) {
|
|
210
|
+
const availability: ParserAvailability = {
|
|
211
|
+
status: "unavailable",
|
|
212
|
+
error: formatErrorMessage(error)
|
|
213
|
+
};
|
|
214
|
+
parserAvailabilityCache.set(language, availability);
|
|
215
|
+
return availability;
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
function getParser(language: ParserLanguage): Parser | undefined {
|
|
220
|
+
const availability = getParserAvailability(language);
|
|
221
|
+
if (availability.status !== "available") {
|
|
222
|
+
return undefined;
|
|
223
|
+
}
|
|
224
|
+
return availability.parser;
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
function toChunkingParserLanguage(canonicalLanguage: string): ParserLanguage | undefined {
|
|
228
|
+
return CANONICAL_TO_PARSER_LANGUAGE[canonicalLanguage];
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
export function getChunkingParserAvailabilitySnapshot(input?: {
|
|
232
|
+
enabled_languages?: string[];
|
|
233
|
+
}): ChunkingParserAvailabilitySnapshotEntry[] {
|
|
234
|
+
const canonicalLanguages = new Set<string>();
|
|
235
|
+
if (input?.enabled_languages && input.enabled_languages.length > 0) {
|
|
236
|
+
for (const language of input.enabled_languages) {
|
|
237
|
+
const parsed = normalizeLanguageAlias(language);
|
|
238
|
+
if (!parsed) {
|
|
239
|
+
continue;
|
|
240
|
+
}
|
|
241
|
+
const canonical = parserLanguageToCanonical(parsed);
|
|
242
|
+
if (toChunkingParserLanguage(canonical)) {
|
|
243
|
+
canonicalLanguages.add(canonical);
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
}
|
|
247
|
+
if (canonicalLanguages.size === 0) {
|
|
248
|
+
for (const canonical of Object.keys(CANONICAL_TO_PARSER_LANGUAGE)) {
|
|
249
|
+
canonicalLanguages.add(canonical);
|
|
250
|
+
}
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
return [...canonicalLanguages]
|
|
254
|
+
.sort((a, b) => a.localeCompare(b))
|
|
255
|
+
.map((canonicalLanguage) => {
|
|
256
|
+
const parserLanguage = toChunkingParserLanguage(canonicalLanguage);
|
|
257
|
+
if (!parserLanguage) {
|
|
258
|
+
return {
|
|
259
|
+
language: canonicalLanguage,
|
|
260
|
+
status: "unavailable" as const,
|
|
261
|
+
error: "no parser mapping for language"
|
|
262
|
+
};
|
|
263
|
+
}
|
|
264
|
+
const availability = getParserAvailability(parserLanguage);
|
|
265
|
+
return {
|
|
266
|
+
language: canonicalLanguage,
|
|
267
|
+
status: availability.status,
|
|
268
|
+
...(availability.error ? { error: availability.error } : {})
|
|
269
|
+
};
|
|
270
|
+
});
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
export function __resetChunkingParserStateForTests(): void {
|
|
274
|
+
parserAvailabilityCache.clear();
|
|
275
|
+
parserInitAttempts.clear();
|
|
276
|
+
parserLanguageLoaderOverrides.clear();
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
export function __setChunkingParserLanguageLoaderForTests(
|
|
280
|
+
language: ParserLanguage,
|
|
281
|
+
loader: (() => Parser.Language) | undefined
|
|
282
|
+
): void {
|
|
283
|
+
if (loader) {
|
|
284
|
+
parserLanguageLoaderOverrides.set(language, loader);
|
|
285
|
+
} else {
|
|
286
|
+
parserLanguageLoaderOverrides.delete(language);
|
|
287
|
+
}
|
|
288
|
+
parserAvailabilityCache.delete(language);
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
export function __getChunkingParserInitAttemptsForTests(): Record<string, number> {
|
|
292
|
+
return Object.fromEntries(parserInitAttempts.entries());
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
function toInclusiveEndRow(node: Parser.SyntaxNode): number {
|
|
296
|
+
const end = node.endPosition;
|
|
297
|
+
if (end.column === 0 && end.row > node.startPosition.row) {
|
|
298
|
+
return end.row - 1;
|
|
299
|
+
}
|
|
300
|
+
return end.row;
|
|
301
|
+
}
|
|
302
|
+
|
|
303
|
+
function trimLineRange(lines: string[], startRow: number, endRow: number): { start: number; end: number } | undefined {
|
|
304
|
+
let start = startRow;
|
|
305
|
+
let end = endRow;
|
|
306
|
+
while (start <= end && (lines[start] ?? "").trim().length === 0) {
|
|
307
|
+
start += 1;
|
|
308
|
+
}
|
|
309
|
+
while (end >= start && (lines[end] ?? "").trim().length === 0) {
|
|
310
|
+
end -= 1;
|
|
311
|
+
}
|
|
312
|
+
if (end < start) {
|
|
313
|
+
return undefined;
|
|
314
|
+
}
|
|
315
|
+
return { start, end };
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
function splitRangeWithBudget(input: {
|
|
319
|
+
lines: string[];
|
|
320
|
+
startRow: number;
|
|
321
|
+
endRow: number;
|
|
322
|
+
tokenize: (text: string) => string[];
|
|
323
|
+
targetChunkTokens: number;
|
|
324
|
+
overlapTokens: number;
|
|
325
|
+
maxChunks: number;
|
|
326
|
+
}): Array<{ startRow: number; endRow: number }> {
|
|
327
|
+
const segments: Array<{ startRow: number; endRow: number }> = [];
|
|
328
|
+
let start = input.startRow;
|
|
329
|
+
|
|
330
|
+
while (start <= input.endRow && segments.length < input.maxChunks) {
|
|
331
|
+
let tokens = 0;
|
|
332
|
+
let end = start;
|
|
333
|
+
while (end <= input.endRow) {
|
|
334
|
+
tokens += input.tokenize(input.lines[end] ?? "").length;
|
|
335
|
+
if (tokens >= input.targetChunkTokens) {
|
|
336
|
+
break;
|
|
337
|
+
}
|
|
338
|
+
end += 1;
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
const safeEnd = Math.min(end, input.endRow);
|
|
342
|
+
if (safeEnd >= start) {
|
|
343
|
+
segments.push({ startRow: start, endRow: safeEnd });
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
if (safeEnd >= input.endRow) {
|
|
347
|
+
break;
|
|
348
|
+
}
|
|
349
|
+
const rewind = Math.max(1, Math.floor(input.overlapTokens / 4));
|
|
350
|
+
start = Math.max(start + 1, safeEnd - rewind + 1);
|
|
351
|
+
}
|
|
352
|
+
|
|
353
|
+
return segments;
|
|
354
|
+
}
|
|
355
|
+
|
|
356
|
+
function buildSlidingChunks(input: {
|
|
357
|
+
lines: string[];
|
|
358
|
+
tokenize: (text: string) => string[];
|
|
359
|
+
targetChunkTokens: number;
|
|
360
|
+
overlapTokens: number;
|
|
361
|
+
maxChunks: number;
|
|
362
|
+
}): ChunkingOutput[] {
|
|
363
|
+
const rawSegments = splitRangeWithBudget({
|
|
364
|
+
lines: input.lines,
|
|
365
|
+
startRow: 0,
|
|
366
|
+
endRow: Math.max(0, input.lines.length - 1),
|
|
367
|
+
tokenize: input.tokenize,
|
|
368
|
+
targetChunkTokens: input.targetChunkTokens,
|
|
369
|
+
overlapTokens: input.overlapTokens,
|
|
370
|
+
maxChunks: input.maxChunks
|
|
371
|
+
});
|
|
372
|
+
const chunks: ChunkingOutput[] = [];
|
|
373
|
+
for (const segment of rawSegments) {
|
|
374
|
+
const trimmed = trimLineRange(input.lines, segment.startRow, segment.endRow);
|
|
375
|
+
if (!trimmed) {
|
|
376
|
+
continue;
|
|
377
|
+
}
|
|
378
|
+
chunks.push({
|
|
379
|
+
start_line: trimmed.start + 1,
|
|
380
|
+
end_line: trimmed.end + 1,
|
|
381
|
+
snippet: input.lines.slice(trimmed.start, trimmed.end + 1).join("\n")
|
|
382
|
+
});
|
|
383
|
+
}
|
|
384
|
+
return chunks;
|
|
385
|
+
}
|
|
386
|
+
|
|
387
|
+
function hasBoundaryAncestor(node: Parser.SyntaxNode, boundaryTypes: Set<string>): boolean {
|
|
388
|
+
let current = node.parent;
|
|
389
|
+
while (current) {
|
|
390
|
+
if (boundaryTypes.has(current.type)) {
|
|
391
|
+
return true;
|
|
392
|
+
}
|
|
393
|
+
current = current.parent;
|
|
394
|
+
}
|
|
395
|
+
return false;
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
function buildLanguageAwareChunks(input: {
|
|
399
|
+
file: ChunkingRawFile;
|
|
400
|
+
lines: string[];
|
|
401
|
+
parserLanguage: ParserLanguage;
|
|
402
|
+
config: ChunkingConfig;
|
|
403
|
+
tokenize: (text: string) => string[];
|
|
404
|
+
}): ChunkingResult {
|
|
405
|
+
const languageAwareAttemptStart = Date.now();
|
|
406
|
+
const parser = getParser(input.parserLanguage);
|
|
407
|
+
if (!parser) {
|
|
408
|
+
const fallbackStart = Date.now();
|
|
409
|
+
const chunks = buildSlidingChunks({
|
|
410
|
+
lines: input.lines,
|
|
411
|
+
tokenize: input.tokenize,
|
|
412
|
+
targetChunkTokens: input.config.target_chunk_tokens,
|
|
413
|
+
overlapTokens: input.config.chunk_overlap_tokens,
|
|
414
|
+
maxChunks: input.config.max_chunks_per_file
|
|
415
|
+
});
|
|
416
|
+
return {
|
|
417
|
+
chunks,
|
|
418
|
+
strategy: "sliding",
|
|
419
|
+
fallback_reason: "parser_unavailable",
|
|
420
|
+
language_aware_attempt_latency_ms: Date.now() - languageAwareAttemptStart,
|
|
421
|
+
fallback_path_latency_ms: Date.now() - fallbackStart,
|
|
422
|
+
language: parserLanguageToCanonical(input.parserLanguage)
|
|
423
|
+
};
|
|
424
|
+
}
|
|
425
|
+
|
|
426
|
+
try {
|
|
427
|
+
parser.setTimeoutMicros(input.config.parse_timeout_ms * 1_000);
|
|
428
|
+
const parseStart = Date.now();
|
|
429
|
+
const tree = parser.parse(input.file.content);
|
|
430
|
+
const parseLatencyMs = Date.now() - parseStart;
|
|
431
|
+
if (parseLatencyMs > input.config.parse_timeout_ms) {
|
|
432
|
+
const fallbackStart = Date.now();
|
|
433
|
+
const chunks = buildSlidingChunks({
|
|
434
|
+
lines: input.lines,
|
|
435
|
+
tokenize: input.tokenize,
|
|
436
|
+
targetChunkTokens: input.config.target_chunk_tokens,
|
|
437
|
+
overlapTokens: input.config.chunk_overlap_tokens,
|
|
438
|
+
maxChunks: input.config.max_chunks_per_file
|
|
439
|
+
});
|
|
440
|
+
return {
|
|
441
|
+
chunks,
|
|
442
|
+
strategy: "sliding",
|
|
443
|
+
fallback_reason: "parse_timeout_exceeded",
|
|
444
|
+
parse_latency_ms: parseLatencyMs,
|
|
445
|
+
language_aware_attempt_latency_ms: Date.now() - languageAwareAttemptStart,
|
|
446
|
+
fallback_path_latency_ms: Date.now() - fallbackStart,
|
|
447
|
+
language: parserLanguageToCanonical(input.parserLanguage)
|
|
448
|
+
};
|
|
449
|
+
}
|
|
450
|
+
|
|
451
|
+
const root = (tree as Parser.Tree | null)?.rootNode;
|
|
452
|
+
if (!root) {
|
|
453
|
+
const fallbackStart = Date.now();
|
|
454
|
+
const chunks = buildSlidingChunks({
|
|
455
|
+
lines: input.lines,
|
|
456
|
+
tokenize: input.tokenize,
|
|
457
|
+
targetChunkTokens: input.config.target_chunk_tokens,
|
|
458
|
+
overlapTokens: input.config.chunk_overlap_tokens,
|
|
459
|
+
maxChunks: input.config.max_chunks_per_file
|
|
460
|
+
});
|
|
461
|
+
return {
|
|
462
|
+
chunks,
|
|
463
|
+
strategy: "sliding",
|
|
464
|
+
fallback_reason: "parse_error",
|
|
465
|
+
parse_latency_ms: parseLatencyMs,
|
|
466
|
+
language_aware_attempt_latency_ms: Date.now() - languageAwareAttemptStart,
|
|
467
|
+
fallback_path_latency_ms: Date.now() - fallbackStart,
|
|
468
|
+
language: parserLanguageToCanonical(input.parserLanguage)
|
|
469
|
+
};
|
|
470
|
+
}
|
|
471
|
+
|
|
472
|
+
const boundaryTypes = DEFAULT_BOUNDARY_NODE_TYPES[input.parserLanguage];
|
|
473
|
+
const candidates = root.descendantsOfType([...boundaryTypes]);
|
|
474
|
+
const boundaryNodes = candidates
|
|
475
|
+
.filter((node) => !hasBoundaryAncestor(node, boundaryTypes))
|
|
476
|
+
.sort((a, b) => a.startPosition.row - b.startPosition.row || a.startPosition.column - b.startPosition.column);
|
|
477
|
+
|
|
478
|
+
if (boundaryNodes.length === 0) {
|
|
479
|
+
const fallbackStart = Date.now();
|
|
480
|
+
const chunks = buildSlidingChunks({
|
|
481
|
+
lines: input.lines,
|
|
482
|
+
tokenize: input.tokenize,
|
|
483
|
+
targetChunkTokens: input.config.target_chunk_tokens,
|
|
484
|
+
overlapTokens: input.config.chunk_overlap_tokens,
|
|
485
|
+
maxChunks: input.config.max_chunks_per_file
|
|
486
|
+
});
|
|
487
|
+
return {
|
|
488
|
+
chunks,
|
|
489
|
+
strategy: "sliding",
|
|
490
|
+
fallback_reason: "empty_language_boundaries",
|
|
491
|
+
parse_latency_ms: parseLatencyMs,
|
|
492
|
+
language_aware_attempt_latency_ms: Date.now() - languageAwareAttemptStart,
|
|
493
|
+
fallback_path_latency_ms: Date.now() - fallbackStart,
|
|
494
|
+
language: parserLanguageToCanonical(input.parserLanguage)
|
|
495
|
+
};
|
|
496
|
+
}
|
|
497
|
+
|
|
498
|
+
const segments: Array<{ startRow: number; endRow: number }> = [];
|
|
499
|
+
let cursor = 0;
|
|
500
|
+
const lastRow = Math.max(0, input.lines.length - 1);
|
|
501
|
+
for (const node of boundaryNodes) {
|
|
502
|
+
const startRow = Math.max(0, Math.min(lastRow, node.startPosition.row));
|
|
503
|
+
const endRow = Math.max(startRow, Math.min(lastRow, toInclusiveEndRow(node)));
|
|
504
|
+
if (startRow > cursor) {
|
|
505
|
+
segments.push({ startRow: cursor, endRow: startRow - 1 });
|
|
506
|
+
}
|
|
507
|
+
segments.push({ startRow, endRow });
|
|
508
|
+
cursor = endRow + 1;
|
|
509
|
+
if (cursor > lastRow) {
|
|
510
|
+
break;
|
|
511
|
+
}
|
|
512
|
+
}
|
|
513
|
+
if (cursor <= lastRow) {
|
|
514
|
+
segments.push({ startRow: cursor, endRow: lastRow });
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
const chunks: ChunkingOutput[] = [];
|
|
518
|
+
for (const segment of segments) {
|
|
519
|
+
if (segment.endRow < segment.startRow || chunks.length >= input.config.max_chunks_per_file) {
|
|
520
|
+
continue;
|
|
521
|
+
}
|
|
522
|
+
const pieces = splitRangeWithBudget({
|
|
523
|
+
lines: input.lines,
|
|
524
|
+
startRow: segment.startRow,
|
|
525
|
+
endRow: segment.endRow,
|
|
526
|
+
tokenize: input.tokenize,
|
|
527
|
+
targetChunkTokens: input.config.target_chunk_tokens,
|
|
528
|
+
overlapTokens: input.config.chunk_overlap_tokens,
|
|
529
|
+
maxChunks: input.config.max_chunks_per_file - chunks.length
|
|
530
|
+
});
|
|
531
|
+
for (const piece of pieces) {
|
|
532
|
+
const trimmed = trimLineRange(input.lines, piece.startRow, piece.endRow);
|
|
533
|
+
if (!trimmed) {
|
|
534
|
+
continue;
|
|
535
|
+
}
|
|
536
|
+
chunks.push({
|
|
537
|
+
start_line: trimmed.start + 1,
|
|
538
|
+
end_line: trimmed.end + 1,
|
|
539
|
+
snippet: input.lines.slice(trimmed.start, trimmed.end + 1).join("\n")
|
|
540
|
+
});
|
|
541
|
+
if (chunks.length >= input.config.max_chunks_per_file) {
|
|
542
|
+
break;
|
|
543
|
+
}
|
|
544
|
+
}
|
|
545
|
+
}
|
|
546
|
+
|
|
547
|
+
if (chunks.length === 0) {
|
|
548
|
+
const fallbackStart = Date.now();
|
|
549
|
+
const slidingChunks = buildSlidingChunks({
|
|
550
|
+
lines: input.lines,
|
|
551
|
+
tokenize: input.tokenize,
|
|
552
|
+
targetChunkTokens: input.config.target_chunk_tokens,
|
|
553
|
+
overlapTokens: input.config.chunk_overlap_tokens,
|
|
554
|
+
maxChunks: input.config.max_chunks_per_file
|
|
555
|
+
});
|
|
556
|
+
return {
|
|
557
|
+
chunks: slidingChunks,
|
|
558
|
+
strategy: "sliding",
|
|
559
|
+
fallback_reason: "empty_language_boundaries",
|
|
560
|
+
parse_latency_ms: parseLatencyMs,
|
|
561
|
+
language_aware_attempt_latency_ms: Date.now() - languageAwareAttemptStart,
|
|
562
|
+
fallback_path_latency_ms: Date.now() - fallbackStart,
|
|
563
|
+
language: parserLanguageToCanonical(input.parserLanguage)
|
|
564
|
+
};
|
|
565
|
+
}
|
|
566
|
+
|
|
567
|
+
return {
|
|
568
|
+
chunks,
|
|
569
|
+
strategy: "language_aware",
|
|
570
|
+
parse_latency_ms: parseLatencyMs,
|
|
571
|
+
language_aware_attempt_latency_ms: Date.now() - languageAwareAttemptStart,
|
|
572
|
+
language: parserLanguageToCanonical(input.parserLanguage)
|
|
573
|
+
};
|
|
574
|
+
} catch {
|
|
575
|
+
const fallbackStart = Date.now();
|
|
576
|
+
const chunks = buildSlidingChunks({
|
|
577
|
+
lines: input.lines,
|
|
578
|
+
tokenize: input.tokenize,
|
|
579
|
+
targetChunkTokens: input.config.target_chunk_tokens,
|
|
580
|
+
overlapTokens: input.config.chunk_overlap_tokens,
|
|
581
|
+
maxChunks: input.config.max_chunks_per_file
|
|
582
|
+
});
|
|
583
|
+
return {
|
|
584
|
+
chunks,
|
|
585
|
+
strategy: "sliding",
|
|
586
|
+
fallback_reason: "parse_error",
|
|
587
|
+
language_aware_attempt_latency_ms: Date.now() - languageAwareAttemptStart,
|
|
588
|
+
fallback_path_latency_ms: Date.now() - fallbackStart,
|
|
589
|
+
language: parserLanguageToCanonical(input.parserLanguage)
|
|
590
|
+
};
|
|
591
|
+
}
|
|
592
|
+
}
|
|
593
|
+
|
|
594
|
+
export function buildChunksForFile(input: {
|
|
595
|
+
file: ChunkingRawFile;
|
|
596
|
+
config: ChunkingConfig;
|
|
597
|
+
tokenize: (text: string) => string[];
|
|
598
|
+
}): ChunkingResult {
|
|
599
|
+
const lines = input.file.content.split("\n");
|
|
600
|
+
const language = resolveParserLanguage(input.file);
|
|
601
|
+
const enabledLanguageSet = new Set(input.config.enabled_languages.map((value) => value.trim().toLowerCase()));
|
|
602
|
+
|
|
603
|
+
if (input.config.strategy === "sliding") {
|
|
604
|
+
return {
|
|
605
|
+
chunks: buildSlidingChunks({
|
|
606
|
+
lines,
|
|
607
|
+
tokenize: input.tokenize,
|
|
608
|
+
targetChunkTokens: input.config.target_chunk_tokens,
|
|
609
|
+
overlapTokens: input.config.chunk_overlap_tokens,
|
|
610
|
+
maxChunks: input.config.max_chunks_per_file
|
|
611
|
+
}),
|
|
612
|
+
strategy: "sliding",
|
|
613
|
+
language: language ? parserLanguageToCanonical(language) : undefined
|
|
614
|
+
};
|
|
615
|
+
}
|
|
616
|
+
|
|
617
|
+
if (!language || !enabledLanguageSet.has(parserLanguageToCanonical(language))) {
|
|
618
|
+
const fallbackStart = Date.now();
|
|
619
|
+
const chunks = buildSlidingChunks({
|
|
620
|
+
lines,
|
|
621
|
+
tokenize: input.tokenize,
|
|
622
|
+
targetChunkTokens: input.config.target_chunk_tokens,
|
|
623
|
+
overlapTokens: input.config.chunk_overlap_tokens,
|
|
624
|
+
maxChunks: input.config.max_chunks_per_file
|
|
625
|
+
});
|
|
626
|
+
return {
|
|
627
|
+
chunks,
|
|
628
|
+
strategy: "sliding",
|
|
629
|
+
fallback_reason: "unsupported_language",
|
|
630
|
+
fallback_path_latency_ms: Date.now() - fallbackStart
|
|
631
|
+
};
|
|
632
|
+
}
|
|
633
|
+
|
|
634
|
+
return buildLanguageAwareChunks({
|
|
635
|
+
file: input.file,
|
|
636
|
+
lines,
|
|
637
|
+
parserLanguage: language,
|
|
638
|
+
config: input.config,
|
|
639
|
+
tokenize: input.tokenize
|
|
640
|
+
});
|
|
641
|
+
}
|