@cue-dev/retrieval-core 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +27 -0
- package/dist/.tsbuildinfo +1 -0
- package/dist/chunking.d.ts +64 -0
- package/dist/chunking.js +983 -0
- package/dist/index.d.ts +673 -0
- package/dist/index.js +6605 -0
- package/dist/indexing-ignore.d.ts +9 -0
- package/dist/indexing-ignore.js +151 -0
- package/dist/remote-sync.d.ts +193 -0
- package/dist/remote-sync.js +816 -0
- package/package.json +37 -0
- package/scripts/poc-node-parser-host.cjs +105 -0
- package/scripts/poc-parser-availability-benchmark.ts +338 -0
- package/src/chunking.ts +1187 -0
- package/src/index.ts +8338 -0
- package/src/indexing-ignore.ts +179 -0
- package/src/remote-sync.ts +1119 -0
- package/test/benchmark.thresholds.test.ts +815 -0
- package/test/chunking.config.test.ts +84 -0
- package/test/chunking.language-aware.test.ts +1248 -0
- package/test/chunking.parser-availability.poc.test.ts +86 -0
- package/test/claude-agent-provider.test.ts +209 -0
- package/test/embedding-context-prefix.test.ts +101 -0
- package/test/embedding-provider.test.ts +570 -0
- package/test/enhance-confidence.test.ts +752 -0
- package/test/index-prep.concurrency.regression.test.ts +142 -0
- package/test/integration.test.ts +508 -0
- package/test/local-sqlite.integration.test.ts +258 -0
- package/test/mcp-search-quality.regression.test.ts +1358 -0
- package/test/remote-sync.integration.test.ts +350 -0
- package/test/smart-cutoff.config.test.ts +86 -0
- package/test/snippet-integrity.config.test.ts +59 -0
- package/tsconfig.build.json +17 -0
- package/tsconfig.json +4 -0
@@ -0,0 +1,1248 @@
import Parser from "tree-sitter";
import { describe, expect, it, vi } from "vitest";
import { InMemoryQueryCache } from "@cue-dev/data-plane";
import { getObservability } from "@cue-dev/observability";
import { InMemoryIndexStore, RetrievalCore } from "../src/index.js";
import {
  __isChunkingBoundaryCandidateForTests,
  __getChunkingParserInitAttemptsForTests,
  __resetChunkingParserStateForTests,
  __setChunkingParserLanguageLoaderForTests,
  buildChunksForFile,
  getChunkingParserAvailabilitySnapshot
} from "../src/chunking.js";

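/**
 * Indexes a single file through RetrievalCore, asserts the index build reaches
 * "ready", and returns the persisted chunks (path, line range, snippet) from
 * the store. Shared by the end-to-end chunking tests below.
 */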
async function indexAndListChunks(input: {
  core: RetrievalCore;
  store: InMemoryIndexStore;
  tenant_id: string;
  workspace_id: string;
  index_version: string;
  file: {
    path: string;
    language?: string;
    content: string;
  };
}): Promise<
  Array<{
    path: string;
    start_line: number;
    end_line: number;
    snippet: string;
  }>
> {
  await input.store.upsertWorkspace({
    workspace_id: input.workspace_id,
    tenant_id: input.tenant_id,
    name: "chunking-test",
    project_root_path: "/workspace/chunking-test"
  });

  const report = await input.core.indexArtifact({
    tenant_id: input.tenant_id,
    workspace_id: input.workspace_id,
    index_version: input.index_version,
    files: [input.file]
  });
  expect(report.status).toBe("ready");

  const index = await input.store.getLatestReadyIndex({
    tenant_id: input.tenant_id,
    workspace_id: input.workspace_id
  });
  expect(index).toBeDefined();

  const chunks = await input.store.listChunksByIndex({
    tenant_id: input.tenant_id,
    index_id: index!.index_id
  });
  return chunks.map((chunk) => ({
    path: chunk.path,
    start_line: chunk.start_line,
    end_line: chunk.end_line,
    snippet: chunk.snippet
  }));
}

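/**
 * Fabricates a minimal tree-sitter Tree for tests that must not depend on the
 * native Rust/Java grammars being installed. It scans the raw source for a
 * "pub fn" line and a "class " line, synthesizes one function_item and one
 * class_declaration node with line-based positions, and exposes only the
 * rootNode.descendantsOfType surface that the chunker consumes.
 */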
function mockBoundaryTreeFromSource(source: string): Parser.Tree {
  const lines = source.split("\n");
  const nodes: Parser.SyntaxNode[] = [];

  const rustFunctionStart = lines.findIndex((line) => line.includes("pub fn"));
  if (rustFunctionStart >= 0) {
    let end = rustFunctionStart;
    while (end < lines.length - 1 && lines[end]?.trim() !== "}") {
      end += 1;
    }
    nodes.push({
      type: "function_item",
      startPosition: { row: rustFunctionStart, column: 0 },
      endPosition: { row: end + 1, column: 0 },
      parent: null
    } as unknown as Parser.SyntaxNode);
  }

  const javaClassStart = lines.findIndex((line) => line.includes("class "));
  if (javaClassStart >= 0) {
    let end = lines.length - 1;
    for (let row = lines.length - 1; row >= javaClassStart; row -= 1) {
      if ((lines[row] ?? "").trim() === "}") {
        end = row;
        break;
      }
    }
    nodes.push({
      type: "class_declaration",
      startPosition: { row: javaClassStart, column: 0 },
      endPosition: { row: end + 1, column: 0 },
      parent: null
    } as unknown as Parser.SyntaxNode);
  }

  return {
    rootNode: {
      descendantsOfType(types: string[]): Parser.SyntaxNode[] {
        return nodes.filter((node) => types.includes(node.type));
      }
    }
  } as unknown as Parser.Tree;
}

describe("retrieval-core chunking", () => {
  it("uses language-aware chunking for supported languages when configured", async () => {
    const store = new InMemoryIndexStore();
    const observability = getObservability(`retrieval-core-language-aware-${Date.now()}`);
    const core = new RetrievalCore(store, new InMemoryQueryCache(), {
      observability,
      chunkingConfig: {
        strategy: "language_aware",
        parse_timeout_ms: 500
      }
    });

    const chunks = await indexAndListChunks({
      core,
      store,
      tenant_id: "tenant-a",
      workspace_id: "ws-a",
      index_version: "idx-a1",
      file: {
        path: "src/feature.ts",
        language: "typescript",
        content: [
          "import { dep } from './dep';",
          "",
          "export function alpha(input: number) {",
          "  const value = dep(input);",
          "  return value + 1;",
          "}",
          "",
          "export class Greeter {",
          "  greet(name: string) {",
          "    return `hello ${name}`;",
          "  }",
          "}",
          "",
          "export function beta(input: number) {",
          "  const value = dep(input);",
          "  return value * 2;",
          "}"
        ].join("\n")
      }
    });

    expect(chunks.some((chunk) => chunk.snippet.includes("function alpha"))).toBe(true);
    expect(chunks.some((chunk) => chunk.snippet.includes("class Greeter"))).toBe(true);
    expect(chunks.some((chunk) => chunk.snippet.includes("function beta"))).toBe(true);
    expect(chunks.every((chunk) => chunk.start_line >= 1 && chunk.end_line >= chunk.start_line)).toBe(true);

    const strategyCounters = observability.metrics.readCounter("index_chunking_strategy_total");
    expect(strategyCounters.some((counter) => counter.labels.strategy === "language_aware" && counter.labels.reason === "none")).toBe(
      true
    );
  });

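  // Markdown is outside the language-aware set, so indexing should silently
  // degrade to sliding-window chunks and record reason=unsupported_language.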
  it("falls back to sliding chunks for unsupported languages", async () => {
    const store = new InMemoryIndexStore();
    const observability = getObservability(`retrieval-core-unsupported-${Date.now()}`);
    const core = new RetrievalCore(store, new InMemoryQueryCache(), {
      observability,
      chunkingConfig: {
        strategy: "language_aware"
      }
    });

    const chunks = await indexAndListChunks({
      core,
      store,
      tenant_id: "tenant-b",
      workspace_id: "ws-b",
      index_version: "idx-b1",
      file: {
        path: "docs/readme.md",
        language: "markdown",
        content: Array.from({ length: 220 }, (_, idx) => `line ${idx + 1}: retrieval docs`).join("\n")
      }
    });

    expect(chunks.length).toBeGreaterThan(0);
    const fallbackCounters = observability.metrics.readCounter("index_chunking_fallback_total");
    expect(
      fallbackCounters.some(
        (counter) => counter.labels.reason === "unsupported_language" && counter.labels.language === "markdown"
      )
    ).toBe(true);
  });

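  // The Date.now spy advances the fake clock by 3ms on every call, so a
  // parse_timeout_ms of 1 is always exceeded on the first latency check.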
  it("falls back with parse_timeout_exceeded when parse latency exceeds the configured timeout", () => {
    __resetChunkingParserStateForTests();
    const nowSpy = vi.spyOn(Date, "now");
    let tick = 0;
    nowSpy.mockImplementation(() => {
      tick += 3;
      return tick;
    });
    try {
      const config = {
        strategy: "language_aware" as const,
        fallback_strategy: "sliding" as const,
        target_chunk_tokens: 220,
        chunk_overlap_tokens: 40,
        budget_tokenizer: "ranking" as const,
        boundary_strictness: "legacy" as const,
        max_chunks_per_file: 300,
        parse_timeout_ms: 1,
        enabled_languages: ["typescript"]
      };
      const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
      const file = {
        path: "src/timeout.ts",
        language: "typescript",
        content: ["export function alpha(input: number) {", "  return input + 1;", "}"].join("\n")
      };

      const result = buildChunksForFile({ file, config, tokenize });
      expect(result.strategy).toBe("sliding");
      expect(result.fallback_reason).toBe("parse_timeout_exceeded");
      expect(result.language).toBe("typescript");
    } finally {
      nowSpy.mockRestore();
      __resetChunkingParserStateForTests();
    }
  });

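  // Top-level const statements expose no declaration boundaries the chunker
  // accepts, which should surface as the empty_language_boundaries fallback.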
  it("falls back with empty_language_boundaries when parser returns no eligible declaration boundaries", () => {
    __resetChunkingParserStateForTests();
    try {
      const config = {
        strategy: "language_aware" as const,
        fallback_strategy: "sliding" as const,
        target_chunk_tokens: 220,
        chunk_overlap_tokens: 40,
        budget_tokenizer: "ranking" as const,
        boundary_strictness: "legacy" as const,
        max_chunks_per_file: 300,
        parse_timeout_ms: 80,
        enabled_languages: ["typescript"]
      };
      const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
      const file = {
        path: "src/no-boundaries.ts",
        language: "typescript",
        content: ["const alpha = 1;", "const beta = alpha + 1;", "const gamma = beta + 1;"].join("\n")
      };

      const result = buildChunksForFile({ file, config, tokenize });
      expect(result.strategy).toBe("sliding");
      expect(result.fallback_reason).toBe("empty_language_boundaries");
      expect(result.language).toBe("typescript");
    } finally {
      __resetChunkingParserStateForTests();
    }
  });

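  // Forcing Parser.prototype.parse to throw exercises the parse_error path
  // while leaving the rest of the pipeline intact.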
  it("falls back with parse_error when parser throws during language-aware parsing", () => {
    __resetChunkingParserStateForTests();
    const parseSpy = vi.spyOn(Parser.prototype as { parse: (input: string) => unknown }, "parse");
    parseSpy.mockImplementation(() => {
      throw new Error("forced parse failure");
    });
    try {
      const config = {
        strategy: "language_aware" as const,
        fallback_strategy: "sliding" as const,
        target_chunk_tokens: 220,
        chunk_overlap_tokens: 40,
        budget_tokenizer: "ranking" as const,
        boundary_strictness: "legacy" as const,
        max_chunks_per_file: 300,
        parse_timeout_ms: 80,
        enabled_languages: ["typescript"]
      };
      const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
      const file = {
        path: "src/parse-error.ts",
        language: "typescript",
        content: ["export function alpha(input: number) {", "  return input + 1;", "}"].join("\n")
      };

      const result = buildChunksForFile({ file, config, tokenize });
      expect(result.strategy).toBe("sliding");
      expect(result.fallback_reason).toBe("parse_error");
      expect(result.language).toBe("typescript");
    } finally {
      parseSpy.mockRestore();
      __resetChunkingParserStateForTests();
    }
  });

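  // The next three tests re-run the fallback scenarios end to end and assert
  // that index_chunking_fallback_total is labeled with both the fallback
  // reason and the source language.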
  it("emits fallback metrics with reason and language labels for parse_timeout_exceeded", async () => {
    __resetChunkingParserStateForTests();
    const nowSpy = vi.spyOn(Date, "now");
    let tick = 0;
    nowSpy.mockImplementation(() => {
      tick += 3;
      return tick;
    });
    try {
      const store = new InMemoryIndexStore();
      const observability = getObservability(`retrieval-core-timeout-fallback-${Date.now()}`);
      const core = new RetrievalCore(store, new InMemoryQueryCache(), {
        observability,
        chunkingConfig: {
          strategy: "language_aware",
          parse_timeout_ms: 1,
          enabled_languages: ["typescript"]
        }
      });

      const chunks = await indexAndListChunks({
        core,
        store,
        tenant_id: "tenant-timeout",
        workspace_id: "ws-timeout",
        index_version: "idx-timeout-1",
        file: {
          path: "src/timeout.ts",
          language: "typescript",
          content: ["export function alpha(input: number) {", "  return input + 1;", "}"].join("\n")
        }
      });
      expect(chunks.length).toBeGreaterThan(0);

      const fallbackCounters = observability.metrics.readCounter("index_chunking_fallback_total");
      expect(
        fallbackCounters.some(
          (counter) => counter.labels.reason === "parse_timeout_exceeded" && counter.labels.language === "typescript"
        )
      ).toBe(true);
    } finally {
      nowSpy.mockRestore();
      __resetChunkingParserStateForTests();
    }
  });

  it("emits fallback metrics with reason and language labels for empty_language_boundaries", async () => {
    __resetChunkingParserStateForTests();
    try {
      const store = new InMemoryIndexStore();
      const observability = getObservability(`retrieval-core-empty-boundaries-${Date.now()}`);
      const core = new RetrievalCore(store, new InMemoryQueryCache(), {
        observability,
        chunkingConfig: {
          strategy: "language_aware",
          recursive_semantic_chunking_enabled: false,
          enabled_languages: ["typescript"]
        }
      });

      const chunks = await indexAndListChunks({
        core,
        store,
        tenant_id: "tenant-empty",
        workspace_id: "ws-empty",
        index_version: "idx-empty-1",
        file: {
          path: "src/no-boundaries.ts",
          language: "typescript",
          content: ["const alpha = 1;", "const beta = alpha + 1;", "const gamma = beta + 1;"].join("\n")
        }
      });
      expect(chunks.length).toBeGreaterThan(0);

      const fallbackCounters = observability.metrics.readCounter("index_chunking_fallback_total");
      expect(
        fallbackCounters.some(
          (counter) => counter.labels.reason === "empty_language_boundaries" && counter.labels.language === "typescript"
        )
      ).toBe(true);
    } finally {
      __resetChunkingParserStateForTests();
    }
  });

  it("emits fallback metrics with reason and language labels for parse_error", async () => {
    __resetChunkingParserStateForTests();
    const parseSpy = vi.spyOn(Parser.prototype as { parse: (input: string) => unknown }, "parse");
    parseSpy.mockImplementation(() => {
      throw new Error("forced parse failure");
    });
    try {
      const store = new InMemoryIndexStore();
      const observability = getObservability(`retrieval-core-parse-error-${Date.now()}`);
      const core = new RetrievalCore(store, new InMemoryQueryCache(), {
        observability,
        chunkingConfig: {
          strategy: "language_aware",
          enabled_languages: ["typescript"]
        }
      });

      const chunks = await indexAndListChunks({
        core,
        store,
        tenant_id: "tenant-parse-error",
        workspace_id: "ws-parse-error",
        index_version: "idx-parse-error-1",
        file: {
          path: "src/parse-error.ts",
          language: "typescript",
          content: ["export function alpha(input: number) {", "  return input + 1;", "}"].join("\n")
        }
      });
      expect(chunks.length).toBeGreaterThan(0);

      const fallbackCounters = observability.metrics.readCounter("index_chunking_fallback_total");
      expect(
        fallbackCounters.some((counter) => counter.labels.reason === "parse_error" && counter.labels.language === "typescript")
      ).toBe(true);
    } finally {
      parseSpy.mockRestore();
      __resetChunkingParserStateForTests();
    }
  });

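  // With 400 identical lines, chunk start lines must still be strictly
  // increasing; sliding-mode coordinates may not depend on line content.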
  it("keeps deterministic line coordinates in sliding mode when lines repeat", async () => {
    const store = new InMemoryIndexStore();
    const core = new RetrievalCore(store, new InMemoryQueryCache(), {
      chunkingConfig: {
        strategy: "sliding"
      }
    });

    const chunks = await indexAndListChunks({
      core,
      store,
      tenant_id: "tenant-c",
      workspace_id: "ws-c",
      index_version: "idx-c1",
      file: {
        path: "src/repeated.ts",
        language: "typescript",
        content: Array.from({ length: 400 }, () => "const token = 1;").join("\n")
      }
    });

    expect(chunks.length).toBeGreaterThan(2);
    const starts = chunks.map((chunk) => chunk.start_line);
    expect(starts.some((line) => line > 1)).toBe(true);
    for (let i = 1; i < starts.length; i += 1) {
      expect(starts[i]).toBeGreaterThan(starts[i - 1] ?? 0);
    }
  });

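  // Table-style checks over the boundary-candidate predicate: a node counts
  // as a boundary only when its parent/ancestor context carries a symbol
  // (assignment, declarator, export), not when it is an inline callback.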
  it("keeps javascript boundary candidate filtering focused on symbol-bearing patterns", () => {
    expect(
      __isChunkingBoundaryCandidateForTests({
        parserLanguage: "javascript",
        nodeType: "function_expression",
        parentType: "assignment_expression"
      })
    ).toBe(true);
    expect(
      __isChunkingBoundaryCandidateForTests({
        parserLanguage: "javascript",
        nodeType: "function_expression",
        parentType: "arguments"
      })
    ).toBe(false);
    expect(
      __isChunkingBoundaryCandidateForTests({
        parserLanguage: "javascript",
        nodeType: "arrow_function",
        parentType: "variable_declarator"
      })
    ).toBe(true);
    expect(
      __isChunkingBoundaryCandidateForTests({
        parserLanguage: "javascript",
        nodeType: "method_definition",
        parentType: "object"
      })
    ).toBe(true);
    expect(
      __isChunkingBoundaryCandidateForTests({
        parserLanguage: "typescript",
        nodeType: "function_expression",
        parentType: "arguments",
        boundaryStrictness: "semantic_js_ts"
      })
    ).toBe(false);
    expect(
      __isChunkingBoundaryCandidateForTests({
        parserLanguage: "typescript",
        nodeType: "arrow_function",
        parentType: "arguments",
        boundaryStrictness: "semantic_js_ts"
      })
    ).toBe(false);
    expect(
      __isChunkingBoundaryCandidateForTests({
        parserLanguage: "typescript",
        nodeType: "arrow_function",
        parentType: "parenthesized_expression",
        ancestorTypes: ["as_expression", "variable_declarator"],
        boundaryStrictness: "semantic_js_ts"
      })
    ).toBe(true);
    expect(
      __isChunkingBoundaryCandidateForTests({
        parserLanguage: "typescript",
        nodeType: "arrow_function",
        parentType: "parenthesized_expression",
        ancestorTypes: ["export_statement"],
        boundaryStrictness: "semantic_js_ts"
      })
    ).toBe(true);
    expect(
      __isChunkingBoundaryCandidateForTests({
        parserLanguage: "typescript",
        nodeType: "arrow_function",
        parentType: "parenthesized_expression",
        ancestorTypes: ["arguments"],
        boundaryStrictness: "semantic_js_ts"
      })
    ).toBe(false);
    expect(
      __isChunkingBoundaryCandidateForTests({
        parserLanguage: "typescript",
        nodeType: "class",
        parentType: "export_statement",
        boundaryStrictness: "semantic_js_ts"
      })
    ).toBe(true);
    expect(
      __isChunkingBoundaryCandidateForTests({
        parserLanguage: "typescript",
        nodeType: "class",
        parentType: "arguments",
        boundaryStrictness: "semantic_js_ts"
      })
    ).toBe(false);
  });

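  // The next five tests pin down semantic_js_ts strictness: declaration-like
  // constructs must land in one chunk with their bodies intact rather than
  // being cut mid-body by the token budget.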
  it("keeps typescript arrow-function snippets complete with semantic boundary strictness", () => {
    const config = {
      strategy: "language_aware" as const,
      fallback_strategy: "sliding" as const,
      target_chunk_tokens: 12,
      chunk_overlap_tokens: 4,
      budget_tokenizer: "lightweight" as const,
      boundary_strictness: "semantic_js_ts" as const,
      max_chunks_per_file: 300,
      parse_timeout_ms: 80,
      enabled_languages: ["typescript"]
    };
    const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
    const file = {
      path: "src/runtime.ts",
      language: "typescript",
      content: [
        "export const alpha = (input: number) => {",
        "  const first = input + 1;",
        "  const second = first * 2;",
        "  return second;",
        "};",
        "",
        "export const beta = (input: number) => {",
        "  return alpha(input);",
        "};"
      ].join("\n")
    };
    const result = buildChunksForFile({ file, config, tokenize });
    expect(result.strategy).toBe("language_aware");
    expect(result.fallback_reason).toBeUndefined();
    expect(result.chunks.some((chunk) => chunk.snippet.includes("export const alpha = (input: number) => {"))).toBe(true);
    expect(result.chunks.some((chunk) => chunk.snippet.includes("return second;"))).toBe(true);
  });

  it("keeps wrapped export-assigned functions as semantic boundaries", () => {
    const config = {
      strategy: "language_aware" as const,
      fallback_strategy: "sliding" as const,
      target_chunk_tokens: 60,
      chunk_overlap_tokens: 12,
      budget_tokenizer: "lightweight" as const,
      boundary_strictness: "semantic_js_ts" as const,
      max_chunks_per_file: 300,
      parse_timeout_ms: 80,
      enabled_languages: ["typescript"]
    };
    const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
    const file = {
      path: "src/export-assigned.ts",
      language: "typescript",
      content: [
        "export default ((input: number) => {",
        "  return input + 1;",
        "});"
      ].join("\n")
    };
    const result = buildChunksForFile({ file, config, tokenize });
    expect(result.strategy).toBe("language_aware");
    expect(result.fallback_reason).toBeUndefined();
    expect(result.chunks.some((chunk) => chunk.snippet.includes("export default ((input: number) => {"))).toBe(true);
    expect(result.chunks.some((chunk) => chunk.snippet.includes("return input + 1;"))).toBe(true);
  });

  it("keeps wrapped variable-initialized function expressions as semantic boundaries", () => {
    const config = {
      strategy: "language_aware" as const,
      fallback_strategy: "sliding" as const,
      target_chunk_tokens: 60,
      chunk_overlap_tokens: 12,
      budget_tokenizer: "lightweight" as const,
      boundary_strictness: "semantic_js_ts" as const,
      max_chunks_per_file: 300,
      parse_timeout_ms: 80,
      enabled_languages: ["typescript"]
    };
    const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
    const file = {
      path: "src/variable-wrapped.ts",
      language: "typescript",
      content: [
        "const build = ((input: number) => {",
        "  return input + 1;",
        "}) as (input: number) => number;"
      ].join("\n")
    };
    const result = buildChunksForFile({ file, config, tokenize });
    expect(result.strategy).toBe("language_aware");
    expect(result.fallback_reason).toBeUndefined();
    expect(result.chunks.some((chunk) => chunk.snippet.includes("const build = ((input: number) => {"))).toBe(true);
    expect(result.chunks.some((chunk) => chunk.snippet.includes("return input + 1;"))).toBe(true);
  });

  it("keeps export-assigned class expressions as semantic boundaries", () => {
    const config = {
      strategy: "language_aware" as const,
      fallback_strategy: "sliding" as const,
      target_chunk_tokens: 60,
      chunk_overlap_tokens: 12,
      budget_tokenizer: "lightweight" as const,
      boundary_strictness: "semantic_js_ts" as const,
      max_chunks_per_file: 300,
      parse_timeout_ms: 80,
      enabled_languages: ["typescript"]
    };
    const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
    const file = {
      path: "src/export-class.ts",
      language: "typescript",
      content: ["export default class Service {}"].join("\n")
    };
    const result = buildChunksForFile({ file, config, tokenize });
    expect(result.strategy).toBe("language_aware");
    expect(result.fallback_reason).toBeUndefined();
    expect(result.chunks.some((chunk) => chunk.snippet.includes("export default class Service {}"))).toBe(true);
  });

  it("keeps object literal method blocks as semantic boundaries", () => {
    const config = {
      strategy: "language_aware" as const,
      fallback_strategy: "sliding" as const,
      target_chunk_tokens: 60,
      chunk_overlap_tokens: 12,
      budget_tokenizer: "lightweight" as const,
      boundary_strictness: "semantic_js_ts" as const,
      max_chunks_per_file: 300,
      parse_timeout_ms: 80,
      enabled_languages: ["typescript"]
    };
    const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
    const file = {
      path: "src/object-methods.ts",
      language: "typescript",
      content: [
        "export const handlers = {",
        "  onReady() {",
        "    return 1;",
        "  }",
        "};"
      ].join("\n")
    };
    const result = buildChunksForFile({ file, config, tokenize });
    expect(result.strategy).toBe("language_aware");
    expect(result.fallback_reason).toBeUndefined();
    expect(result.chunks.some((chunk) => chunk.snippet.includes("export const handlers = {"))).toBe(true);
    expect(result.chunks.some((chunk) => chunk.snippet.includes("onReady()"))).toBe(true);
  });

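  // Extension-based language detection: .mts/.cts files carry no explicit
  // language tag and must still resolve to typescript.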
  it("treats .mts and .cts as typescript for language-aware chunking", () => {
    const config = {
      strategy: "language_aware" as const,
      fallback_strategy: "sliding" as const,
      target_chunk_tokens: 80,
      chunk_overlap_tokens: 16,
      budget_tokenizer: "lightweight" as const,
      boundary_strictness: "semantic_js_ts" as const,
      max_chunks_per_file: 300,
      parse_timeout_ms: 80,
      enabled_languages: ["typescript", "javascript"]
    };
    const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
    const content = ["export function alpha(input: number) {", "  return input + 1;", "}"].join("\n");
    const mts = buildChunksForFile({
      file: {
        path: "src/runtime.mts",
        content
      },
      config,
      tokenize
    });
    const cts = buildChunksForFile({
      file: {
        path: "src/runtime.cts",
        content
      },
      config,
      tokenize
    });
    expect(mts.strategy).toBe("language_aware");
    expect(cts.strategy).toBe("language_aware");
    expect(mts.fallback_reason).toBeUndefined();
    expect(cts.fallback_reason).toBeUndefined();
  });

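  // Rust/Java coverage uses mockBoundaryTreeFromSource (above) in place of
  // the native grammars, so these tests exercise boundary handling even where
  // the real parsers are not installed.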
  it("uses parser-aware chunking for rust and java declarations without mid-body truncation", () => {
    __resetChunkingParserStateForTests();
    const parseSpy = vi.spyOn(Parser.prototype as { parse: (input: string) => unknown }, "parse");
    parseSpy.mockImplementation((source: string) => mockBoundaryTreeFromSource(source));
    try {
      const config = {
        strategy: "language_aware" as const,
        fallback_strategy: "sliding" as const,
        target_chunk_tokens: 64,
        chunk_overlap_tokens: 12,
        budget_tokenizer: "lightweight" as const,
        boundary_strictness: "legacy" as const,
        max_chunks_per_file: 300,
        parse_timeout_ms: 80,
        enabled_languages: ["rust", "java"]
      };
      const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);

      const rustResult = buildChunksForFile({
        file: {
          path: "src/counter.rs",
          content: [
            "pub struct Counter {",
            "  value: i32,",
            "}",
            "",
            "impl Counter {",
            "  pub fn increment(&mut self, amount: i32) {",
            "    self.value += amount;",
            "  }",
            "}",
            "",
            "pub fn compute_total(input: i32) -> i32 {",
            "  let doubled = input * 2;",
            "  doubled + 1",
            "}"
          ].join("\n")
        },
        config,
        tokenize
      });
      expect(rustResult.strategy).toBe("language_aware");
      expect(rustResult.fallback_reason).toBeUndefined();
      expect(rustResult.language).toBe("rust");
      const rustFunctionChunk = rustResult.chunks.find((chunk) => chunk.snippet.includes("pub fn compute_total(input: i32) -> i32 {"));
      expect(rustFunctionChunk).toBeDefined();
      expect(rustFunctionChunk?.snippet.includes("let doubled = input * 2;")).toBe(true);
      expect(rustFunctionChunk?.snippet.includes("doubled + 1")).toBe(true);

      const javaResult = buildChunksForFile({
        file: {
          path: "src/Calculator.java",
          content: [
            "public class Calculator {",
            "  public int sum(int left, int right) {",
            "    int total = left + right;",
            "    return total;",
            "  }",
            "",
            "  public int multiply(int left, int right) {",
            "    return left * right;",
            "  }",
            "}"
          ].join("\n")
        },
        config,
        tokenize
      });
      expect(javaResult.strategy).toBe("language_aware");
      expect(javaResult.fallback_reason).toBeUndefined();
      expect(javaResult.language).toBe("java");
      const javaMethodChunk = javaResult.chunks.find((chunk) => chunk.snippet.includes("public int sum(int left, int right) {"));
      expect(javaMethodChunk).toBeDefined();
      expect(javaMethodChunk?.snippet.includes("int total = left + right;")).toBe(true);
      expect(javaMethodChunk?.snippet.includes("return total;")).toBe(true);
    } finally {
      parseSpy.mockRestore();
      __resetChunkingParserStateForTests();
    }
  });

  it("resolves rust/java aliases and paths for parser-aware chunking", () => {
    __resetChunkingParserStateForTests();
    const parseSpy = vi.spyOn(Parser.prototype as { parse: (input: string) => unknown }, "parse");
    parseSpy.mockImplementation((source: string) => mockBoundaryTreeFromSource(source));
    try {
      const config = {
        strategy: "language_aware" as const,
        fallback_strategy: "sliding" as const,
        target_chunk_tokens: 80,
        chunk_overlap_tokens: 16,
        budget_tokenizer: "lightweight" as const,
        boundary_strictness: "legacy" as const,
        max_chunks_per_file: 300,
        parse_timeout_ms: 80,
        enabled_languages: ["rust", "java"]
      };
      const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
      const rustContent = ["pub fn alpha(input: i32) -> i32 {", "  input + 1", "}"].join("\n");
      const javaContent = ["public class Demo {", "  int alpha() {", "    return 1;", "  }", "}"].join("\n");

      const rustAlias = buildChunksForFile({
        file: {
          path: "src/alpha.txt",
          language: "rs",
          content: rustContent
        },
        config,
        tokenize
      });
      const rustPath = buildChunksForFile({
        file: {
          path: "src/alpha.rs",
          content: rustContent
        },
        config,
        tokenize
      });
      const javaPath = buildChunksForFile({
        file: {
          path: "src/Demo.java",
          content: javaContent
        },
        config,
        tokenize
      });

      expect(rustAlias.strategy).toBe("language_aware");
      expect(rustPath.strategy).toBe("language_aware");
      expect(javaPath.strategy).toBe("language_aware");
      expect(rustAlias.fallback_reason).toBeUndefined();
      expect(rustPath.fallback_reason).toBeUndefined();
      expect(javaPath.fallback_reason).toBeUndefined();
      expect(rustAlias.language).toBe("rust");
      expect(rustPath.language).toBe("rust");
      expect(javaPath.language).toBe("java");
    } finally {
      parseSpy.mockRestore();
      __resetChunkingParserStateForTests();
    }
  });

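  // Loader failures must be cached: repeated files in the same language may
  // trigger at most one parser init attempt, and the availability snapshot
  // must report the language as unavailable.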
  it("falls back with parser_unavailable for rust/java parser load failures and caches deterministically", () => {
    __resetChunkingParserStateForTests();
    try {
      __setChunkingParserLanguageLoaderForTests("rust", () => {
        throw new Error("forced rust parser load failure");
      });
      __setChunkingParserLanguageLoaderForTests("java", () => {
        throw new Error("forced java parser load failure");
      });

      const config = {
        strategy: "language_aware" as const,
        fallback_strategy: "sliding" as const,
        target_chunk_tokens: 220,
        chunk_overlap_tokens: 40,
        budget_tokenizer: "ranking" as const,
        boundary_strictness: "legacy" as const,
        max_chunks_per_file: 300,
        parse_timeout_ms: 80,
        enabled_languages: ["rust", "java"]
      };
      const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);

      const rustFile = {
        path: "src/runtime.rs",
        language: "rust",
        content: ["pub fn alpha() -> i32 {", "  1", "}"].join("\n")
      };
      const javaFile = {
        path: "src/Runtime.java",
        language: "java",
        content: ["public class Runtime {", "  int alpha() {", "    return 1;", "  }", "}"].join("\n")
      };

      const rustFirst = buildChunksForFile({ file: rustFile, config, tokenize });
      const rustSecond = buildChunksForFile({ file: rustFile, config, tokenize });
      const javaFirst = buildChunksForFile({ file: javaFile, config, tokenize });
      const javaSecond = buildChunksForFile({ file: javaFile, config, tokenize });

      expect(rustFirst.fallback_reason).toBe("parser_unavailable");
      expect(rustSecond.fallback_reason).toBe("parser_unavailable");
      expect(javaFirst.fallback_reason).toBe("parser_unavailable");
      expect(javaSecond.fallback_reason).toBe("parser_unavailable");
      expect(rustFirst.language).toBe("rust");
      expect(javaFirst.language).toBe("java");

      const attempts = __getChunkingParserInitAttemptsForTests();
      expect(attempts.rust).toBe(1);
      expect(attempts.java).toBe(1);

      const snapshot = getChunkingParserAvailabilitySnapshot({
        enabled_languages: ["rust", "java"]
      });
      expect(snapshot.some((entry) => entry.language === "rust" && entry.status === "unavailable")).toBe(true);
      expect(snapshot.some((entry) => entry.language === "java" && entry.status === "unavailable")).toBe(true);
    } finally {
      __resetChunkingParserStateForTests();
    }
  });

  it("falls back with parse_error for rust/java when parser throws during parse", () => {
    __resetChunkingParserStateForTests();
    const parseSpy = vi.spyOn(Parser.prototype as { parse: (input: string) => unknown }, "parse");
    parseSpy.mockImplementation(() => {
      throw new Error("forced parse failure");
    });
    try {
      const config = {
        strategy: "language_aware" as const,
        fallback_strategy: "sliding" as const,
        target_chunk_tokens: 220,
        chunk_overlap_tokens: 40,
        budget_tokenizer: "ranking" as const,
        boundary_strictness: "legacy" as const,
        max_chunks_per_file: 300,
        parse_timeout_ms: 80,
        enabled_languages: ["rust", "java"]
      };
      const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
      const rustResult = buildChunksForFile({
        file: {
          path: "src/parse-error.rs",
          language: "rust",
          content: ["pub fn alpha() -> i32 {", "  1", "}"].join("\n")
        },
        config,
        tokenize
      });
      const javaResult = buildChunksForFile({
        file: {
          path: "src/ParseError.java",
          language: "java",
          content: ["public class ParseError {", "  int alpha() {", "    return 1;", "  }", "}"].join("\n")
        },
        config,
        tokenize
      });

      expect(rustResult.strategy).toBe("sliding");
      expect(javaResult.strategy).toBe("sliding");
      expect(rustResult.fallback_reason).toBe("parse_error");
      expect(javaResult.fallback_reason).toBe("parse_error");
      expect(rustResult.language).toBe("rust");
      expect(javaResult.language).toBe("java");
    } finally {
      parseSpy.mockRestore();
      __resetChunkingParserStateForTests();
    }
  });

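  // With a deliberately small token budget (28 tokens), recursive semantic
  // chunking should still keep each exported function within a chunk instead
  // of splitting on raw token windows.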
  it("builds recursive semantic chunks with stable declaration boundaries", () => {
    const config = {
      strategy: "language_aware" as const,
      fallback_strategy: "sliding" as const,
      target_chunk_tokens: 28,
      chunk_overlap_tokens: 6,
      budget_tokenizer: "lightweight" as const,
      boundary_strictness: "semantic_js_ts" as const,
      max_chunks_per_file: 300,
      parse_timeout_ms: 80,
      enabled_languages: ["typescript"],
      recursive_semantic_chunking_enabled: true,
      semantic_merge_gap_lines: 1,
      semantic_merge_max_span_lines: 120
    };
    const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
    const file = {
      path: "src/recursive-runtime.ts",
      language: "typescript",
      content: [
        "export function alpha(input: number): number {",
        "  const a = input + 1;",
        "  return a;",
        "}",
        "",
        "export function beta(input: number): number {",
        "  const b = alpha(input) * 2;",
        "  return b;",
        "}",
        "",
        "export function gamma(input: number): number {",
        "  const c = beta(input) + 3;",
        "  return c;",
        "}"
      ].join("\n")
    };
    const result = buildChunksForFile({ file, config, tokenize });
    expect(result.strategy).toBe("language_aware");
    expect(result.fallback_reason).toBeUndefined();
    expect(result.recursive_semantic_chunking_used).toBe(true);
    expect(result.chunks.some((chunk) => chunk.snippet.includes("export function alpha"))).toBe(true);
    expect(result.chunks.some((chunk) => chunk.snippet.includes("export function beta"))).toBe(true);
    expect(result.chunks.some((chunk) => chunk.snippet.includes("export function gamma"))).toBe(true);
    expect(result.chunks.every((chunk) => chunk.start_line >= 1 && chunk.end_line >= chunk.start_line)).toBe(true);
  });

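  // Same file, two configs differing only in comment_forward_absorb_enabled:
  // with absorption on, the leading comment block must share a chunk with
  // alpha; with it off, the comment may land in a chunk without alpha.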
  it("forward-absorbs comment windows into following code windows when enabled", () => {
    const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
    const baseConfig = {
      strategy: "language_aware" as const,
      fallback_strategy: "sliding" as const,
      target_chunk_tokens: 30,
      chunk_overlap_tokens: 4,
      budget_tokenizer: "lightweight" as const,
      boundary_strictness: "semantic_js_ts" as const,
      max_chunks_per_file: 300,
      parse_timeout_ms: 80,
      enabled_languages: ["typescript"],
      recursive_semantic_chunking_enabled: true,
      semantic_merge_gap_lines: 0,
      semantic_merge_max_span_lines: 120
    };
    const file = {
      path: "src/comment-absorb.ts",
      language: "typescript",
      content: [
        "// Explanation for alpha",
        "// Keep this doc attached to alpha",
        "",
        "export function alpha(input: number): number {",
        "  return input + 1;",
        "}",
        "",
        "export function beta(input: number): number {",
        "  return alpha(input);",
        "}"
      ].join("\n")
    };

    const withAbsorb = buildChunksForFile({
      file,
      config: {
        ...baseConfig,
        comment_forward_absorb_enabled: true
      },
      tokenize
    });
    const withoutAbsorb = buildChunksForFile({
      file,
      config: {
        ...baseConfig,
        comment_forward_absorb_enabled: false
      },
      tokenize
    });

    expect(withAbsorb.chunks.some((chunk) => chunk.snippet.includes("Explanation for alpha") && chunk.snippet.includes("export function alpha"))).toBe(
      true
    );
    expect(
      withoutAbsorb.chunks.some(
        (chunk) => chunk.snippet.includes("Explanation for alpha") && !chunk.snippet.includes("export function alpha")
      )
    ).toBe(true);
  });

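  // End-to-end availability accounting across four languages: at most one
  // parser init attempt per language, availability reported via snapshot and
  // gauge metrics, and no parser_unavailable fallbacks recorded.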
  it("emits parser availability snapshot and avoids repeated parser init attempts", async () => {
    __resetChunkingParserStateForTests();
    try {
      const store = new InMemoryIndexStore();
      const observability = getObservability(`retrieval-core-js-py-${Date.now()}`);
      const core = new RetrievalCore(store, new InMemoryQueryCache(), {
        observability,
        chunkingConfig: {
          strategy: "language_aware",
          enabled_languages: ["typescript", "javascript", "python", "go", "rust", "java"]
        }
      });

      await indexAndListChunks({
        core,
        store,
        tenant_id: "tenant-d",
        workspace_id: "ws-d",
        index_version: "idx-d1",
        file: {
          path: "src/runtime.js",
          language: "javascript",
          content: ["export function alpha() {", "  return 1;", "}"].join("\n")
        }
      });

      await indexAndListChunks({
        core,
        store,
        tenant_id: "tenant-d",
        workspace_id: "ws-d",
        index_version: "idx-d2",
        file: {
          path: "src/runtime.py",
          language: "python",
          content: ["def alpha():", "  return 1"].join("\n")
        }
      });

      await indexAndListChunks({
        core,
        store,
        tenant_id: "tenant-d",
        workspace_id: "ws-d",
        index_version: "idx-d3",
        file: {
          path: "src/runtime.rs",
          language: "rust",
          content: ["pub fn alpha() -> i32 {", "  1", "}"].join("\n")
        }
      });

      await indexAndListChunks({
        core,
        store,
        tenant_id: "tenant-d",
        workspace_id: "ws-d",
        index_version: "idx-d4",
        file: {
          path: "src/Runtime.java",
          language: "java",
          content: ["public class Runtime {", "  int alpha() {", "    return 1;", "  }", "}"].join("\n")
        }
      });

      const attempts = __getChunkingParserInitAttemptsForTests();
      expect((attempts.javascript ?? 0) <= 1).toBe(true);
      expect((attempts.python ?? 0) <= 1).toBe(true);
      expect((attempts.rust ?? 0) <= 1).toBe(true);
      expect((attempts.java ?? 0) <= 1).toBe(true);

      const strategyCounters = observability.metrics.readCounter("index_chunking_strategy_total");
      expect(
        strategyCounters.some((counter) => counter.labels.language === "javascript")
      ).toBe(true);
      expect(strategyCounters.some((counter) => counter.labels.language === "python")).toBe(true);
      expect(strategyCounters.some((counter) => counter.labels.language === "rust")).toBe(true);
      expect(strategyCounters.some((counter) => counter.labels.language === "java")).toBe(true);

      const snapshot = getChunkingParserAvailabilitySnapshot({
        enabled_languages: ["typescript", "javascript", "python", "go", "rust", "java"]
      });
      expect(snapshot.some((entry) => entry.language === "javascript" && entry.status === "available")).toBe(true);
      expect(snapshot.some((entry) => entry.language === "python" && entry.status === "available")).toBe(true);
      expect(snapshot.some((entry) => entry.language === "rust" && entry.status === "available")).toBe(true);
      expect(snapshot.some((entry) => entry.language === "java" && entry.status === "available")).toBe(true);

      const availabilityGauges = observability.metrics.readGauge("index_chunking_parser_availability");
      expect(
        availabilityGauges.some((point) => point.labels.language === "javascript" && point.labels.status === "available")
      ).toBe(true);
      expect(availabilityGauges.some((point) => point.labels.language === "python" && point.labels.status === "available")).toBe(
        true
      );
      expect(availabilityGauges.some((point) => point.labels.language === "rust" && point.labels.status === "available")).toBe(
        true
      );
      expect(availabilityGauges.some((point) => point.labels.language === "java" && point.labels.status === "available")).toBe(
        true
      );

      const fallbackCounters = observability.metrics.readCounter("index_chunking_fallback_total");
      expect(
        fallbackCounters.some(
          (counter) =>
            counter.labels.reason === "parser_unavailable" &&
            (counter.labels.language === "javascript" ||
              counter.labels.language === "python" ||
              counter.labels.language === "rust" ||
              counter.labels.language === "java")
        )
      ).toBe(false);
    } finally {
      __resetChunkingParserStateForTests();
    }
  });

  it("caches parser unavailability and avoids repeated parser init attempts", () => {
    __resetChunkingParserStateForTests();
    try {
      __setChunkingParserLanguageLoaderForTests("python", () => {
        throw new Error("forced parser load failure");
      });

      const config = {
        strategy: "language_aware" as const,
        fallback_strategy: "sliding" as const,
        target_chunk_tokens: 220,
        chunk_overlap_tokens: 40,
        budget_tokenizer: "ranking" as const,
        boundary_strictness: "legacy" as const,
        max_chunks_per_file: 300,
        parse_timeout_ms: 80,
        enabled_languages: ["python"]
      };
      const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
      const file = {
        path: "src/runtime.py",
        language: "python",
        content: ["def alpha():", "  return 1"].join("\n")
      };

      const first = buildChunksForFile({ file, config, tokenize });
      const second = buildChunksForFile({ file, config, tokenize });
      const third = buildChunksForFile({ file, config, tokenize });

      expect(first.fallback_reason).toBe("parser_unavailable");
      expect(second.fallback_reason).toBe("parser_unavailable");
      expect(third.fallback_reason).toBe("parser_unavailable");

      const attempts = __getChunkingParserInitAttemptsForTests();
      expect(attempts.python).toBe(1);

      const snapshot = getChunkingParserAvailabilitySnapshot({
        enabled_languages: ["python", "typescript"]
      });
      expect(snapshot.some((entry) => entry.language === "python" && entry.status === "unavailable")).toBe(true);
      expect(snapshot.some((entry) => entry.language === "typescript")).toBe(true);
    } finally {
      __resetChunkingParserStateForTests();
    }
  });
});