@cue-dev/retrieval-core 0.1.3

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in the public registries.
@@ -0,0 +1,1252 @@
+ import Parser from "tree-sitter";
+ import { describe, expect, it, vi } from "vitest";
+ import { InMemoryQueryCache } from "@cue-dev/data-plane";
+ import { getObservability } from "@cue-dev/observability";
+ import { InMemoryIndexStore, RetrievalCore } from "../src/index.js";
+ import {
+   __isChunkingBoundaryCandidateForTests,
+   __getChunkingParserInitAttemptsForTests,
+   __resetChunkingParserStateForTests,
+   __setChunkingParserLanguageLoaderForTests,
+   buildChunksForFile,
+   getChunkingParserAvailabilitySnapshot
+ } from "../src/chunking.js";
+
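+ // Indexes a single file through RetrievalCore and returns the chunks persisted for the newest ready index.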
+ async function indexAndListChunks(input: {
+   core: RetrievalCore;
+   store: InMemoryIndexStore;
+   tenant_id: string;
+   workspace_id: string;
+   index_version: string;
+   file: {
+     path: string;
+     language?: string;
+     content: string;
+   };
+ }): Promise<
+   Array<{
+     path: string;
+     start_line: number;
+     end_line: number;
+     snippet: string;
+   }>
+ > {
+   await input.store.upsertWorkspace({
+     workspace_id: input.workspace_id,
+     tenant_id: input.tenant_id,
+     name: "chunking-test",
+     project_root_path: "/workspace/chunking-test"
+   });
+
+   const report = await input.core.indexArtifact({
+     tenant_id: input.tenant_id,
+     workspace_id: input.workspace_id,
+     index_version: input.index_version,
+     files: [input.file]
+   });
+   expect(report.status).toBe("ready");
+
+   const index = await input.store.getLatestReadyIndex({
+     tenant_id: input.tenant_id,
+     workspace_id: input.workspace_id
+   });
+   expect(index).toBeDefined();
+
+   const chunks = await input.store.listChunksByIndex({
+     tenant_id: input.tenant_id,
+     index_id: index!.index_id
+   });
+   return chunks.map((chunk) => ({
+     path: chunk.path,
+     start_line: chunk.start_line,
+     end_line: chunk.end_line,
+     snippet: chunk.snippet
+   }));
+ }
+
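+ // Minimal stand-in for a tree-sitter Tree: rootNode.descendantsOfType returns synthetic rust/java declaration nodes found by simple text scanning.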
+ function mockBoundaryTreeFromSource(source: string): Parser.Tree {
+   const lines = source.split("\n");
+   const nodes: Parser.SyntaxNode[] = [];
+
+   const rustFunctionStart = lines.findIndex((line) => line.includes("pub fn"));
+   if (rustFunctionStart >= 0) {
+     // Walk forward to the first line whose trimmed text is "}" to approximate the function end.
+     let end = rustFunctionStart;
+     while (end < lines.length - 1 && lines[end]?.trim() !== "}") {
+       end += 1;
+     }
+     nodes.push({
+       type: "function_item",
+       startPosition: { row: rustFunctionStart, column: 0 },
+       endPosition: { row: end + 1, column: 0 },
+       parent: null
+     } as unknown as Parser.SyntaxNode);
+   }
+
+   const javaClassStart = lines.findIndex((line) => line.includes("class "));
+   if (javaClassStart >= 0) {
+     // Scan backward for the last closing brace to approximate the class end.
+     let end = lines.length - 1;
+     for (let row = lines.length - 1; row >= javaClassStart; row -= 1) {
+       if ((lines[row] ?? "").trim() === "}") {
+         end = row;
+         break;
+       }
+     }
+     nodes.push({
+       type: "class_declaration",
+       startPosition: { row: javaClassStart, column: 0 },
+       endPosition: { row: end + 1, column: 0 },
+       parent: null
+     } as unknown as Parser.SyntaxNode);
+   }
+
+   return {
+     rootNode: {
+       descendantsOfType(types: string[]): Parser.SyntaxNode[] {
+         return nodes.filter((node) => types.includes(node.type));
+       }
+     }
+   } as unknown as Parser.Tree;
+ }
+
+ describe("retrieval-core chunking", () => {
+   it("uses language-aware chunking for supported languages when configured", async () => {
+     const store = new InMemoryIndexStore();
+     const observability = getObservability(`retrieval-core-language-aware-${Date.now()}`);
+     const core = new RetrievalCore(store, new InMemoryQueryCache(), {
+       observability,
+       chunkingConfig: {
+         strategy: "language_aware",
+         parse_timeout_ms: 500
+       }
+     });
+
+     const chunks = await indexAndListChunks({
+       core,
+       store,
+       tenant_id: "tenant-a",
+       workspace_id: "ws-a",
+       index_version: "idx-a1",
+       file: {
+         path: "src/feature.ts",
+         language: "typescript",
+         content: [
+           "import { dep } from './dep';",
+           "",
+           "export function alpha(input: number) {",
+           " const value = dep(input);",
+           " return value + 1;",
+           "}",
+           "",
+           "export class Greeter {",
+           " greet(name: string) {",
+           " return `hello ${name}`;",
+           " }",
+           "}",
+           "",
+           "export function beta(input: number) {",
+           " const value = dep(input);",
+           " return value * 2;",
+           "}"
+         ].join("\n")
+       }
+     });
+
+     expect(chunks.some((chunk) => chunk.snippet.includes("function alpha"))).toBe(true);
+     expect(chunks.some((chunk) => chunk.snippet.includes("class Greeter"))).toBe(true);
+     expect(chunks.some((chunk) => chunk.snippet.includes("function beta"))).toBe(true);
+     expect(chunks.every((chunk) => chunk.start_line >= 1 && chunk.end_line >= chunk.start_line)).toBe(true);
+
+     const strategyCounters = observability.metrics.readCounter("index_chunking_strategy_total");
+     expect(strategyCounters.some((counter) => counter.labels.strategy === "language_aware" && counter.labels.reason === "none")).toBe(
+       true
+     );
+   });
+
+   it("falls back to sliding chunks for unsupported languages", async () => {
+     const store = new InMemoryIndexStore();
+     const observability = getObservability(`retrieval-core-unsupported-${Date.now()}`);
+     const core = new RetrievalCore(store, new InMemoryQueryCache(), {
+       observability,
+       chunkingConfig: {
+         strategy: "language_aware"
+       }
+     });
+
+     const chunks = await indexAndListChunks({
+       core,
+       store,
+       tenant_id: "tenant-b",
+       workspace_id: "ws-b",
+       index_version: "idx-b1",
+       file: {
+         path: "docs/readme.md",
+         language: "markdown",
+         content: Array.from({ length: 220 }, (_, idx) => `line ${idx + 1}: retrieval docs`).join("\n")
+       }
+     });
+
+     expect(chunks.length).toBeGreaterThan(0);
+     const fallbackCounters = observability.metrics.readCounter("index_chunking_fallback_total");
+     expect(
+       fallbackCounters.some(
+         (counter) => counter.labels.reason === "unsupported_language" && counter.labels.language === "markdown"
+       )
+     ).toBe(true);
+   });
+
+   it("falls back with parse_timeout_exceeded when parse latency exceeds the configured timeout", () => {
+     __resetChunkingParserStateForTests();
+     const nowSpy = vi.spyOn(Date, "now");
+     let tick = 0;
+     nowSpy.mockImplementation(() => {
+       tick += 3;
+       return tick;
+     });
+     try {
+       const config = {
+         strategy: "language_aware" as const,
+         fallback_strategy: "sliding" as const,
+         target_chunk_tokens: 220,
+         chunk_overlap_tokens: 40,
+         budget_tokenizer: "ranking" as const,
+         boundary_strictness: "legacy" as const,
+         max_chunks_per_file: 300,
+         parse_timeout_ms: 1,
+         enabled_languages: ["typescript"]
+       };
+       const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
+       const file = {
+         path: "src/timeout.ts",
+         language: "typescript",
+         content: ["export function alpha(input: number) {", " return input + 1;", "}"].join("\n")
+       };
+
+       const result = buildChunksForFile({ file, config, tokenize });
+       expect(result.strategy).toBe("sliding");
+       expect(result.fallback_reason).toBe("parse_timeout_exceeded");
+       expect(result.language).toBe("typescript");
+     } finally {
+       nowSpy.mockRestore();
+       __resetChunkingParserStateForTests();
+     }
+   });
+
+   it("falls back with empty_language_boundaries when parser returns no eligible declaration boundaries", () => {
+     __resetChunkingParserStateForTests();
+     try {
+       const config = {
+         strategy: "language_aware" as const,
+         fallback_strategy: "sliding" as const,
+         target_chunk_tokens: 220,
+         chunk_overlap_tokens: 40,
+         budget_tokenizer: "ranking" as const,
+         boundary_strictness: "legacy" as const,
+         max_chunks_per_file: 300,
+         parse_timeout_ms: 80,
+         enabled_languages: ["typescript"]
+       };
+       const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
+       const file = {
+         path: "src/no-boundaries.ts",
+         language: "typescript",
+         content: ["const alpha = 1;", "const beta = alpha + 1;", "const gamma = beta + 1;"].join("\n")
+       };
+
+       const result = buildChunksForFile({ file, config, tokenize });
+       expect(result.strategy).toBe("sliding");
+       expect(result.fallback_reason).toBe("empty_language_boundaries");
+       expect(result.language).toBe("typescript");
+     } finally {
+       __resetChunkingParserStateForTests();
+     }
+   });
+
+   it("falls back with parse_error when parser throws during language-aware parsing", () => {
+     __resetChunkingParserStateForTests();
+     const parseSpy = vi.spyOn(Parser.prototype as { parse: (input: string) => unknown }, "parse");
+     parseSpy.mockImplementation(() => {
+       throw new Error("forced parse failure");
+     });
+     try {
+       const config = {
+         strategy: "language_aware" as const,
+         fallback_strategy: "sliding" as const,
+         target_chunk_tokens: 220,
+         chunk_overlap_tokens: 40,
+         budget_tokenizer: "ranking" as const,
+         boundary_strictness: "legacy" as const,
+         max_chunks_per_file: 300,
+         parse_timeout_ms: 80,
+         enabled_languages: ["typescript"]
+       };
+       const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
+       const file = {
+         path: "src/parse-error.ts",
+         language: "typescript",
+         content: ["export function alpha(input: number) {", " return input + 1;", "}"].join("\n")
+       };
+
+       const result = buildChunksForFile({ file, config, tokenize });
+       expect(result.strategy).toBe("sliding");
+       expect(result.fallback_reason).toBe("parse_error");
+       expect(result.language).toBe("typescript");
+     } finally {
+       parseSpy.mockRestore();
+       __resetChunkingParserStateForTests();
+     }
+   });
+
+   it("emits fallback metrics with reason and language labels for parse_timeout_exceeded", async () => {
+     __resetChunkingParserStateForTests();
+     const nowSpy = vi.spyOn(Date, "now");
+     let tick = 0;
+     nowSpy.mockImplementation(() => {
+       tick += 3;
+       return tick;
+     });
+     try {
+       const store = new InMemoryIndexStore();
+       const observability = getObservability(`retrieval-core-timeout-fallback-${Date.now()}`);
+       const core = new RetrievalCore(store, new InMemoryQueryCache(), {
+         observability,
+         chunkingConfig: {
+           strategy: "language_aware",
+           parse_timeout_ms: 1,
+           enabled_languages: ["typescript"]
+         }
+       });
+
+       const chunks = await indexAndListChunks({
+         core,
+         store,
+         tenant_id: "tenant-timeout",
+         workspace_id: "ws-timeout",
+         index_version: "idx-timeout-1",
+         file: {
+           path: "src/timeout.ts",
+           language: "typescript",
+           content: ["export function alpha(input: number) {", " return input + 1;", "}"].join("\n")
+         }
+       });
+       expect(chunks.length).toBeGreaterThan(0);
+
+       const fallbackCounters = observability.metrics.readCounter("index_chunking_fallback_total");
+       expect(
+         fallbackCounters.some(
+           (counter) => counter.labels.reason === "parse_timeout_exceeded" && counter.labels.language === "typescript"
+         )
+       ).toBe(true);
+     } finally {
+       nowSpy.mockRestore();
+       __resetChunkingParserStateForTests();
+     }
+   });
+
+   it("emits fallback metrics with reason and language labels for empty_language_boundaries", async () => {
+     __resetChunkingParserStateForTests();
+     try {
+       const store = new InMemoryIndexStore();
+       const observability = getObservability(`retrieval-core-empty-boundaries-${Date.now()}`);
+       const core = new RetrievalCore(store, new InMemoryQueryCache(), {
+         observability,
+         chunkingConfig: {
+           strategy: "language_aware",
+           recursive_semantic_chunking_enabled: false,
+           enabled_languages: ["typescript"]
+         }
+       });
+
+       const chunks = await indexAndListChunks({
+         core,
+         store,
+         tenant_id: "tenant-empty",
+         workspace_id: "ws-empty",
+         index_version: "idx-empty-1",
+         file: {
+           path: "src/no-boundaries.ts",
+           language: "typescript",
+           content: ["const alpha = 1;", "const beta = alpha + 1;", "const gamma = beta + 1;"].join("\n")
+         }
+       });
+       expect(chunks.length).toBeGreaterThan(0);
+
+       const fallbackCounters = observability.metrics.readCounter("index_chunking_fallback_total");
+       expect(
+         fallbackCounters.some(
+           (counter) => counter.labels.reason === "empty_language_boundaries" && counter.labels.language === "typescript"
+         )
+       ).toBe(true);
+     } finally {
+       __resetChunkingParserStateForTests();
+     }
+   });
+
+   it("emits fallback metrics with reason and language labels for parse_error", async () => {
+     __resetChunkingParserStateForTests();
+     const parseSpy = vi.spyOn(Parser.prototype as { parse: (input: string) => unknown }, "parse");
+     parseSpy.mockImplementation(() => {
+       throw new Error("forced parse failure");
+     });
+     try {
+       const store = new InMemoryIndexStore();
+       const observability = getObservability(`retrieval-core-parse-error-${Date.now()}`);
+       const core = new RetrievalCore(store, new InMemoryQueryCache(), {
+         observability,
+         chunkingConfig: {
+           strategy: "language_aware",
+           enabled_languages: ["typescript"]
+         }
+       });
+
+       const chunks = await indexAndListChunks({
+         core,
+         store,
+         tenant_id: "tenant-parse-error",
+         workspace_id: "ws-parse-error",
+         index_version: "idx-parse-error-1",
+         file: {
+           path: "src/parse-error.ts",
+           language: "typescript",
+           content: ["export function alpha(input: number) {", " return input + 1;", "}"].join("\n")
+         }
+       });
+       expect(chunks.length).toBeGreaterThan(0);
+
+       const fallbackCounters = observability.metrics.readCounter("index_chunking_fallback_total");
+       expect(
+         fallbackCounters.some((counter) => counter.labels.reason === "parse_error" && counter.labels.language === "typescript")
+       ).toBe(true);
+     } finally {
+       parseSpy.mockRestore();
+       __resetChunkingParserStateForTests();
+     }
+   });
+
+   it("keeps deterministic line coordinates in sliding mode when lines repeat", async () => {
+     const store = new InMemoryIndexStore();
+     const core = new RetrievalCore(store, new InMemoryQueryCache(), {
+       chunkingConfig: {
+         strategy: "sliding"
+       }
+     });
+
+     const chunks = await indexAndListChunks({
+       core,
+       store,
+       tenant_id: "tenant-c",
+       workspace_id: "ws-c",
+       index_version: "idx-c1",
+       file: {
+         path: "src/repeated.ts",
+         language: "typescript",
+         content: Array.from({ length: 400 }, () => "const token = 1;").join("\n")
+       }
+     });
+
+     expect(chunks.length).toBeGreaterThan(2);
+     const starts = chunks.map((chunk) => chunk.start_line);
+     expect(starts.some((line) => line > 1)).toBe(true);
+     for (let i = 1; i < starts.length; i += 1) {
+       expect(starts[i]).toBeGreaterThan(starts[i - 1] ?? 0);
+     }
+   });
+
+   it("keeps javascript boundary candidate filtering focused on symbol-bearing patterns", () => {
+     expect(
+       __isChunkingBoundaryCandidateForTests({
+         parserLanguage: "javascript",
+         nodeType: "function_expression",
+         parentType: "assignment_expression"
+       })
+     ).toBe(true);
+     expect(
+       __isChunkingBoundaryCandidateForTests({
+         parserLanguage: "javascript",
+         nodeType: "function_expression",
+         parentType: "arguments"
+       })
+     ).toBe(false);
+     expect(
+       __isChunkingBoundaryCandidateForTests({
+         parserLanguage: "javascript",
+         nodeType: "arrow_function",
+         parentType: "variable_declarator"
+       })
+     ).toBe(true);
+     expect(
+       __isChunkingBoundaryCandidateForTests({
+         parserLanguage: "javascript",
+         nodeType: "method_definition",
+         parentType: "object"
+       })
+     ).toBe(true);
+     expect(
+       __isChunkingBoundaryCandidateForTests({
+         parserLanguage: "typescript",
+         nodeType: "function_expression",
+         parentType: "arguments",
+         boundaryStrictness: "semantic_js_ts"
+       })
+     ).toBe(false);
+     expect(
+       __isChunkingBoundaryCandidateForTests({
+         parserLanguage: "typescript",
+         nodeType: "arrow_function",
+         parentType: "arguments",
+         boundaryStrictness: "semantic_js_ts"
+       })
+     ).toBe(false);
+     expect(
+       __isChunkingBoundaryCandidateForTests({
+         parserLanguage: "typescript",
+         nodeType: "arrow_function",
+         parentType: "parenthesized_expression",
+         ancestorTypes: ["as_expression", "variable_declarator"],
+         boundaryStrictness: "semantic_js_ts"
+       })
+     ).toBe(true);
+     expect(
+       __isChunkingBoundaryCandidateForTests({
+         parserLanguage: "typescript",
+         nodeType: "arrow_function",
+         parentType: "parenthesized_expression",
+         ancestorTypes: ["export_statement"],
+         boundaryStrictness: "semantic_js_ts"
+       })
+     ).toBe(true);
+     expect(
+       __isChunkingBoundaryCandidateForTests({
+         parserLanguage: "typescript",
+         nodeType: "arrow_function",
+         parentType: "parenthesized_expression",
+         ancestorTypes: ["arguments"],
+         boundaryStrictness: "semantic_js_ts"
+       })
+     ).toBe(false);
+     expect(
+       __isChunkingBoundaryCandidateForTests({
+         parserLanguage: "typescript",
+         nodeType: "class",
+         parentType: "export_statement",
+         boundaryStrictness: "semantic_js_ts"
+       })
+     ).toBe(true);
+     expect(
+       __isChunkingBoundaryCandidateForTests({
+         parserLanguage: "typescript",
+         nodeType: "class",
+         parentType: "arguments",
+         boundaryStrictness: "semantic_js_ts"
+       })
+     ).toBe(false);
+   });
+
+   it("keeps typescript arrow-function snippets complete with semantic boundary strictness", () => {
+     const config = {
+       strategy: "language_aware" as const,
+       fallback_strategy: "sliding" as const,
+       target_chunk_tokens: 12,
+       chunk_overlap_tokens: 4,
+       budget_tokenizer: "lightweight" as const,
+       boundary_strictness: "semantic_js_ts" as const,
+       max_chunks_per_file: 300,
+       parse_timeout_ms: 80,
+       enabled_languages: ["typescript"]
+     };
+     const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
+     const file = {
+       path: "src/runtime.ts",
+       language: "typescript",
+       content: [
+         "export const alpha = (input: number) => {",
+         " const first = input + 1;",
+         " const second = first * 2;",
+         " return second;",
+         "};",
+         "",
+         "export const beta = (input: number) => {",
+         " return alpha(input);",
+         "};"
+       ].join("\n")
+     };
+     const result = buildChunksForFile({ file, config, tokenize });
+     expect(result.strategy).toBe("language_aware");
+     expect(result.fallback_reason).toBeUndefined();
+     expect(result.chunks.some((chunk) => chunk.snippet.includes("export const alpha = (input: number) => {"))).toBe(true);
+     expect(result.chunks.some((chunk) => chunk.snippet.includes("return second;"))).toBe(true);
+   });
+
+   it("keeps wrapped export-assigned functions as semantic boundaries", () => {
+     const config = {
+       strategy: "language_aware" as const,
+       fallback_strategy: "sliding" as const,
+       target_chunk_tokens: 60,
+       chunk_overlap_tokens: 12,
+       budget_tokenizer: "lightweight" as const,
+       boundary_strictness: "semantic_js_ts" as const,
+       max_chunks_per_file: 300,
+       parse_timeout_ms: 80,
+       enabled_languages: ["typescript"]
+     };
+     const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
+     const file = {
+       path: "src/export-assigned.ts",
+       language: "typescript",
+       content: [
+         "export default ((input: number) => {",
+         " return input + 1;",
+         "});"
+       ].join("\n")
+     };
+     const result = buildChunksForFile({ file, config, tokenize });
+     expect(result.strategy).toBe("language_aware");
+     expect(result.fallback_reason).toBeUndefined();
+     expect(result.chunks.some((chunk) => chunk.snippet.includes("export default ((input: number) => {"))).toBe(true);
+     expect(result.chunks.some((chunk) => chunk.snippet.includes("return input + 1;"))).toBe(true);
+   });
+
+   it("keeps wrapped variable-initialized function expressions as semantic boundaries", () => {
+     const config = {
+       strategy: "language_aware" as const,
+       fallback_strategy: "sliding" as const,
+       target_chunk_tokens: 60,
+       chunk_overlap_tokens: 12,
+       budget_tokenizer: "lightweight" as const,
+       boundary_strictness: "semantic_js_ts" as const,
+       max_chunks_per_file: 300,
+       parse_timeout_ms: 80,
+       enabled_languages: ["typescript"]
+     };
+     const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
+     const file = {
+       path: "src/variable-wrapped.ts",
+       language: "typescript",
+       content: [
+         "const build = ((input: number) => {",
+         " return input + 1;",
+         "}) as (input: number) => number;"
+       ].join("\n")
+     };
+     const result = buildChunksForFile({ file, config, tokenize });
+     expect(result.strategy).toBe("language_aware");
+     expect(result.fallback_reason).toBeUndefined();
+     expect(result.chunks.some((chunk) => chunk.snippet.includes("const build = ((input: number) => {"))).toBe(true);
+     expect(result.chunks.some((chunk) => chunk.snippet.includes("return input + 1;"))).toBe(true);
+   });
+
+   it("keeps export-assigned class expressions as semantic boundaries", () => {
+     const config = {
+       strategy: "language_aware" as const,
+       fallback_strategy: "sliding" as const,
+       target_chunk_tokens: 60,
+       chunk_overlap_tokens: 12,
+       budget_tokenizer: "lightweight" as const,
+       boundary_strictness: "semantic_js_ts" as const,
+       max_chunks_per_file: 300,
+       parse_timeout_ms: 80,
+       enabled_languages: ["typescript"]
+     };
+     const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
+     const file = {
+       path: "src/export-class.ts",
+       language: "typescript",
+       content: ["export default class Service {}"].join("\n")
+     };
+     const result = buildChunksForFile({ file, config, tokenize });
+     expect(result.strategy).toBe("language_aware");
+     expect(result.fallback_reason).toBeUndefined();
+     expect(result.chunks.some((chunk) => chunk.snippet.includes("export default class Service {}"))).toBe(true);
+   });
+
+   it("keeps object literal method blocks as semantic boundaries", () => {
+     const config = {
+       strategy: "language_aware" as const,
+       fallback_strategy: "sliding" as const,
+       target_chunk_tokens: 60,
+       chunk_overlap_tokens: 12,
+       budget_tokenizer: "lightweight" as const,
+       boundary_strictness: "semantic_js_ts" as const,
+       max_chunks_per_file: 300,
+       parse_timeout_ms: 80,
+       enabled_languages: ["typescript"]
+     };
+     const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
+     const file = {
+       path: "src/object-methods.ts",
+       language: "typescript",
+       content: [
+         "export const handlers = {",
+         " onReady() {",
+         " return 1;",
+         " }",
+         "};"
+       ].join("\n")
+     };
+     const result = buildChunksForFile({ file, config, tokenize });
+     expect(result.strategy).toBe("language_aware");
+     expect(result.fallback_reason).toBeUndefined();
+     expect(result.chunks.some((chunk) => chunk.snippet.includes("export const handlers = {"))).toBe(true);
+     expect(result.chunks.some((chunk) => chunk.snippet.includes("onReady()"))).toBe(true);
+   });
+
+   it("treats .mts and .cts as typescript for language-aware chunking", () => {
+     const config = {
+       strategy: "language_aware" as const,
+       fallback_strategy: "sliding" as const,
+       target_chunk_tokens: 80,
+       chunk_overlap_tokens: 16,
+       budget_tokenizer: "lightweight" as const,
+       boundary_strictness: "semantic_js_ts" as const,
+       max_chunks_per_file: 300,
+       parse_timeout_ms: 80,
+       enabled_languages: ["typescript", "javascript"]
+     };
+     const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
+     const content = ["export function alpha(input: number) {", " return input + 1;", "}"].join("\n");
+     const mts = buildChunksForFile({
+       file: {
+         path: "src/runtime.mts",
+         content
+       },
+       config,
+       tokenize
+     });
+     const cts = buildChunksForFile({
+       file: {
+         path: "src/runtime.cts",
+         content
+       },
+       config,
+       tokenize
+     });
+     expect(mts.strategy).toBe("language_aware");
+     expect(cts.strategy).toBe("language_aware");
+     expect(mts.fallback_reason).toBeUndefined();
+     expect(cts.fallback_reason).toBeUndefined();
+   });
+
+   it("uses parser-aware chunking for rust and java declarations without mid-body truncation", () => {
+     __resetChunkingParserStateForTests();
+     const parseSpy = vi.spyOn(Parser.prototype as { parse: (input: string) => unknown }, "parse");
+     parseSpy.mockImplementation((source: string) => mockBoundaryTreeFromSource(source));
+     try {
+       const config = {
+         strategy: "language_aware" as const,
+         fallback_strategy: "sliding" as const,
+         target_chunk_tokens: 64,
+         chunk_overlap_tokens: 12,
+         budget_tokenizer: "lightweight" as const,
+         boundary_strictness: "legacy" as const,
+         max_chunks_per_file: 300,
+         parse_timeout_ms: 80,
+         enabled_languages: ["rust", "java"]
+       };
+       const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
+
+       const rustResult = buildChunksForFile({
+         file: {
+           path: "src/counter.rs",
+           content: [
+             "pub struct Counter {",
+             " value: i32,",
+             "}",
+             "",
+             "impl Counter {",
+             " pub fn increment(&mut self, amount: i32) {",
+             " self.value += amount;",
+             " }",
+             "}",
+             "",
+             "pub fn compute_total(input: i32) -> i32 {",
+             " let doubled = input * 2;",
+             " doubled + 1",
+             "}"
+           ].join("\n")
+         },
+         config,
+         tokenize
+       });
+       expect(rustResult.strategy).toBe("language_aware");
+       expect(rustResult.fallback_reason).toBeUndefined();
+       expect(rustResult.language).toBe("rust");
+       const rustFunctionChunk = rustResult.chunks.find((chunk) => chunk.snippet.includes("pub fn compute_total(input: i32) -> i32 {"));
+       expect(rustFunctionChunk).toBeDefined();
+       expect(rustFunctionChunk?.snippet.includes("let doubled = input * 2;")).toBe(true);
+       expect(rustFunctionChunk?.snippet.includes("doubled + 1")).toBe(true);
+
+       const javaResult = buildChunksForFile({
+         file: {
+           path: "src/Calculator.java",
+           content: [
+             "public class Calculator {",
+             " public int sum(int left, int right) {",
+             " int total = left + right;",
+             " return total;",
+             " }",
+             "",
+             " public int multiply(int left, int right) {",
+             " return left * right;",
+             " }",
+             "}"
+           ].join("\n")
+         },
+         config,
+         tokenize
+       });
+       expect(javaResult.strategy).toBe("language_aware");
+       expect(javaResult.fallback_reason).toBeUndefined();
+       expect(javaResult.language).toBe("java");
+       const javaMethodChunk = javaResult.chunks.find((chunk) => chunk.snippet.includes("public int sum(int left, int right) {"));
+       expect(javaMethodChunk).toBeDefined();
+       expect(javaMethodChunk?.snippet.includes("int total = left + right;")).toBe(true);
+       expect(javaMethodChunk?.snippet.includes("return total;")).toBe(true);
+     } finally {
+       parseSpy.mockRestore();
+       __resetChunkingParserStateForTests();
+     }
+   });
+
+   it("resolves rust/java aliases and paths for parser-aware chunking", () => {
+     __resetChunkingParserStateForTests();
+     const parseSpy = vi.spyOn(Parser.prototype as { parse: (input: string) => unknown }, "parse");
+     parseSpy.mockImplementation((source: string) => mockBoundaryTreeFromSource(source));
+     try {
+       const config = {
+         strategy: "language_aware" as const,
+         fallback_strategy: "sliding" as const,
+         target_chunk_tokens: 80,
+         chunk_overlap_tokens: 16,
+         budget_tokenizer: "lightweight" as const,
+         boundary_strictness: "legacy" as const,
+         max_chunks_per_file: 300,
+         parse_timeout_ms: 80,
+         enabled_languages: ["rust", "java"]
+       };
+       const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
+       const rustContent = ["pub fn alpha(input: i32) -> i32 {", " input + 1", "}"].join("\n");
+       const javaContent = ["public class Demo {", " int alpha() {", " return 1;", " }", "}"].join("\n");
+
+       const rustAlias = buildChunksForFile({
+         file: {
+           path: "src/alpha.txt",
+           language: "rs",
+           content: rustContent
+         },
+         config,
+         tokenize
+       });
+       const rustPath = buildChunksForFile({
+         file: {
+           path: "src/alpha.rs",
+           content: rustContent
+         },
+         config,
+         tokenize
+       });
+       const javaPath = buildChunksForFile({
+         file: {
+           path: "src/Demo.java",
+           content: javaContent
+         },
+         config,
+         tokenize
+       });
+
+       expect(rustAlias.strategy).toBe("language_aware");
+       expect(rustPath.strategy).toBe("language_aware");
+       expect(javaPath.strategy).toBe("language_aware");
+       expect(rustAlias.fallback_reason).toBeUndefined();
+       expect(rustPath.fallback_reason).toBeUndefined();
+       expect(javaPath.fallback_reason).toBeUndefined();
+       expect(rustAlias.language).toBe("rust");
+       expect(rustPath.language).toBe("rust");
+       expect(javaPath.language).toBe("java");
+     } finally {
+       parseSpy.mockRestore();
+       __resetChunkingParserStateForTests();
+     }
+   });
+
+   it("falls back with parser_unavailable for rust/java parser load failures and caches deterministically", () => {
+     __resetChunkingParserStateForTests();
+     try {
+       __setChunkingParserLanguageLoaderForTests("rust", () => {
+         throw new Error("forced rust parser load failure");
+       });
+       __setChunkingParserLanguageLoaderForTests("java", () => {
+         throw new Error("forced java parser load failure");
+       });
+
+       const config = {
+         strategy: "language_aware" as const,
+         fallback_strategy: "sliding" as const,
+         target_chunk_tokens: 220,
+         chunk_overlap_tokens: 40,
+         budget_tokenizer: "ranking" as const,
+         boundary_strictness: "legacy" as const,
+         max_chunks_per_file: 300,
+         parse_timeout_ms: 80,
+         enabled_languages: ["rust", "java"]
+       };
+       const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
+
+       const rustFile = {
+         path: "src/runtime.rs",
+         language: "rust",
+         content: ["pub fn alpha() -> i32 {", " 1", "}"].join("\n")
+       };
+       const javaFile = {
+         path: "src/Runtime.java",
+         language: "java",
+         content: ["public class Runtime {", " int alpha() {", " return 1;", " }", "}"].join("\n")
+       };
+
+       const rustFirst = buildChunksForFile({ file: rustFile, config, tokenize });
+       const rustSecond = buildChunksForFile({ file: rustFile, config, tokenize });
+       const javaFirst = buildChunksForFile({ file: javaFile, config, tokenize });
+       const javaSecond = buildChunksForFile({ file: javaFile, config, tokenize });
+
+       expect(rustFirst.fallback_reason).toBe("parser_unavailable");
+       expect(rustSecond.fallback_reason).toBe("parser_unavailable");
+       expect(javaFirst.fallback_reason).toBe("parser_unavailable");
+       expect(javaSecond.fallback_reason).toBe("parser_unavailable");
+       expect(rustFirst.language).toBe("rust");
+       expect(javaFirst.language).toBe("java");
+
+       const attempts = __getChunkingParserInitAttemptsForTests();
+       expect(attempts.rust).toBe(1);
+       expect(attempts.java).toBe(1);
+
+       const snapshot = getChunkingParserAvailabilitySnapshot({
+         enabled_languages: ["rust", "java"]
+       });
+       expect(snapshot.some((entry) => entry.language === "rust" && entry.status === "unavailable")).toBe(true);
+       expect(snapshot.some((entry) => entry.language === "java" && entry.status === "unavailable")).toBe(true);
+     } finally {
+       __resetChunkingParserStateForTests();
+     }
+   });
+
+   it("falls back with parse_error for rust/java when parser throws during parse", () => {
+     __resetChunkingParserStateForTests();
+     const parseSpy = vi.spyOn(Parser.prototype as { parse: (input: string) => unknown }, "parse");
+     parseSpy.mockImplementation(() => {
+       throw new Error("forced parse failure");
+     });
+     try {
+       const config = {
+         strategy: "language_aware" as const,
+         fallback_strategy: "sliding" as const,
+         target_chunk_tokens: 220,
+         chunk_overlap_tokens: 40,
+         budget_tokenizer: "ranking" as const,
+         boundary_strictness: "legacy" as const,
+         max_chunks_per_file: 300,
+         parse_timeout_ms: 80,
+         enabled_languages: ["rust", "java"]
+       };
+       const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
+       const rustResult = buildChunksForFile({
+         file: {
+           path: "src/parse-error.rs",
+           language: "rust",
+           content: ["pub fn alpha() -> i32 {", " 1", "}"].join("\n")
+         },
+         config,
+         tokenize
+       });
+       const javaResult = buildChunksForFile({
+         file: {
+           path: "src/ParseError.java",
+           language: "java",
+           content: ["public class ParseError {", " int alpha() {", " return 1;", " }", "}"].join("\n")
+         },
+         config,
+         tokenize
+       });
+
+       expect(rustResult.strategy).toBe("sliding");
+       expect(javaResult.strategy).toBe("sliding");
+       expect(rustResult.fallback_reason).toBe("parse_error");
+       expect(javaResult.fallback_reason).toBe("parse_error");
+       expect(rustResult.language).toBe("rust");
+       expect(javaResult.language).toBe("java");
+     } finally {
+       parseSpy.mockRestore();
+       __resetChunkingParserStateForTests();
+     }
+   });
+
+   it("builds recursive semantic chunks with stable declaration boundaries", () => {
+     const config = {
+       strategy: "language_aware" as const,
+       fallback_strategy: "sliding" as const,
+       target_chunk_tokens: 28,
+       chunk_overlap_tokens: 6,
+       budget_tokenizer: "lightweight" as const,
+       boundary_strictness: "semantic_js_ts" as const,
+       max_chunks_per_file: 300,
+       parse_timeout_ms: 80,
+       enabled_languages: ["typescript"],
+       recursive_semantic_chunking_enabled: true,
+       semantic_merge_gap_lines: 1,
+       semantic_merge_max_span_lines: 120
+     };
+     const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
+     const file = {
+       path: "src/recursive-runtime.ts",
+       language: "typescript",
+       content: [
+         "export function alpha(input: number): number {",
+         " const a = input + 1;",
+         " return a;",
+         "}",
+         "",
+         "export function beta(input: number): number {",
+         " const b = alpha(input) * 2;",
+         " return b;",
+         "}",
+         "",
+         "export function gamma(input: number): number {",
+         " const c = beta(input) + 3;",
+         " return c;",
+         "}"
+       ].join("\n")
+     };
+     const result = buildChunksForFile({ file, config, tokenize });
+     expect(result.strategy).toBe("language_aware");
+     expect(result.fallback_reason).toBeUndefined();
+     expect(result.recursive_semantic_chunking_used).toBe(true);
+     expect(result.chunks.some((chunk) => chunk.snippet.includes("export function alpha"))).toBe(true);
+     expect(result.chunks.some((chunk) => chunk.snippet.includes("export function beta"))).toBe(true);
+     expect(result.chunks.some((chunk) => chunk.snippet.includes("export function gamma"))).toBe(true);
+     expect(result.chunks.every((chunk) => chunk.start_line >= 1 && chunk.end_line >= chunk.start_line)).toBe(true);
+   });
+
+   it("forward-absorbs comment windows into following code windows when enabled", () => {
+     const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
+     const baseConfig = {
+       strategy: "language_aware" as const,
+       fallback_strategy: "sliding" as const,
+       target_chunk_tokens: 30,
+       chunk_overlap_tokens: 4,
+       budget_tokenizer: "lightweight" as const,
+       boundary_strictness: "semantic_js_ts" as const,
+       max_chunks_per_file: 300,
+       parse_timeout_ms: 80,
+       enabled_languages: ["typescript"],
+       recursive_semantic_chunking_enabled: true,
+       semantic_merge_gap_lines: 0,
+       semantic_merge_max_span_lines: 120
+     };
+     const file = {
+       path: "src/comment-absorb.ts",
+       language: "typescript",
+       content: [
+         "// Explanation for alpha",
+         "// Keep this doc attached to alpha",
+         "",
+         "export function alpha(input: number): number {",
+         " return input + 1;",
+         "}",
+         "",
+         "export function beta(input: number): number {",
+         " return alpha(input);",
+         "}"
+       ].join("\n")
+     };
+
+     const withAbsorb = buildChunksForFile({
+       file,
+       config: {
+         ...baseConfig,
+         comment_forward_absorb_enabled: true
+       },
+       tokenize
+     });
+     const withoutAbsorb = buildChunksForFile({
+       file,
+       config: {
+         ...baseConfig,
+         comment_forward_absorb_enabled: false
+       },
+       tokenize
+     });
+
+     expect(withAbsorb.chunks.some((chunk) => chunk.snippet.includes("Explanation for alpha") && chunk.snippet.includes("export function alpha"))).toBe(
+       true
+     );
+     expect(
+       withoutAbsorb.chunks.some(
+         (chunk) => chunk.snippet.includes("Explanation for alpha") && !chunk.snippet.includes("export function alpha")
+       )
+     ).toBe(true);
+   });
+
+   it("emits parser availability snapshot and avoids repeated parser init attempts", async () => {
+     __resetChunkingParserStateForTests();
+     try {
+       const store = new InMemoryIndexStore();
+       const observability = getObservability(`retrieval-core-js-py-${Date.now()}`);
+       const core = new RetrievalCore(store, new InMemoryQueryCache(), {
+         observability,
+         chunkingConfig: {
+           strategy: "language_aware",
+           enabled_languages: ["typescript", "javascript", "python", "go", "rust", "java"]
+         }
+       });
+
+       await indexAndListChunks({
+         core,
+         store,
+         tenant_id: "tenant-d",
+         workspace_id: "ws-d",
+         index_version: "idx-d1",
+         file: {
+           path: "src/runtime.js",
+           language: "javascript",
+           content: ["export function alpha() {", " return 1;", "}"].join("\n")
+         }
+       });
+
+       await indexAndListChunks({
+         core,
+         store,
+         tenant_id: "tenant-d",
+         workspace_id: "ws-d",
+         index_version: "idx-d2",
+         file: {
+           path: "src/runtime.py",
+           language: "python",
+           content: ["def alpha():", " return 1"].join("\n")
+         }
+       });
+
+       await indexAndListChunks({
+         core,
+         store,
+         tenant_id: "tenant-d",
+         workspace_id: "ws-d",
+         index_version: "idx-d3",
+         file: {
+           path: "src/runtime.rs",
+           language: "rust",
+           content: ["pub fn alpha() -> i32 {", " 1", "}"].join("\n")
+         }
+       });
+
+       await indexAndListChunks({
+         core,
+         store,
+         tenant_id: "tenant-d",
+         workspace_id: "ws-d",
+         index_version: "idx-d4",
+         file: {
+           path: "src/Runtime.java",
+           language: "java",
+           content: ["public class Runtime {", " int alpha() {", " return 1;", " }", "}"].join("\n")
+         }
+       });
+
+       const attempts = __getChunkingParserInitAttemptsForTests();
+       expect((attempts.javascript ?? 0) <= 1).toBe(true);
+       expect((attempts.python ?? 0) <= 1).toBe(true);
+       expect((attempts.rust ?? 0) <= 1).toBe(true);
+       expect((attempts.java ?? 0) <= 1).toBe(true);
+
+       const strategyCounters = observability.metrics.readCounter("index_chunking_strategy_total");
+       expect(
+         strategyCounters.some((counter) => counter.labels.language === "javascript")
+       ).toBe(true);
+       expect(strategyCounters.some((counter) => counter.labels.language === "python")).toBe(true);
+       expect(strategyCounters.some((counter) => counter.labels.language === "rust")).toBe(true);
+       expect(strategyCounters.some((counter) => counter.labels.language === "java")).toBe(true);
+
+       const snapshot = getChunkingParserAvailabilitySnapshot({
+         enabled_languages: ["typescript", "javascript", "python", "go", "rust", "java"]
+       });
+       expect(snapshot.some((entry) => entry.language === "javascript" && entry.status === "available")).toBe(true);
+       expect(snapshot.some((entry) => entry.language === "python" && entry.status === "available")).toBe(true);
+       expect(snapshot.some((entry) => entry.language === "rust" && entry.status === "available")).toBe(true);
+       expect(snapshot.some((entry) => entry.language === "java" && entry.status === "available")).toBe(true);
+
+       const availabilityGauges = observability.metrics.readGauge("index_chunking_parser_availability");
+       expect(
+         availabilityGauges.some((point) => point.labels.language === "javascript" && point.labels.status === "available")
+       ).toBe(true);
+       expect(availabilityGauges.some((point) => point.labels.language === "python" && point.labels.status === "available")).toBe(
+         true
+       );
+       expect(availabilityGauges.some((point) => point.labels.language === "rust" && point.labels.status === "available")).toBe(
+         true
+       );
+       expect(availabilityGauges.some((point) => point.labels.language === "java" && point.labels.status === "available")).toBe(
+         true
+       );
+
+       const fallbackCounters = observability.metrics.readCounter("index_chunking_fallback_total");
+       expect(
+         fallbackCounters.some(
+           (counter) =>
+             counter.labels.reason === "parser_unavailable" &&
+             (counter.labels.language === "javascript" ||
+               counter.labels.language === "python" ||
+               counter.labels.language === "rust" ||
+               counter.labels.language === "java")
+         )
+       ).toBe(false);
+     } finally {
+       __resetChunkingParserStateForTests();
+     }
+   });
+
+   it("caches parser unavailability and avoids repeated parser init attempts", () => {
+     __resetChunkingParserStateForTests();
+     try {
+       __setChunkingParserLanguageLoaderForTests("python", () => {
+         throw new Error("forced parser load failure");
+       });
+
+       const config = {
+         strategy: "language_aware" as const,
+         fallback_strategy: "sliding" as const,
+         target_chunk_tokens: 220,
+         chunk_overlap_tokens: 40,
+         budget_tokenizer: "ranking" as const,
+         boundary_strictness: "legacy" as const,
+         max_chunks_per_file: 300,
+         parse_timeout_ms: 80,
+         enabled_languages: ["python"]
+       };
+       const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
+       const file = {
+         path: "src/runtime.py",
+         language: "python",
+         content: ["def alpha():", " return 1"].join("\n")
+       };
+
+       const first = buildChunksForFile({ file, config, tokenize });
+       const second = buildChunksForFile({ file, config, tokenize });
+       const third = buildChunksForFile({ file, config, tokenize });
+
+       expect(first.fallback_reason).toBe("parser_unavailable");
+       expect(second.fallback_reason).toBe("parser_unavailable");
+       expect(third.fallback_reason).toBe("parser_unavailable");
+
+       const attempts = __getChunkingParserInitAttemptsForTests();
+       expect(attempts.python).toBe(1);
+
+       const snapshot = getChunkingParserAvailabilitySnapshot({
+         enabled_languages: ["python", "typescript"]
+       });
+       expect(snapshot.some((entry) => entry.language === "python" && entry.status === "unavailable")).toBe(true);
+       expect(snapshot.some((entry) => entry.language === "typescript")).toBe(true);
+     } finally {
+       __resetChunkingParserStateForTests();
+     }
+   });
+ });