@rce-mcp/retrieval-core 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +7 -0
- package/dist/.tsbuildinfo +1 -1
- package/dist/chunking.d.ts +13 -0
- package/dist/chunking.js +488 -77
- package/dist/index.d.ts +61 -0
- package/dist/index.js +993 -20
- package/dist/remote-sync.js +2 -1
- package/package.json +2 -2
- package/scripts/poc-parser-availability-benchmark.ts +2 -0
- package/src/chunking.ts +573 -80
- package/src/index.ts +1247 -20
- package/src/remote-sync.ts +3 -1
- package/test/benchmark.thresholds.test.ts +8 -0
- package/test/chunking.config.test.ts +47 -1
- package/test/chunking.language-aware.test.ts +227 -0
- package/test/embedding-context-prefix.test.ts +101 -0
- package/test/enhance-confidence.test.ts +4 -4
- package/test/mcp-search-quality.regression.test.ts +691 -4
- package/test/remote-sync.integration.test.ts +5 -1
- package/test/smart-cutoff.config.test.ts +86 -0
- package/test/snippet-integrity.config.test.ts +59 -0
|
@@ -2,7 +2,8 @@ import { mkdtemp, rm } from "node:fs/promises";
|
|
|
2
2
|
import { tmpdir } from "node:os";
|
|
3
3
|
import { join } from "node:path";
|
|
4
4
|
import { afterEach, describe, expect, it } from "vitest";
|
|
5
|
-
import {
|
|
5
|
+
import { SearchContextOutputSchema } from "@rce-mcp/contracts";
|
|
6
|
+
import { InMemoryQueryCache, SqliteIndexRepository, SqliteQueryCache } from "@rce-mcp/data-plane";
|
|
6
7
|
import { RetrievalCore } from "../src/index.js";
|
|
7
8
|
|
|
8
9
|
function firstRank(results: Array<{ path: string }>, path: string): number {
|
|
@@ -71,6 +72,46 @@ function buildLongCircuitBreakerFixture(): string {
|
|
|
71
72
|
return lines.join("\n");
|
|
72
73
|
}
|
|
73
74
|
|
|
75
|
+
/**
 * Builds the TypeScript source text used as the "hotspot" fixture file.
 *
 * The text declares `overlapMergeHotspot`, whose body is a run of
 * near-identical accumulation lines that each call `overlapMergeSignal` and
 * carry the trailing comment "overlap chunk merge hotspot coverage", followed
 * by the `overlapMergeSignal` helper itself.
 *
 * @param stepCount Number of accumulation lines to emit inside the function
 *   body. Defaults to 40, the size this fixture has always had.
 * @returns The fixture source with lines joined by "\n" (no trailing newline).
 */
function buildOverlapHotspotFixture(stepCount = 40): string {
  // One line per step; the step index is the only thing that varies.
  const accumulationLines = Array.from(
    { length: stepCount },
    (_, step) => ` total += overlapMergeSignal(seed, ${step}); // overlap chunk merge hotspot coverage`
  );

  return [
    "export function overlapMergeHotspot(seed: number): number {",
    " let total = seed;",
    ...accumulationLines,
    " return total;",
    "}",
    "",
    "function overlapMergeSignal(seed: number, step: number): number {",
    " return seed + step;",
    "}"
  ].join("\n");
}
|
|
93
|
+
|
|
94
|
+
/**
 * Builds the TypeScript source text used as the large "overlap pressure"
 * fixture file.
 *
 * The text declares `overlapPressureSignal`, whose body is a long run of
 * near-identical accumulation lines that each call `overlapPressureStep` and
 * carry the trailing comment "overlap pressure signal token cluster for chunk
 * overlap diagnostics", followed by the `overlapPressureStep` helper itself.
 *
 * @param stepCount Number of accumulation lines to emit inside the function
 *   body. Defaults to 220, the size this fixture has always had.
 * @returns The fixture source with lines joined by "\n" (no trailing newline).
 */
function buildLargeOverlapPressureFixture(stepCount = 220): string {
  // One line per step; the step index is the only thing that varies.
  const accumulationLines = Array.from(
    { length: stepCount },
    (_, step) =>
      ` total += overlapPressureStep(seed, ${step}); // overlap pressure signal token cluster for chunk overlap diagnostics`
  );

  return [
    "export function overlapPressureSignal(seed: number): number {",
    " let total = seed;",
    ...accumulationLines,
    " return total;",
    "}",
    "",
    "function overlapPressureStep(seed: number, step: number): number {",
    " return seed + step;",
    "}"
  ].join("\n");
}
|
|
114
|
+
|
|
74
115
|
describe("mcp search quality regressions", () => {
|
|
75
116
|
const dirs: string[] = [];
|
|
76
117
|
|
|
@@ -213,7 +254,7 @@ describe("mcp search quality regressions", () => {
|
|
|
213
254
|
}
|
|
214
255
|
});
|
|
215
256
|
|
|
216
|
-
it("
|
|
257
|
+
it("maintains or improves snippet completeness with upgraded chunk windows", async () => {
|
|
217
258
|
const root = await mkdtemp(join(tmpdir(), "rce-mcp-snippet-quality-"));
|
|
218
259
|
dirs.push(root);
|
|
219
260
|
const sqlitePath = join(root, "mcp-snippet-quality.sqlite");
|
|
@@ -306,8 +347,8 @@ describe("mcp search quality regressions", () => {
|
|
|
306
347
|
|
|
307
348
|
const legacySpan = (legacyTop?.end_line ?? 0) - (legacyTop?.start_line ?? 0);
|
|
308
349
|
const upgradedSpan = (upgradedTop?.end_line ?? 0) - (upgradedTop?.start_line ?? 0);
|
|
309
|
-
expect(upgradedSpan).
|
|
310
|
-
expect((upgradedTop?.snippet.length ?? 0)).
|
|
350
|
+
expect(upgradedSpan).toBeGreaterThanOrEqual(legacySpan);
|
|
351
|
+
expect((upgradedTop?.snippet.length ?? 0)).toBeGreaterThanOrEqual(legacyTop?.snippet.length ?? 0);
|
|
311
352
|
|
|
312
353
|
const tokenCoverage = (snippet: string | undefined): number =>
|
|
313
354
|
["circuitActivationSeed", "tripAuditDigest", "freezeOrderEntry"].filter((token) => snippet?.includes(token))
|
|
@@ -319,4 +360,650 @@ describe("mcp search quality regressions", () => {
|
|
|
319
360
|
repo.close();
|
|
320
361
|
}
|
|
321
362
|
});
|
|
363
|
+
|
|
364
|
+
it("merges overlapping top candidates to recover distinct file coverage", async () => {
  // Identifiers shared by the workspace record, the index write, and both searches.
  const tenantId = "tenant-overlap";
  const workspaceId = "ws-overlap";
  const projectRoot = "/workspace/overlap";

  const tempDir = await mkdtemp(join(tmpdir(), "rce-overlap-merge-quality-"));
  dirs.push(tempDir); // tracked in the suite-level `dirs` list

  const repository = new SqliteIndexRepository(join(tempDir, "overlap-merge-quality.sqlite"));
  await repository.migrate();
  await repository.upsertWorkspace({
    workspace_id: workspaceId,
    tenant_id: tenantId,
    name: "overlap-merge",
    project_root_path: projectRoot
  });

  // Both readers score on the lexical signal only; they differ solely in the merge flag.
  const lexicalOnlyScoring = (mergeEnabled: boolean) => ({
    candidate_weights: {
      lexical_weight: 1,
      vector_weight: 0,
      path_match_boost: 0,
      recency_boost: 0,
      generated_penalty: 0
    },
    rerank: {
      merge_overlapping_chunks_enabled: mergeEnabled
    }
  });

  // The indexing instance uses sliding chunks of 70 tokens with a 60-token overlap.
  const indexer = new RetrievalCore(repository, new InMemoryQueryCache(), {
    chunkingConfig: {
      strategy: "sliding",
      target_chunk_tokens: 70,
      chunk_overlap_tokens: 60
    }
  });
  const mergeDisabledCore = new RetrievalCore(repository, new InMemoryQueryCache(), {
    scoringConfig: lexicalOnlyScoring(false)
  });
  const mergeEnabledCore = new RetrievalCore(repository, new InMemoryQueryCache(), {
    scoringConfig: lexicalOnlyScoring(true)
  });

  try {
    await indexer.indexArtifact({
      tenant_id: tenantId,
      workspace_id: workspaceId,
      index_version: "idx-overlap-v1",
      files: [
        {
          path: "src/hotspot.ts",
          language: "typescript",
          content: buildOverlapHotspotFixture()
        },
        {
          path: "src/secondary.ts",
          language: "typescript",
          content:
            "export function overlapCoverageFallback(): string { return 'overlap merge hotspot coverage fallback target overlap chunk'; }"
        },
        {
          path: "src/noise.ts",
          language: "typescript",
          content: "export const NOISE = 'unrelated';"
        }
      ]
    });

    const query = "overlapMergeSignal overlap chunk merge hotspot coverage fallback target";
    // Same request for both cores; only the trace id differs.
    const runSearch = (core: RetrievalCore, traceId: string) =>
      core.searchContext({
        trace_id: traceId,
        tenant_id: tenantId,
        workspace_id: workspaceId,
        request: {
          project_root_path: projectRoot,
          query,
          top_k: 2
        }
      });

    const baseline = await runSearch(mergeDisabledCore, "trc-overlap-no-merge");
    const merged = await runSearch(mergeEnabledCore, "trc-overlap-merge");

    // Without merging, both top slots go to the hotspot file and their line ranges intersect.
    const [baselineFirst, baselineSecond] = baseline.results;
    expect(baselineFirst?.path).toBe("src/hotspot.ts");
    expect(baselineSecond?.path).toBe("src/hotspot.ts");
    expect(baselineFirst).toBeDefined();
    expect(baselineSecond).toBeDefined();
    const sharedStart = Math.max(baselineFirst?.start_line ?? 0, baselineSecond?.start_line ?? 0);
    const sharedEnd = Math.min(baselineFirst?.end_line ?? 0, baselineSecond?.end_line ?? 0);
    expect(sharedStart).toBeLessThanOrEqual(sharedEnd);

    // With merging, the hotspot stays on top and the secondary file is also returned.
    const distinctPathCount = (hits: Array<{ path: string }>): number =>
      new Set(hits.map((hit) => hit.path)).size;

    expect(merged.results[0]?.path).toBe("src/hotspot.ts");
    expect(merged.results.some((hit) => hit.path === "src/secondary.ts")).toBe(true);
    expect(distinctPathCount(merged.results)).toBeGreaterThan(distinctPathCount(baseline.results));

    const mergedHotspot = merged.results.find((hit) => hit.path === "src/hotspot.ts");
    expect((mergedHotspot?.end_line ?? 0) - (mergedHotspot?.start_line ?? 0)).toBeGreaterThan(40);
  } finally {
    repository.close();
  }
});
|
|
485
|
+
|
|
486
|
+
it("avoids heavy same-file overlap even when merge span cap prevents direct consolidation", async () => {
  // Identifiers shared by the workspace record, the index write, and the search.
  const tenantId = "tenant-overlap-selection";
  const workspaceId = "ws-overlap-selection";
  const projectRoot = "/workspace/overlap-selection";

  const tempDir = await mkdtemp(join(tmpdir(), "rce-overlap-selection-quality-"));
  dirs.push(tempDir); // tracked in the suite-level `dirs` list

  const repository = new SqliteIndexRepository(join(tempDir, "overlap-selection-quality.sqlite"));
  await repository.migrate();
  await repository.upsertWorkspace({
    workspace_id: workspaceId,
    tenant_id: tenantId,
    name: "overlap-selection",
    project_root_path: projectRoot
  });

  // The indexing instance uses sliding chunks of 70 tokens with a 60-token overlap.
  const indexer = new RetrievalCore(repository, new InMemoryQueryCache(), {
    chunkingConfig: {
      strategy: "sliding",
      target_chunk_tokens: 70,
      chunk_overlap_tokens: 60
    }
  });
  // Merging is enabled, but the merged span is capped at 8 lines.
  const searcher = new RetrievalCore(repository, new InMemoryQueryCache(), {
    scoringConfig: {
      candidate_weights: {
        lexical_weight: 1,
        vector_weight: 0,
        path_match_boost: 0,
        recency_boost: 0,
        generated_penalty: 0
      },
      rerank: {
        merge_overlapping_chunks_enabled: true,
        merge_max_span_lines: 8
      }
    }
  });

  try {
    await indexer.indexArtifact({
      tenant_id: tenantId,
      workspace_id: workspaceId,
      index_version: "idx-overlap-selection-v1",
      files: [
        {
          path: "src/pressure.ts",
          language: "typescript",
          content: buildLargeOverlapPressureFixture()
        },
        {
          path: "src/fallback.ts",
          language: "typescript",
          content:
            "export function overlapPressureFallback(): string { return 'overlap pressure signal token cluster fallback target'; }"
        },
        {
          path: "src/extra.ts",
          language: "typescript",
          content:
            "export function overlapPressureExtra(): string { return 'overlap pressure signal token cluster extra coverage'; }"
        }
      ]
    });

    const retrieval = await searcher.searchContext({
      trace_id: "trc-overlap-selection",
      tenant_id: tenantId,
      workspace_id: workspaceId,
      request: {
        project_root_path: projectRoot,
        query: "overlap pressure signal token cluster fallback target",
        top_k: 3
      }
    });

    expect(retrieval.results.some((hit) => hit.path === "src/fallback.ts")).toBe(true);

    // Group the returned line ranges by file, preserving result order.
    const spansByPath = new Map<string, Array<{ start: number; end: number }>>();
    for (const { path, start_line, end_line } of retrieval.results) {
      const spans = spansByPath.get(path) ?? [];
      spans.push({ start: start_line, end: end_line });
      spansByPath.set(path, spans);
    }

    // Every pair of ranges from the same file must share less than 20% of the shorter range.
    for (const spans of spansByPath.values()) {
      spans.forEach((left, leftIndex) => {
        for (const right of spans.slice(leftIndex + 1)) {
          const sharedStart = Math.max(left.start, right.start);
          const sharedEnd = Math.min(left.end, right.end);
          const sharedLines = sharedEnd >= sharedStart ? sharedEnd - sharedStart + 1 : 0;
          const shorterSpan = Math.min(
            Math.max(1, left.end - left.start + 1),
            Math.max(1, right.end - right.start + 1)
          );
          expect(sharedLines / shorterSpan).toBeLessThan(0.2);
        }
      });
    }
  } finally {
    repository.close();
  }
});
|
|
599
|
+
|
|
600
|
+
it("packs same-file contextual spans with elision markers when context packing is enabled", async () => {
  // Identifiers shared by the workspace record, the index write, and the search.
  const tenantId = "tenant-context-pack";
  const workspaceId = "ws-context-pack";
  const projectRoot = "/workspace/context-pack";
  const estimatorPath = "packages/oracle/src/estimator.ts";

  const tempDir = await mkdtemp(join(tmpdir(), "rce-context-packing-quality-"));
  dirs.push(tempDir); // tracked in the suite-level `dirs` list

  const repository = new SqliteIndexRepository(join(tempDir, "context-packing-quality.sqlite"));
  await repository.migrate();
  await repository.upsertWorkspace({
    workspace_id: workspaceId,
    tenant_id: tenantId,
    name: "context-pack",
    project_root_path: projectRoot
  });

  // Small sliding chunks, overlap merging off, context packing on.
  const retrievalCore = new RetrievalCore(repository, new InMemoryQueryCache(), {
    chunkingConfig: {
      strategy: "sliding",
      target_chunk_tokens: 35,
      chunk_overlap_tokens: 8
    },
    scoringConfig: {
      rerank: {
        merge_overlapping_chunks_enabled: false
      }
    },
    contextPackingConfig: {
      enabled: true,
      max_spans_per_result: 2,
      max_gap_lines: 90,
      max_snippet_chars: 3_200,
      enhancer_snippet_char_limit: 2_200
    }
  });

  try {
    // Two queried functions separated by 60 filler declarations.
    const fillerLines = Array.from({ length: 60 }, (_, idx) => `const bridgeNoise${idx} = ${idx};`);
    const estimatorSource = [
      "export function computeProbability(marketSignal: number): number {",
      " const base = marketSignal * 0.7;",
      " const normalized = Math.max(0, Math.min(1, base));",
      " return normalized;",
      "}",
      "",
      ...fillerLines,
      "",
      "export function fetchLlmInputs(llmInputFlow: string[]): string[] {",
      " return llmInputFlow.filter(Boolean);",
      "}"
    ].join("\n");

    await retrievalCore.indexArtifact({
      tenant_id: tenantId,
      workspace_id: workspaceId,
      index_version: "idx-context-pack-v1",
      files: [
        {
          path: estimatorPath,
          language: "typescript",
          content: estimatorSource
        }
      ]
    });

    const retrieval = await retrievalCore.searchContext({
      trace_id: "trc-context-pack",
      tenant_id: tenantId,
      workspace_id: workspaceId,
      request: {
        project_root_path: projectRoot,
        query: "computeProbability marketSignal fetchLlmInputs llmInputFlow",
        top_k: 1
      }
    });

    // The single result must show both functions, an elision marker, and a widened line range.
    const [top] = retrieval.results;
    expect(top).toBeDefined();
    expect(top?.path).toBe(estimatorPath);
    expect(top?.snippet.includes("computeProbability")).toBe(true);
    expect(top?.snippet.includes("fetchLlmInputs")).toBe(true);
    expect(top?.snippet.includes("...")).toBe(true);
    expect(top?.reason.includes("contextual spans")).toBe(true);
    expect((top?.end_line ?? 0) - (top?.start_line ?? 0)).toBeGreaterThan(20);
  } finally {
    repository.close();
  }
});
|
|
685
|
+
|
|
686
|
+
it("adds deterministic truncation metadata marker for broken TS function boundaries when enabled", async () => {
  // Identifiers shared by the workspace record, the index write, and the search.
  const tenantId = "tenant-snippet-integrity";
  const workspaceId = "ws-snippet-integrity";
  const projectRoot = "/workspace/snippet-integrity";

  const tempDir = await mkdtemp(join(tmpdir(), "rce-snippet-integrity-quality-"));
  dirs.push(tempDir); // tracked in the suite-level `dirs` list

  const repository = new SqliteIndexRepository(join(tempDir, "snippet-integrity-quality.sqlite"));
  await repository.migrate();
  await repository.upsertWorkspace({
    workspace_id: workspaceId,
    tenant_id: tenantId,
    name: "snippet-integrity",
    project_root_path: projectRoot
  });

  // Small sliding chunks, overlap merging off, snippet integrity on for TypeScript (no repair options set).
  const retrievalCore = new RetrievalCore(repository, new InMemoryQueryCache(), {
    chunkingConfig: {
      strategy: "sliding",
      target_chunk_tokens: 35,
      chunk_overlap_tokens: 8
    },
    scoringConfig: {
      rerank: {
        merge_overlapping_chunks_enabled: false
      }
    },
    snippetIntegrityConfig: {
      enabled: true,
      target_languages: ["typescript"],
      max_contiguous_gap_lines: 6,
      marker_template_version: "v1"
    }
  });

  try {
    // A 500-statement function body, followed by a short second function.
    const checkpointLines = Array.from(
      { length: 500 },
      (_, idx) => ` const checkpoint_${idx} = input + ${idx}; // estimator continuity signal`
    );
    const estimatorSource = [
      "export function estimateProbability(input: number): number {",
      ...checkpointLines,
      " return input;",
      "}",
      "",
      "export function calibrateProbability(input: number): number {",
      " return estimateProbability(input + 1);",
      "}"
    ].join("\n");

    await retrievalCore.indexArtifact({
      tenant_id: tenantId,
      workspace_id: workspaceId,
      index_version: "idx-snippet-integrity-v1",
      files: [
        {
          path: "src/estimator.ts",
          language: "typescript",
          content: estimatorSource
        }
      ]
    });

    const retrieval = await retrievalCore.searchContext({
      trace_id: "trc-snippet-integrity",
      tenant_id: tenantId,
      workspace_id: workspaceId,
      request: {
        project_root_path: projectRoot,
        query: "estimateProbability checkpoint_20 checkpoint_430",
        top_k: 1
      }
    });

    // The annotated response must still parse against the published output schema.
    expect(() => SearchContextOutputSchema.parse(retrieval)).not.toThrow();

    const [top] = retrieval.results;
    expect(top).toBeDefined();
    expect(top?.path).toBe("src/estimator.ts");

    // Find the marker line and check the fields it carries.
    const snippetLines = top?.snippet.split("\n");
    const markerLine = snippetLines?.find((line) => line.includes("[truncated:v1"));
    expect(markerLine).toBeDefined();
    expect(markerLine).toMatch(/symbol=(?!unknown)[A-Za-z_$][\w$]*/);
    expect(markerLine).toContain("estimated_total_lines=");
    expect(markerLine).toContain("through_line=");

    const omittedAfterDigits = markerLine?.match(/omitted_after=(\d+)/)?.[1] ?? "0";
    expect(Number.parseInt(omittedAfterDigits, 10)).toBeGreaterThan(0);
  } finally {
    repository.close();
  }
});
|
|
773
|
+
|
|
774
|
+
it("repairs contiguous TS snippets before annotation when repair is enabled", async () => {
  // Identifiers shared by the workspace record, the index write, and the search.
  const tenantId = "tenant-snippet-repair";
  const workspaceId = "ws-snippet-repair";
  const projectRoot = "/workspace/snippet-repair";

  const tempDir = await mkdtemp(join(tmpdir(), "rce-snippet-repair-quality-"));
  dirs.push(tempDir); // tracked in the suite-level `dirs` list

  const repository = new SqliteIndexRepository(join(tempDir, "snippet-repair-quality.sqlite"));
  await repository.migrate();
  await repository.upsertWorkspace({
    workspace_id: workspaceId,
    tenant_id: tenantId,
    name: "snippet-repair",
    project_root_path: projectRoot
  });

  // Repair is enabled with a 260-line envelope and an 8,000-character snippet cap.
  const retrievalCore = new RetrievalCore(repository, new InMemoryQueryCache(), {
    chunkingConfig: {
      strategy: "sliding",
      target_chunk_tokens: 35,
      chunk_overlap_tokens: 8
    },
    scoringConfig: {
      rerank: {
        merge_overlapping_chunks_enabled: false
      }
    },
    snippetIntegrityConfig: {
      enabled: true,
      target_languages: ["typescript"],
      max_contiguous_gap_lines: 6,
      marker_template_version: "v1",
      repair_enabled: true,
      repair_max_envelope_lines: 260,
      repair_max_snippet_chars: 8_000
    }
  });

  try {
    // A single function with a 120-statement body.
    const checkpointLines = Array.from(
      { length: 120 },
      (_, idx) => ` const checkpoint_${idx} = input + ${idx}; // repair continuity signal`
    );
    const repairSource = [
      "export function estimateRepair(input: number): number {",
      ...checkpointLines,
      " return input;",
      "}"
    ].join("\n");

    await retrievalCore.indexArtifact({
      tenant_id: tenantId,
      workspace_id: workspaceId,
      index_version: "idx-snippet-repair-v1",
      files: [
        {
          path: "src/repair.ts",
          language: "typescript",
          content: repairSource
        }
      ]
    });

    const retrieval = await retrievalCore.searchContext({
      trace_id: "trc-snippet-repair",
      tenant_id: tenantId,
      workspace_id: workspaceId,
      request: {
        project_root_path: projectRoot,
        query: "estimateRepair checkpoint_10 checkpoint_110",
        top_k: 1
      }
    });

    // The snippet must contain both queried checkpoints, carry no marker, and span over 80 lines.
    const [top] = retrieval.results;
    expect(top).toBeDefined();
    expect(top?.path).toBe("src/repair.ts");
    expect(top?.snippet.includes("checkpoint_10")).toBe(true);
    expect(top?.snippet.includes("checkpoint_110")).toBe(true);
    expect(top?.snippet.includes("[truncated:v1")).toBe(false);
    expect((top?.end_line ?? 0) - (top?.start_line ?? 0)).toBeGreaterThan(80);
  } finally {
    repository.close();
  }
});
|
|
856
|
+
|
|
857
|
+
it("falls back to truncation marker when repair output still exceeds caps", async () => {
  // Identifiers shared by the workspace record, the index write, and the search.
  const tenantId = "tenant-snippet-repair-fallback";
  const workspaceId = "ws-snippet-repair-fallback";
  const projectRoot = "/workspace/snippet-repair-fallback";

  const tempDir = await mkdtemp(join(tmpdir(), "rce-snippet-repair-fallback-"));
  dirs.push(tempDir); // tracked in the suite-level `dirs` list

  const repository = new SqliteIndexRepository(join(tempDir, "snippet-repair-fallback.sqlite"));
  await repository.migrate();
  await repository.upsertWorkspace({
    workspace_id: workspaceId,
    tenant_id: tenantId,
    name: "snippet-repair-fallback",
    project_root_path: projectRoot
  });

  // Repair is enabled, but the snippet cap is only 220 characters.
  const retrievalCore = new RetrievalCore(repository, new InMemoryQueryCache(), {
    chunkingConfig: {
      strategy: "sliding",
      target_chunk_tokens: 35,
      chunk_overlap_tokens: 8
    },
    scoringConfig: {
      rerank: {
        merge_overlapping_chunks_enabled: false
      }
    },
    snippetIntegrityConfig: {
      enabled: true,
      target_languages: ["typescript"],
      max_contiguous_gap_lines: 6,
      marker_template_version: "v1",
      repair_enabled: true,
      repair_max_envelope_lines: 260,
      repair_max_snippet_chars: 220
    }
  });

  try {
    // A single function with a 120-statement body.
    const checkpointLines = Array.from(
      { length: 120 },
      (_, idx) => ` const checkpoint_${idx} = input + ${idx}; // repair fallback signal`
    );
    const fallbackSource = [
      "export function estimateRepairFallback(input: number): number {",
      ...checkpointLines,
      " return input;",
      "}"
    ].join("\n");

    await retrievalCore.indexArtifact({
      tenant_id: tenantId,
      workspace_id: workspaceId,
      index_version: "idx-snippet-repair-fallback-v1",
      files: [
        {
          path: "src/repair-fallback.ts",
          language: "typescript",
          content: fallbackSource
        }
      ]
    });

    const retrieval = await retrievalCore.searchContext({
      trace_id: "trc-snippet-repair-fallback",
      tenant_id: tenantId,
      workspace_id: workspaceId,
      request: {
        project_root_path: projectRoot,
        query: "estimateRepairFallback checkpoint_10 checkpoint_110",
        top_k: 1
      }
    });

    // The snippet must carry the v1 truncation marker.
    const [top] = retrieval.results;
    expect(top).toBeDefined();
    expect(top?.path).toBe("src/repair-fallback.ts");
    expect(top?.snippet.includes("[truncated:v1")).toBe(true);
  } finally {
    repository.close();
  }
});
|
|
936
|
+
|
|
937
|
+
it("does not add truncation marker for non-target languages", async () => {
  // Identifiers shared by the workspace record, the index write, and the search.
  const tenantId = "tenant-snippet-language-gate";
  const workspaceId = "ws-snippet-language-gate";
  const projectRoot = "/workspace/snippet-language-gate";

  const tempDir = await mkdtemp(join(tmpdir(), "rce-snippet-integrity-language-gate-"));
  dirs.push(tempDir); // tracked in the suite-level `dirs` list

  const repository = new SqliteIndexRepository(join(tempDir, "snippet-integrity-language-gate.sqlite"));
  await repository.migrate();
  await repository.upsertWorkspace({
    workspace_id: workspaceId,
    tenant_id: tenantId,
    name: "snippet-language-gate",
    project_root_path: projectRoot
  });

  // Language-aware chunking is enabled for Go, while snippet integrity targets TypeScript only.
  const retrievalCore = new RetrievalCore(repository, new InMemoryQueryCache(), {
    chunkingConfig: {
      strategy: "language_aware",
      target_chunk_tokens: 40,
      chunk_overlap_tokens: 8,
      enabled_languages: ["go"]
    },
    snippetIntegrityConfig: {
      enabled: true,
      target_languages: ["typescript"],
      max_contiguous_gap_lines: 6,
      marker_template_version: "v1"
    }
  });

  try {
    // A Go function with a 220-statement body.
    const accumulationLines = Array.from({ length: 220 }, (_, idx) => `\ttotal += input + ${idx}`);
    const goSource = [
      "package runtime",
      "",
      "func EstimateProbability(input int) int {",
      "\ttotal := 0",
      ...accumulationLines,
      "\treturn total",
      "}"
    ].join("\n");

    await retrievalCore.indexArtifact({
      tenant_id: tenantId,
      workspace_id: workspaceId,
      index_version: "idx-snippet-language-gate-v1",
      files: [
        {
          path: "pkg/runtime/estimate.go",
          language: "go",
          content: goSource
        }
      ]
    });

    const retrieval = await retrievalCore.searchContext({
      trace_id: "trc-snippet-language-gate",
      tenant_id: tenantId,
      workspace_id: workspaceId,
      request: {
        project_root_path: projectRoot,
        query: "EstimateProbability input 200",
        top_k: 1
      }
    });

    // The Go result must not carry the v1 truncation marker.
    const [top] = retrieval.results;
    expect(top).toBeDefined();
    expect(top?.path).toBe("pkg/runtime/estimate.go");
    expect(top?.snippet.includes("[truncated:v1")).toBe(false);
  } finally {
    repository.close();
  }
});
|
|
322
1009
|
});
|