@cue-dev/retrieval-core 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +27 -0
- package/dist/.tsbuildinfo +1 -0
- package/dist/chunking.d.ts +64 -0
- package/dist/chunking.js +983 -0
- package/dist/index.d.ts +673 -0
- package/dist/index.js +6605 -0
- package/dist/indexing-ignore.d.ts +9 -0
- package/dist/indexing-ignore.js +151 -0
- package/dist/remote-sync.d.ts +193 -0
- package/dist/remote-sync.js +816 -0
- package/package.json +37 -0
- package/scripts/poc-node-parser-host.cjs +105 -0
- package/scripts/poc-parser-availability-benchmark.ts +338 -0
- package/src/chunking.ts +1187 -0
- package/src/index.ts +8338 -0
- package/src/indexing-ignore.ts +179 -0
- package/src/remote-sync.ts +1119 -0
- package/test/benchmark.thresholds.test.ts +815 -0
- package/test/chunking.config.test.ts +84 -0
- package/test/chunking.language-aware.test.ts +1248 -0
- package/test/chunking.parser-availability.poc.test.ts +86 -0
- package/test/claude-agent-provider.test.ts +209 -0
- package/test/embedding-context-prefix.test.ts +101 -0
- package/test/embedding-provider.test.ts +570 -0
- package/test/enhance-confidence.test.ts +752 -0
- package/test/index-prep.concurrency.regression.test.ts +142 -0
- package/test/integration.test.ts +508 -0
- package/test/local-sqlite.integration.test.ts +258 -0
- package/test/mcp-search-quality.regression.test.ts +1358 -0
- package/test/remote-sync.integration.test.ts +350 -0
- package/test/smart-cutoff.config.test.ts +86 -0
- package/test/snippet-integrity.config.test.ts +59 -0
- package/tsconfig.build.json +17 -0
- package/tsconfig.json +4 -0
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import { describe, expect, it } from "vitest";
|
|
2
|
+
import { DEFAULT_RETRIEVAL_CHUNKING_CONFIG, mergeRetrievalChunkingConfig } from "../src/index.js";
|
|
3
|
+
|
|
4
|
+
/**
 * Validation suite for the retrieval chunking configuration.
 *
 * Pins the values exposed by DEFAULT_RETRIEVAL_CHUNKING_CONFIG and checks that
 * mergeRetrievalChunkingConfig returns the overridden values for valid input
 * and throws a descriptive error for invalid input.
 */
describe("retrieval-core chunking config validation", () => {
  // Builds a thunk that merges `overrides` onto the defaults; the call is
  // deferred so that `expect(...).toThrow(...)` is the one that invokes it.
  const mergingOntoDefaults =
    (overrides: Parameters<typeof mergeRetrievalChunkingConfig>[1]) => () =>
      mergeRetrievalChunkingConfig(DEFAULT_RETRIEVAL_CHUNKING_CONFIG, overrides);

  it("uses wider default chunk windows for better snippet completeness", () => {
    const {
      strategy,
      fallback_strategy,
      target_chunk_tokens,
      chunk_overlap_tokens,
      budget_tokenizer,
      boundary_strictness,
      enabled_languages,
      recursive_semantic_chunking_enabled,
      embedding_context_prefix_enabled
    } = DEFAULT_RETRIEVAL_CHUNKING_CONFIG;

    expect(strategy).toBe("language_aware");
    expect(fallback_strategy).toBe("sliding");
    expect(target_chunk_tokens).toBe(420);
    expect(chunk_overlap_tokens).toBe(90);
    expect(budget_tokenizer).toBe("ranking");
    expect(boundary_strictness).toBe("semantic_js_ts");
    expect(enabled_languages).toEqual(["typescript", "javascript", "python", "go", "rust", "java"]);
    expect(recursive_semantic_chunking_enabled).toBe(true);
    expect(embedding_context_prefix_enabled).toBe(true);
  });

  it("accepts valid chunk window overrides", () => {
    const {
      target_chunk_tokens,
      chunk_overlap_tokens,
      budget_tokenizer,
      boundary_strictness,
      recursive_semantic_chunking_enabled,
      semantic_merge_gap_lines,
      semantic_merge_max_span_lines,
      comment_forward_absorb_enabled,
      embedding_context_prefix_enabled
    } = mergeRetrievalChunkingConfig(DEFAULT_RETRIEVAL_CHUNKING_CONFIG, {
      target_chunk_tokens: 512,
      chunk_overlap_tokens: 128,
      budget_tokenizer: "lightweight",
      boundary_strictness: "semantic_js_ts",
      recursive_semantic_chunking_enabled: true,
      semantic_merge_gap_lines: 8,
      semantic_merge_max_span_lines: 260,
      comment_forward_absorb_enabled: false,
      embedding_context_prefix_enabled: false
    });

    // Every overridden field must come back with the value that was supplied.
    expect(target_chunk_tokens).toBe(512);
    expect(chunk_overlap_tokens).toBe(128);
    expect(budget_tokenizer).toBe("lightweight");
    expect(boundary_strictness).toBe("semantic_js_ts");
    expect(recursive_semantic_chunking_enabled).toBe(true);
    expect(semantic_merge_gap_lines).toBe(8);
    expect(semantic_merge_max_span_lines).toBe(260);
    expect(comment_forward_absorb_enabled).toBe(false);
    expect(embedding_context_prefix_enabled).toBe(false);
  });

  it("rejects overlap values that are not less than target chunk tokens", () => {
    // Overlap equal to the target is the boundary case exercised here.
    const equalOverlapMerge = mergingOntoDefaults({
      target_chunk_tokens: 300,
      chunk_overlap_tokens: 300
    });

    expect(equalOverlapMerge).toThrow(/chunk_overlap_tokens must be less than target_chunk_tokens/);
  });

  it("rejects invalid budget tokenizer and boundary strictness values", () => {
    // The casts get deliberately invalid strings past the compiler so the
    // runtime validation is what gets exercised.
    const badTokenizerMerge = mergingOntoDefaults({
      budget_tokenizer: "invalid" as "ranking"
    });
    expect(badTokenizerMerge).toThrow(/budget_tokenizer must be ranking\|lightweight/);

    const badStrictnessMerge = mergingOntoDefaults({
      boundary_strictness: "invalid" as "legacy"
    });
    expect(badStrictnessMerge).toThrow(/boundary_strictness must be legacy\|semantic_js_ts/);
  });

  it("rejects invalid semantic merge settings", () => {
    const negativeGapMerge = mergingOntoDefaults({
      semantic_merge_gap_lines: -1
    });
    expect(negativeGapMerge).toThrow(/semantic_merge_gap_lines must be a non-negative integer/);

    const zeroSpanMerge = mergingOntoDefaults({
      semantic_merge_max_span_lines: 0
    });
    expect(zeroSpanMerge).toThrow(/semantic_merge_max_span_lines must be a positive integer/);
  });
});
|