@rce-mcp/retrieval-core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,279 @@
1
+ import { describe, expect, it } from "vitest";
2
+ import { InMemoryQueryCache } from "@rce-mcp/data-plane";
3
+ import { getObservability } from "@rce-mcp/observability";
4
+ import { InMemoryIndexStore, RetrievalCore } from "../src/index.js";
5
+ import {
6
+ __getChunkingParserInitAttemptsForTests,
7
+ __resetChunkingParserStateForTests,
8
+ __setChunkingParserLanguageLoaderForTests,
9
+ buildChunksForFile,
10
+ getChunkingParserAvailabilitySnapshot
11
+ } from "../src/chunking.js";
12
+
13
/**
 * Indexes a single file through `RetrievalCore` and returns the chunks that
 * ended up in the store for the latest ready index of the workspace.
 *
 * Steps: upsert the workspace, index the file as a one-file artifact, assert
 * the report status is "ready", look up the latest ready index, then list its
 * chunks.
 *
 * @param input.core - Retrieval core used to index the artifact.
 * @param input.store - Index store the workspace and chunks are read from / written to.
 * @param input.tenant_id - Tenant owning the workspace.
 * @param input.workspace_id - Workspace the file is indexed into.
 * @param input.index_version - Version label passed to `indexArtifact`.
 * @param input.file - The single file (path, optional language, content) to index.
 * @returns One `{ path, start_line, end_line, snippet }` record per stored chunk.
 * @throws If no ready index can be found after indexing.
 */
async function indexAndListChunks(input: {
  core: RetrievalCore;
  store: InMemoryIndexStore;
  tenant_id: string;
  workspace_id: string;
  index_version: string;
  file: {
    path: string;
    language?: string;
    content: string;
  };
}): Promise<
  Array<{
    path: string;
    start_line: number;
    end_line: number;
    snippet: string;
  }>
> {
  const { core, store, tenant_id, workspace_id, index_version, file } = input;

  await store.upsertWorkspace({
    workspace_id,
    tenant_id,
    name: "chunking-test",
    project_root_path: "/workspace/chunking-test"
  });

  const report = await core.indexArtifact({
    tenant_id,
    workspace_id,
    index_version,
    files: [file]
  });
  expect(report.status).toBe("ready");

  const index = await store.getLatestReadyIndex({ tenant_id, workspace_id });
  expect(index).toBeDefined();
  // `toBeDefined()` lets `null` through and does not narrow the type, so guard
  // explicitly instead of using a non-null assertion on `index`.
  if (index == null) {
    throw new Error(`no ready index found for workspace "${workspace_id}" after indexing`);
  }

  const chunks = await store.listChunksByIndex({
    tenant_id,
    index_id: index.index_id
  });

  // Strip the stored chunks down to the fields the tests assert on.
  return chunks.map(({ path, start_line, end_line, snippet }) => ({
    path,
    start_line,
    end_line,
    snippet
  }));
}
64
+
65
/**
 * End-to-end chunking tests: language-aware chunking, fallback to sliding
 * chunks, line-coordinate ordering in sliding mode, and parser
 * availability / init-attempt caching.
 */
describe("retrieval-core chunking", () => {
  it("uses language-aware chunking for supported languages when configured", async () => {
    const store = new InMemoryIndexStore();
    // The timestamp suffix gives every run a distinct observability name —
    // presumably so counters from other tests do not leak in (TODO confirm
    // against getObservability).
    const observability = getObservability(`retrieval-core-language-aware-${Date.now()}`);
    const core = new RetrievalCore(store, new InMemoryQueryCache(), {
      observability,
      chunkingConfig: {
        strategy: "language_aware",
        parse_timeout_ms: 500
      }
    });

    const chunks = await indexAndListChunks({
      core,
      store,
      tenant_id: "tenant-a",
      workspace_id: "ws-a",
      index_version: "idx-a1",
      file: {
        path: "src/feature.ts",
        language: "typescript",
        content: [
          "import { dep } from './dep';",
          "",
          "export function alpha(input: number) {",
          " const value = dep(input);",
          " return value + 1;",
          "}",
          "",
          "export class Greeter {",
          " greet(name: string) {",
          " return `hello ${name}`;",
          " }",
          "}",
          "",
          "export function beta(input: number) {",
          " const value = dep(input);",
          " return value * 2;",
          "}"
        ].join("\n")
      }
    });

    // Each top-level declaration must show up in at least one chunk.
    expect(chunks.some((chunk) => chunk.snippet.includes("function alpha"))).toBe(true);
    expect(chunks.some((chunk) => chunk.snippet.includes("class Greeter"))).toBe(true);
    expect(chunks.some((chunk) => chunk.snippet.includes("function beta"))).toBe(true);

    // Check line ranges chunk by chunk so a failure reports the offending values.
    for (const chunk of chunks) {
      expect(chunk.start_line).toBeGreaterThanOrEqual(1);
      expect(chunk.end_line).toBeGreaterThanOrEqual(chunk.start_line);
    }

    const strategyCounters = observability.metrics.readCounter("index_chunking_strategy_total");
    expect(
      strategyCounters.some(
        (counter) => counter.labels.strategy === "language_aware" && counter.labels.reason === "none"
      )
    ).toBe(true);
  });

  it("falls back to sliding chunks for unsupported languages", async () => {
    const store = new InMemoryIndexStore();
    const observability = getObservability(`retrieval-core-unsupported-${Date.now()}`);
    const core = new RetrievalCore(store, new InMemoryQueryCache(), {
      observability,
      chunkingConfig: {
        strategy: "language_aware"
      }
    });

    const chunks = await indexAndListChunks({
      core,
      store,
      tenant_id: "tenant-b",
      workspace_id: "ws-b",
      index_version: "idx-b1",
      file: {
        path: "docs/readme.md",
        language: "markdown",
        content: Array.from({ length: 220 }, (_, idx) => `line ${idx + 1}: retrieval docs`).join("\n")
      }
    });

    expect(chunks.length).toBeGreaterThan(0);

    // The fallback must be recorded with both the reason and the language.
    const fallbackCounters = observability.metrics.readCounter("index_chunking_fallback_total");
    expect(
      fallbackCounters.some(
        (counter) => counter.labels.reason === "unsupported_language" && counter.labels.language === "markdown"
      )
    ).toBe(true);
  });

  it("keeps deterministic line coordinates in sliding mode when lines repeat", async () => {
    const store = new InMemoryIndexStore();
    const core = new RetrievalCore(store, new InMemoryQueryCache(), {
      chunkingConfig: {
        strategy: "sliding"
      }
    });

    // 400 identical lines: chunk positions cannot be derived from content alone.
    const chunks = await indexAndListChunks({
      core,
      store,
      tenant_id: "tenant-c",
      workspace_id: "ws-c",
      index_version: "idx-c1",
      file: {
        path: "src/repeated.ts",
        language: "typescript",
        content: Array.from({ length: 400 }, () => "const token = 1;").join("\n")
      }
    });

    expect(chunks.length).toBeGreaterThan(2);
    const starts = chunks.map((chunk) => chunk.start_line);
    expect(starts.some((line) => line > 1)).toBe(true);
    // Start lines must be strictly increasing in listing order.
    for (let i = 1; i < starts.length; i += 1) {
      expect(starts[i]).toBeGreaterThan(starts[i - 1] ?? 0);
    }
  });

  it("emits parser availability snapshot and avoids repeated parser init attempts", async () => {
    __resetChunkingParserStateForTests();
    try {
      const store = new InMemoryIndexStore();
      const observability = getObservability(`retrieval-core-js-py-${Date.now()}`);
      const core = new RetrievalCore(store, new InMemoryQueryCache(), {
        observability,
        chunkingConfig: {
          strategy: "language_aware"
        }
      });

      await indexAndListChunks({
        core,
        store,
        tenant_id: "tenant-d",
        workspace_id: "ws-d",
        index_version: "idx-d1",
        file: {
          path: "src/runtime.js",
          language: "javascript",
          content: ["export function alpha() {", " return 1;", "}"].join("\n")
        }
      });

      await indexAndListChunks({
        core,
        store,
        tenant_id: "tenant-d",
        workspace_id: "ws-d",
        index_version: "idx-d2",
        file: {
          path: "src/runtime.py",
          language: "python",
          content: ["def alpha():", " return 1"].join("\n")
        }
      });

      // A language with no recorded attempt counts as zero. Assert on the
      // number itself so a failure shows the actual attempt count.
      const attempts = __getChunkingParserInitAttemptsForTests();
      expect(attempts.javascript ?? 0).toBeLessThanOrEqual(1);
      expect(attempts.python ?? 0).toBeLessThanOrEqual(1);

      const snapshot = getChunkingParserAvailabilitySnapshot({
        enabled_languages: ["typescript", "javascript", "python", "go"]
      });
      expect(snapshot.some((entry) => entry.language === "javascript")).toBe(true);
      expect(snapshot.some((entry) => entry.language === "python")).toBe(true);

      const availabilityGauges = observability.metrics.readGauge("index_chunking_parser_availability");
      expect(availabilityGauges.some((point) => point.labels.language === "javascript")).toBe(true);
      expect(availabilityGauges.some((point) => point.labels.language === "python")).toBe(true);
    } finally {
      // Always restore parser state, even when an assertion above fails.
      __resetChunkingParserStateForTests();
    }
  });

  it("caches parser unavailability and avoids repeated parser init attempts", () => {
    __resetChunkingParserStateForTests();
    try {
      // Force every python grammar load to fail.
      __setChunkingParserLanguageLoaderForTests("python", () => {
        throw new Error("forced parser load failure");
      });

      const config = {
        strategy: "language_aware" as const,
        fallback_strategy: "sliding" as const,
        target_chunk_tokens: 220,
        chunk_overlap_tokens: 40,
        max_chunks_per_file: 300,
        parse_timeout_ms: 80,
        enabled_languages: ["python"]
      };
      const tokenize = (text: string) => text.split(/\s+/).filter(Boolean);
      const file = {
        path: "src/runtime.py",
        language: "python",
        content: ["def alpha():", " return 1"].join("\n")
      };

      // Chunk the same file three times; every call must report the same fallback reason.
      const results = Array.from({ length: 3 }, () => buildChunksForFile({ file, config, tokenize }));
      for (const result of results) {
        expect(result.fallback_reason).toBe("parser_unavailable");
      }

      // Three chunking calls, but only one recorded loader attempt.
      const attempts = __getChunkingParserInitAttemptsForTests();
      expect(attempts.python).toBe(1);

      const snapshot = getChunkingParserAvailabilitySnapshot({
        enabled_languages: ["python", "typescript"]
      });
      expect(snapshot.some((entry) => entry.language === "python" && entry.status === "unavailable")).toBe(true);
      expect(snapshot.some((entry) => entry.language === "typescript")).toBe(true);
    } finally {
      // Always restore parser state (including the forced loader), even on failure.
      __resetChunkingParserStateForTests();
    }
  });
});
@@ -0,0 +1,60 @@
1
+ import { createRequire } from "node:module";
2
+ import { describe, expect, it } from "vitest";
3
+ import Parser from "tree-sitter";
4
+ import JavaScriptV023 from "tree-sitter-javascript-v023";
5
+ import PythonV023 from "tree-sitter-python-v023";
6
+ import { Language as WebLanguage, Parser as WebParser } from "web-tree-sitter";
7
+ import {
8
+ __resetChunkingParserStateForTests,
9
+ getChunkingParserAvailabilitySnapshot
10
+ } from "../src/chunking.js";
11
+
12
// ES modules have no ambient `require`; build one so the wasm assets used in
// the web-tree-sitter test can be located with `require.resolve`.
const require = createRequire(import.meta.url);

/**
 * Proof-of-concept checks for grammar availability: the snapshot reported by
 * the chunking module, native tree-sitter with alias grammar packages, and
 * web-tree-sitter with wasm grammars.
 */
describe("chunking parser availability PoC", () => {
  it("captures current baseline parser unavailability for javascript and python", () => {
    __resetChunkingParserStateForTests();
    try {
      const snapshot = getChunkingParserAvailabilitySnapshot({
        enabled_languages: ["typescript", "javascript", "python", "go"]
      });
      const byLanguage = new Map(snapshot.map((entry) => [entry.language, entry]));

      expect(byLanguage.get("typescript")?.status).toBe("available");
      expect(byLanguage.get("go")?.status).toBe("available");
      expect(byLanguage.get("javascript")?.status).toBe("unavailable");
      expect(byLanguage.get("python")?.status).toBe("unavailable");
    } finally {
      // Reset parser state on the way out as well, even when an assertion fails.
      __resetChunkingParserStateForTests();
    }
  });

  it("proves native availability with tree-sitter-compatible alias grammars", () => {
    // NOTE(review): the double cast suggests the alias packages' typings do not
    // line up with `Parser.Language` — confirm before tightening.
    const javascriptParser = new Parser();
    javascriptParser.setLanguage(JavaScriptV023 as unknown as Parser.Language);
    const javascriptTree = javascriptParser.parse("function alpha(){ return 1; }\n");
    expect(javascriptTree?.rootNode?.type).toBe("program");

    const pythonParser = new Parser();
    pythonParser.setLanguage(PythonV023 as unknown as Parser.Language);
    const pythonTree = pythonParser.parse("def alpha():\n return 1\n");
    expect(pythonTree?.rootNode?.type).toBe("module");
  });

  it("proves wasm availability with web-tree-sitter for javascript and python", async () => {
    /**
     * Loads a wasm grammar, parses `source`, and returns the root node type.
     * The parser and tree are released explicitly via `delete()`.
     */
    async function wasmRootNodeType(grammarWasmPath: string, source: string): Promise<string | undefined> {
      const language = await WebLanguage.load(grammarWasmPath);
      const parser = new WebParser();
      try {
        parser.setLanguage(language);
        const tree = parser.parse(source);
        try {
          // Read the type before the tree is deleted in `finally`.
          return tree?.rootNode?.type;
        } finally {
          tree?.delete();
        }
      } finally {
        parser.delete();
      }
    }

    // Point the runtime at the wasm binary shipped with web-tree-sitter.
    const runtimeWasm = require.resolve("web-tree-sitter/web-tree-sitter.wasm");
    await WebParser.init({
      locateFile() {
        return runtimeWasm;
      }
    });

    const javascriptRootType = await wasmRootNodeType(
      require.resolve("tree-sitter-javascript/tree-sitter-javascript.wasm"),
      "function alpha(){ return 1; }\n"
    );
    expect(javascriptRootType).toBe("program");

    const pythonRootType = await wasmRootNodeType(
      require.resolve("tree-sitter-python/tree-sitter-python.wasm"),
      "def alpha():\n return 1\n"
    );
    expect(pythonRootType).toBe("module");
  });
});
@@ -0,0 +1,121 @@
1
+ import { afterEach, describe, expect, it, vi } from "vitest";
2
+ import { OpenAICompatibleEmbeddingProvider, RetrievalError } from "../src/index.js";
3
+
4
/**
 * Tests for `OpenAICompatibleEmbeddingProvider` against a stubbed global
 * `fetch`: retry on 5xx, no retry on 401, and rejection of malformed vectors.
 */
describe("openai-compatible embedding provider", () => {
  const savedFetch = globalThis.fetch;

  // Shape every failing case is expected to reject with.
  const upstreamFailure = {
    code: "UPSTREAM_FAILURE"
  } satisfies Partial<RetrievalError>;

  /** Builds a `Response` whose body is `payload` serialized as JSON. */
  const jsonResponse = (payload: unknown, status: number): Response =>
    new Response(JSON.stringify(payload), { status });

  /** Issues the single-text query embedding request shared by all cases. */
  const embedHello = (provider: OpenAICompatibleEmbeddingProvider) =>
    provider.embed({
      texts: ["hello"],
      purpose: "query"
    });

  afterEach(() => {
    // Put the real fetch back before clearing mocks.
    globalThis.fetch = savedFetch;
    vi.restoreAllMocks();
  });

  it("retries on 5xx and succeeds", async () => {
    let calls = 0;
    // First request fails with 503; every later request returns a valid embedding.
    globalThis.fetch = vi.fn(async () => {
      calls += 1;
      return calls === 1
        ? jsonResponse({ error: "temporary" }, 503)
        : jsonResponse({ data: [{ index: 0, embedding: [0.1, 0.2, 0.3] }] }, 200);
    }) as typeof fetch;

    const provider = new OpenAICompatibleEmbeddingProvider({
      base_url: "https://router.tumuer.me/v1",
      api_key: "test-key",
      model: "Qwen/Qwen3-Embedding-4B",
      dimensions: 3,
      timeout_ms: 200,
      batch_size: 16,
      max_retries: 2
    });

    const vectors = await embedHello(provider);

    expect(vectors).toEqual([[0.1, 0.2, 0.3]]);
    expect(calls).toBe(2);
  });

  it("does not retry auth/config 4xx failures", async () => {
    let calls = 0;
    globalThis.fetch = vi.fn(async () => {
      calls += 1;
      return jsonResponse({ error: "bad key" }, 401);
    }) as typeof fetch;

    const provider = new OpenAICompatibleEmbeddingProvider({
      base_url: "https://router.tumuer.me/v1",
      api_key: "test-key",
      dimensions: 3
    });

    await expect(embedHello(provider)).rejects.toMatchObject(upstreamFailure);
    // Exactly one request: the 401 must not be retried.
    expect(calls).toBe(1);
  });

  it("rejects non-numeric vectors", async () => {
    // The embedding contains a string where a number is expected.
    globalThis.fetch = vi.fn(async () =>
      jsonResponse({ data: [{ index: 0, embedding: [0.1, "bad", 0.3] }] }, 200)
    ) as typeof fetch;

    const provider = new OpenAICompatibleEmbeddingProvider({
      base_url: "https://router.tumuer.me/v1",
      api_key: "test-key",
      dimensions: 3
    });

    await expect(embedHello(provider)).rejects.toMatchObject(upstreamFailure);
  });

  it("rejects vectors with unexpected dimensions", async () => {
    // The embedding has two components while the provider is configured for three.
    globalThis.fetch = vi.fn(async () =>
      jsonResponse({ data: [{ index: 0, embedding: [0.1, 0.2] }] }, 200)
    ) as typeof fetch;

    const provider = new OpenAICompatibleEmbeddingProvider({
      base_url: "https://router.tumuer.me/v1",
      api_key: "test-key",
      dimensions: 3
    });

    await expect(embedHello(provider)).rejects.toMatchObject(upstreamFailure);
  });
});