@cue-dev/retrieval-core 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "name": "@cue-dev/retrieval-core",
3
+ "version": "0.1.3",
4
+ "type": "module",
5
+ "main": "dist/index.js",
6
+ "types": "dist/index.d.ts",
7
+ "exports": {
8
+ ".": {
9
+ "types": "./dist/index.d.ts",
10
+ "default": "./dist/index.js"
11
+ }
12
+ },
13
+ "dependencies": {
14
+ "@anthropic-ai/claude-agent-sdk": "^0.2.42",
15
+ "@anthropic-ai/sdk": "^0.55.0",
16
+ "@cue-dev/contracts": "0.1.1",
17
+ "@cue-dev/data-plane": "0.1.2",
18
+ "@cue-dev/observability": "0.1.1",
19
+ "tree-sitter": "^0.22.4",
20
+ "tree-sitter-go": "^0.23.4",
21
+ "tree-sitter-javascript": "^0.25.0",
22
+ "tree-sitter-javascript-v023": "npm:tree-sitter-javascript@0.23.1",
23
+ "tree-sitter-java": "^0.23.5",
24
+ "tree-sitter-python": "^0.25.0",
25
+ "tree-sitter-python-v023": "npm:tree-sitter-python@0.23.6",
26
+ "tree-sitter-rust": "^0.23.3",
27
+ "tree-sitter-typescript": "^0.23.2"
28
+ },
29
+ "devDependencies": {
30
+ "pg-mem": "^3.0.5",
31
+ "tree-sitter-go-v025": "npm:tree-sitter-go@0.25.0",
32
+ "tree-sitter-javascript-v025": "npm:tree-sitter-javascript@0.25.0",
33
+ "tree-sitter-python-v025": "npm:tree-sitter-python@0.25.0",
34
+ "tree-sitter-v025": "npm:tree-sitter@0.25.0",
35
+ "web-tree-sitter": "^0.26.5"
36
+ }
37
+ }
@@ -0,0 +1,105 @@
1
+ "use strict";
2
+
3
+ const readline = require("node:readline");
4
+ const Parser = require("tree-sitter");
5
+ const JavaScript = require("tree-sitter-javascript-v023");
6
+ const Java = require("tree-sitter-java");
7
+ const Python = require("tree-sitter-python-v023");
8
+ const Rust = require("tree-sitter-rust");
9
+
10
// Ready-to-use parser instances, keyed by the language id used in requests.
const parserByLanguage = new Map();

/**
 * Builds a parser bound to one grammar and registers it under `language`.
 *
 * @param {string} language - Key under which the parser is stored.
 * @param {unknown} grammarModule - Grammar module handed to `setLanguage`.
 */
function initParser(language, grammarModule) {
  const instance = new Parser();
  instance.setLanguage(grammarModule);
  parserByLanguage.set(language, instance);
}
17
+
18
// Build every parser eagerly and remember how long that took; the figure is
// reported as `init_ms` in the messages written to stdout.
const initStart = performance.now();
for (const [language, grammar] of [
  ["javascript", JavaScript],
  ["python", Python],
  ["rust", Rust],
  ["java", Java]
]) {
  initParser(language, grammar);
}
const initMs = performance.now() - initStart;
24
+
25
// Batch mode: `--batch <language> <iterations> <base64 source>`.
// Parses the decoded source repeatedly, prints exactly one JSON summary line and
// exits, so the statements after this block never run in batch mode.
if (process.argv[2] === "--batch") {
  const language = String(process.argv[3] ?? "");

  // The count comes straight from the command line. A non-numeric, zero, negative
  // or fractional value would skip the loop or skew the average, so normalise it
  // to a positive integer and fall back to a single iteration.
  const requestedIterations = Number(process.argv[4] ?? "1");
  const iterations =
    Number.isFinite(requestedIterations) && requestedIterations >= 1 ? Math.floor(requestedIterations) : 1;

  const source = Buffer.from(String(process.argv[5] ?? ""), "base64").toString("utf8");
  const parser = parserByLanguage.get(language);
  if (!parser) {
    // Failures are reported in-band through `ok: false`; the exit code stays 0.
    process.stdout.write(`${JSON.stringify({ ok: false, error: `unsupported language: ${language}` })}\n`);
    process.exit(0);
  }

  const parseStart = performance.now();
  let root = null;
  for (let i = 0; i < iterations; i += 1) {
    const tree = parser.parse(source);
    // Only the root node type of the last parse is reported.
    root = tree?.rootNode?.type ?? null;
  }
  const totalParseMs = performance.now() - parseStart;

  process.stdout.write(
    `${JSON.stringify({
      ok: true,
      language,
      init_ms: Number(initMs.toFixed(3)),
      parse_avg_ms: Number((totalParseMs / iterations).toFixed(3)),
      parse_total_ms: Number(totalParseMs.toFixed(3)),
      root
    })}\n`
  );
  process.exit(0);
}
54
+
55
// Interactive mode: announce readiness (including the parser init cost) as one
// JSON line on stdout, then read requests from stdin line by line.
const readyMessage = { type: "ready", init_ms: Number(initMs.toFixed(3)) };
process.stdout.write(`${JSON.stringify(readyMessage)}\n`);

const rl = readline.createInterface({ input: process.stdin, terminal: false });
61
+
62
// Each non-blank stdin line is treated as one JSON request carrying `id`,
// `language` and `source`; one JSON response line is written per request.
rl.on("line", (line) => {
  if (!line.trim()) {
    return;
  }

  // Writes a response as a single newline-terminated JSON line.
  const reply = (message) => {
    process.stdout.write(`${JSON.stringify(message)}\n`);
  };
  const describeError = (error) => (error instanceof Error ? error.message : String(error));

  let payload;
  try {
    payload = JSON.parse(line);
  } catch (error) {
    reply({ ok: false, error: describeError(error) });
    return;
  }

  // Valid JSON is not necessarily an object. A literal `null` would make the
  // property reads below throw outside any try/catch and take the host down,
  // so reject anything that is not an object.
  if (payload === null || typeof payload !== "object") {
    reply({ ok: false, error: "request must be a JSON object" });
    return;
  }

  const parser = parserByLanguage.get(payload.language);
  if (!parser) {
    reply({ id: payload.id, ok: false, error: `unsupported language: ${String(payload.language)}` });
    return;
  }

  try {
    const start = performance.now();
    const tree = parser.parse(String(payload.source ?? ""));
    const parseMs = performance.now() - start;
    reply({
      id: payload.id,
      ok: true,
      parse_ms: Number(parseMs.toFixed(3)),
      root: tree?.rootNode?.type ?? null
    });
  } catch (error) {
    reply({
      id: payload.id,
      ok: false,
      error: describeError(error)
    });
  }
});
@@ -0,0 +1,338 @@
1
+ import { spawnSync } from "node:child_process";
2
+ import { createRequire } from "node:module";
3
+ import { fileURLToPath } from "node:url";
4
+ import Parser from "tree-sitter";
5
+ import JavaScriptV023 from "tree-sitter-javascript-v023";
6
+ import Java from "tree-sitter-java";
7
+ import PythonV023 from "tree-sitter-python-v023";
8
+ import Rust from "tree-sitter-rust";
9
+ import { Language as WebLanguage, Parser as WebParser } from "web-tree-sitter";
10
+ import {
11
+ __resetChunkingParserStateForTests,
12
+ buildChunksForFile,
13
+ getChunkingParserAvailabilitySnapshot
14
+ } from "../src/chunking.js";
15
+
16
/** Languages that every benchmark backend is run against. */
type TargetLanguage = "javascript" | "python" | "rust" | "java";

/** Identifier of the parsing backend a row was measured with. */
type BackendId =
  | "baseline_chunking"
  | "native_v023_bun"
  | "wasm_web_tree_sitter"
  | "node_boundary_native";

/** One measurement: a single backend run against a single language sample. */
interface BenchRow {
  backend: BackendId;
  language: TargetLanguage;
  /** Whether the backend produced a usable result for this language. */
  available: boolean;
  /** Cost of the first, un-warmed run in milliseconds; what is included varies by backend. */
  cold_start_ms: number;
  /** Average cost per iteration in milliseconds. */
  per_file_ms: number;
  /** Average time spent in the parse call itself, when the backend reports it. */
  parse_engine_ms?: number;
  /** Fallback reason reported by the chunking path, if any. */
  fallback_reason?: string;
  /** Free-text remarks, such as an error message from a failed run. */
  notes?: string;
}
29
+
30
/** Number of timed repetitions each backend performs per language. */
const ITERATIONS = 60;

/** CommonJS-style `require` for this ES module; used for `require.resolve` lookups. */
const require = createRequire(import.meta.url);
32
+
33
/** Small source snippet per language; the same text is fed to every backend. */
const SAMPLES: Record<TargetLanguage, string> = (() => {
  // Joins source lines with "\n" (no trailing newline).
  const text = (...sourceLines: string[]): string => sourceLines.join("\n");

  return {
    javascript: text(
      "import { dep } from './dep';",
      "",
      "export function alpha(input) {",
      " const value = dep(input);",
      " return value + 1;",
      "}",
      "",
      "export class Greeter {",
      " greet(name) {",
      " return `hello ${name}`;",
      " }",
      "}",
      "",
      "export function beta(input) {",
      " return dep(input) * 2;",
      "}"
    ),
    python: text(
      "def alpha(x):",
      " return x + 1",
      "",
      "class Greeter:",
      " def greet(self, name):",
      " return f\"hello {name}\"",
      "",
      "def beta(x):",
      " return x * 2"
    ),
    rust: text(
      "pub struct Counter {",
      " value: i32,",
      "}",
      "",
      "impl Counter {",
      " pub fn increment(&mut self, amount: i32) {",
      " self.value += amount;",
      " }",
      "}",
      "",
      "pub fn alpha(input: i32) -> i32 {",
      " input + 1",
      "}"
    ),
    java: text(
      "public class Example {",
      " int alpha(int input) {",
      " return input + 1;",
      " }",
      "",
      " int beta(int input) {",
      " return input * 2;",
      " }",
      "}"
    )
  };
})();
90
+
91
/** Returns the current `performance.now()` reading in milliseconds. */
function nowMs(): number {
  const timestamp = performance.now();
  return timestamp;
}
94
+
95
/**
 * Rounds a millisecond value to three decimal places.
 *
 * @param value - Duration in milliseconds.
 * @returns The value as a number after a `toFixed(3)` round trip.
 */
function formatMs(value: number): number {
  const fixed = value.toFixed(3);
  return Number.parseFloat(fixed);
}
98
+
99
/**
 * Times the existing chunking path (`buildChunksForFile`) on one language sample.
 *
 * The chunking parser state is reset first, so the first iteration is recorded
 * as the cold start. `available` and `fallback_reason` reflect the last iteration.
 *
 * @param language - Which sample to chunk.
 * @returns The measurement row for the `baseline_chunking` backend.
 */
async function benchBaselineChunking(language: TargetLanguage): Promise<BenchRow> {
  __resetChunkingParserStateForTests();

  const samplePaths: Record<TargetLanguage, string> = {
    javascript: "src/example.js",
    python: "src/example.py",
    rust: "src/example.rs",
    java: "src/Example.java"
  };
  const file = {
    path: samplePaths[language],
    language,
    content: SAMPLES[language]
  };
  const config = {
    strategy: "language_aware" as const,
    fallback_strategy: "sliding" as const,
    target_chunk_tokens: 220,
    chunk_overlap_tokens: 40,
    budget_tokenizer: "ranking" as const,
    boundary_strictness: "legacy" as const,
    max_chunks_per_file: 300,
    parse_timeout_ms: 80,
    enabled_languages: ["javascript", "python", "rust", "java"]
  };
  // Whitespace tokenizer: split on runs of whitespace and drop empty pieces.
  const tokenize = (text: string): string[] => text.split(/\s+/).filter(Boolean);

  let accumulatedMs = 0;
  let firstRunMs = 0;
  let lastFallbackReason: string | undefined;
  let languageAware = false;

  for (let iteration = 0; iteration < ITERATIONS; iteration += 1) {
    const startedAt = nowMs();
    const result = buildChunksForFile({ file, config, tokenize });
    const elapsed = nowMs() - startedAt;

    if (iteration === 0) {
      firstRunMs = elapsed;
    }
    accumulatedMs += elapsed;
    lastFallbackReason = result.fallback_reason;
    languageAware = result.strategy === "language_aware" && !result.fallback_reason;
  }

  return {
    backend: "baseline_chunking",
    language,
    available: languageAware,
    cold_start_ms: formatMs(firstRunMs),
    per_file_ms: formatMs(accumulatedMs / ITERATIONS),
    fallback_reason: lastFallbackReason
  };
}
152
+
153
/**
 * Times the native `tree-sitter` binding with the grammars imported by this script.
 *
 * Cold start covers `setLanguage` plus the first parse; the per-file figure is
 * the average of `ITERATIONS` further parses with the same parser.
 *
 * @param language - Which sample and grammar to use.
 * @returns The measurement row for the `native_v023_bun` backend.
 */
async function benchNativeCompatible(language: TargetLanguage): Promise<BenchRow> {
  const grammars: Record<TargetLanguage, unknown> = {
    javascript: JavaScriptV023,
    python: PythonV023,
    rust: Rust,
    java: Java
  };
  const parser = new Parser();

  const coldStartedAt = nowMs();
  parser.setLanguage(grammars[language] as Parser.Language);
  const firstTree = parser.parse(SAMPLES[language]);
  const coldMs = nowMs() - coldStartedAt;

  let parseTotalMs = 0;
  for (let iteration = 0; iteration < ITERATIONS; iteration += 1) {
    const startedAt = nowMs();
    parser.parse(SAMPLES[language]);
    parseTotalMs += nowMs() - startedAt;
  }
  const averageMs = formatMs(parseTotalMs / ITERATIONS);

  return {
    backend: "native_v023_bun",
    language,
    available: Boolean(firstTree?.rootNode),
    cold_start_ms: formatMs(coldMs),
    per_file_ms: averageMs,
    parse_engine_ms: averageMs
  };
}
184
+
185
/** Set once the web-tree-sitter runtime has been initialised. */
let webTreeSitterRuntimeReady = false;

/**
 * Initialises the web-tree-sitter runtime on first use; later calls return at once.
 * The runtime `.wasm` file is located through `require.resolve`.
 */
async function ensureWebTreeSitterRuntime(): Promise<void> {
  if (!webTreeSitterRuntimeReady) {
    const runtimeWasmPath = require.resolve("web-tree-sitter/web-tree-sitter.wasm");
    await WebParser.init({
      locateFile: () => runtimeWasmPath
    });
    webTreeSitterRuntimeReady = true;
  }
}
199
+
200
/**
 * Times the WASM backend (`web-tree-sitter`) on one language sample.
 *
 * Cold start covers loading the grammar `.wasm`, creating the parser and the
 * first parse; the per-file figure is the average of `ITERATIONS` further parses.
 * Trees and the parser are released explicitly with `delete()`, outside the
 * timed windows, so repeated runs do not accumulate WASM-side memory.
 *
 * @param language - Which sample and grammar to use.
 * @returns The measurement row for the `wasm_web_tree_sitter` backend.
 */
async function benchWasm(language: TargetLanguage): Promise<BenchRow> {
  await ensureWebTreeSitterRuntime();

  // Only the selected grammar is resolved, so a package without a `.wasm` file
  // affects that language alone.
  const wasmSpecifiers: Record<TargetLanguage, string> = {
    javascript: "tree-sitter-javascript/tree-sitter-javascript.wasm",
    python: "tree-sitter-python/tree-sitter-python.wasm",
    rust: "tree-sitter-rust/tree-sitter-rust.wasm",
    java: "tree-sitter-java/tree-sitter-java.wasm"
  };
  const grammarWasm = require.resolve(wasmSpecifiers[language]);

  const coldStart = nowMs();
  const webLanguage = await WebLanguage.load(grammarWasm);
  const parser = new WebParser();
  try {
    parser.setLanguage(webLanguage);
    const firstTree = parser.parse(SAMPLES[language]);
    const coldMs = nowMs() - coldStart;
    // Read what is needed from the tree before freeing it.
    const available = Boolean(firstTree?.rootNode);
    firstTree?.delete();

    let parseTotalMs = 0;
    for (let i = 0; i < ITERATIONS; i += 1) {
      const start = nowMs();
      const tree = parser.parse(SAMPLES[language]);
      parseTotalMs += nowMs() - start;
      // Freed after the clock stops so cleanup is not counted as parse time.
      tree?.delete();
    }

    return {
      backend: "wasm_web_tree_sitter",
      language,
      available,
      cold_start_ms: formatMs(coldMs),
      per_file_ms: formatMs(parseTotalMs / ITERATIONS),
      parse_engine_ms: formatMs(parseTotalMs / ITERATIONS)
    };
  } finally {
    parser.delete();
  }
}
234
+
235
/**
 * Times parsing in a separate `node` process running `poc-node-parser-host.cjs`
 * in `--batch` mode.
 *
 * The wall-clock time of the whole child run gives `per_file_ms`, so it includes
 * process start-up. `parse_engine_ms` and `cold_start_ms` come from the figures
 * the host reports. Every failure (spawn error, signal, non-zero exit, output
 * that is not the expected JSON) is returned as an unavailable row with the
 * reason in `notes`, rather than thrown.
 *
 * @param language - Which sample to send to the host.
 * @returns The measurement row for the `node_boundary_native` backend.
 */
async function benchNodeBoundary(language: TargetLanguage): Promise<BenchRow> {
  const hostPath = fileURLToPath(new URL("./poc-node-parser-host.cjs", import.meta.url));
  // The source is passed base64-encoded as a single command-line argument.
  const sourceBase64 = Buffer.from(SAMPLES[language], "utf8").toString("base64");
  const start = nowMs();
  const child = spawnSync("node", [hostPath, "--batch", language, String(ITERATIONS), sourceBase64], {
    cwd: process.cwd(),
    encoding: "utf8"
  });
  const elapsed = nowMs() - start;

  // Builds the row shared by every failure path.
  const failure = (notes: string): BenchRow => ({
    backend: "node_boundary_native",
    language,
    available: false,
    cold_start_ms: formatMs(elapsed),
    per_file_ms: formatMs(elapsed / ITERATIONS),
    notes
  });

  // `error` is set when the process could not be spawned at all; in that case
  // there is no exit status and no stderr to report.
  if (child.error) {
    return failure(`failed to run node: ${child.error.message}`);
  }
  if (child.status !== 0) {
    const stderr = (child.stderr ?? "").trim();
    if (stderr) {
      return failure(stderr);
    }
    return failure(
      child.signal
        ? `node terminated by signal ${child.signal}`
        : `node exited with code ${String(child.status)}`
    );
  }

  // The host should print a single JSON line, but empty or garbled output must
  // not abort the whole benchmark run.
  let parsed: unknown;
  try {
    parsed = JSON.parse((child.stdout ?? "").trim());
  } catch (error) {
    return failure(`unparseable host output: ${error instanceof Error ? error.message : String(error)}`);
  }
  if (parsed === null || typeof parsed !== "object") {
    return failure("host output is not a JSON object");
  }

  const payload = parsed as {
    ok?: boolean;
    error?: string;
    init_ms?: number | null;
    parse_avg_ms?: number | null;
  };
  if (!payload.ok) {
    return failure(payload.error ?? "node boundary parse failed");
  }

  return {
    backend: "node_boundary_native",
    language,
    available: true,
    cold_start_ms: formatMs(payload.init_ms ?? elapsed),
    per_file_ms: formatMs(elapsed / ITERATIONS),
    parse_engine_ms: formatMs(payload.parse_avg_ms ?? 0),
    notes: "includes process boundary overhead"
  };
}
286
+
287
/**
 * Renders benchmark rows as tab-separated text with a header line.
 *
 * Numeric columns use three decimals and missing optional values become empty
 * cells. Free-text cells can carry arbitrary text, so runs of tabs and line
 * breaks in them are collapsed to one space; this keeps each row on a single
 * line with a fixed number of columns.
 *
 * @param rows - Measurements to render, in output order.
 * @returns The table as one string, lines joined with "\n".
 */
function renderTable(rows: BenchRow[]): string {
  const header = [
    "backend",
    "language",
    "available",
    "cold_start_ms",
    "per_file_ms",
    "parse_engine_ms",
    "fallback_reason",
    "notes"
  ];
  // Makes a free-text value safe to embed in a single TSV cell.
  const textCell = (value: string | undefined): string => (value ?? "").replace(/[\t\r\n]+/g, " ").trim();

  const lines = [header.join("\t")];
  for (const row of rows) {
    lines.push(
      [
        row.backend,
        row.language,
        row.available ? "yes" : "no",
        row.cold_start_ms.toFixed(3),
        row.per_file_ms.toFixed(3),
        row.parse_engine_ms?.toFixed(3) ?? "",
        textCell(row.fallback_reason),
        textCell(row.notes)
      ].join("\t")
    );
  }
  return lines.join("\n");
}
315
+
316
/**
 * Runs every backend against every language, one at a time, then prints the
 * chunking parser availability snapshot, the TSV table and the raw rows as JSON.
 */
async function main(): Promise<void> {
  // Taken before any benchmark runs.
  const baselineSnapshot = getChunkingParserAvailabilitySnapshot({
    enabled_languages: ["javascript", "python", "rust", "java", "typescript", "go"]
  });

  const languages: readonly TargetLanguage[] = ["javascript", "python", "rust", "java"];
  const backends = [benchBaselineChunking, benchNativeCompatible, benchWasm, benchNodeBoundary];

  const rows: BenchRow[] = [];
  for (const language of languages) {
    for (const runBackend of backends) {
      rows.push(await runBackend(language));
    }
  }

  const snapshotJson = JSON.stringify(baselineSnapshot, null, 2);
  const table = renderTable(rows);
  const rowsJson = JSON.stringify(rows, null, 2);

  console.log("=== parser availability snapshot (current chunking path) ===");
  console.log(snapshotJson);
  console.log("");
  console.log("=== benchmark table (tsv) ===");
  console.log(table);
  console.log("");
  console.log("=== benchmark rows (json) ===");
  console.log(rowsJson);
}

await main();