@code-rag/core 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api-contracts/index.d.ts +2 -0
- package/dist/api-contracts/index.js +5 -0
- package/dist/api-contracts/viewer-contracts.d.ts +181 -0
- package/dist/api-contracts/viewer-contracts.js +124 -0
- package/dist/benchmarks/benchmark-evaluator.d.ts +84 -0
- package/dist/benchmarks/benchmark-evaluator.js +220 -0
- package/dist/benchmarks/index-scanner.d.ts +54 -0
- package/dist/benchmarks/index-scanner.js +135 -0
- package/dist/benchmarks/index.d.ts +6 -0
- package/dist/benchmarks/index.js +4 -0
- package/dist/benchmarks/query-generator.d.ts +68 -0
- package/dist/benchmarks/query-generator.js +205 -0
- package/dist/config/config-parser.js +27 -14
- package/dist/embedding/hybrid-search.js +15 -8
- package/dist/embedding/lancedb-store.d.ts +13 -0
- package/dist/embedding/lancedb-store.js +106 -12
- package/dist/index.d.ts +8 -1
- package/dist/index.js +5 -0
- package/dist/retrieval/context-expander.d.ts +4 -2
- package/dist/retrieval/context-expander.js +2 -2
- package/dist/retrieval/index.d.ts +1 -1
- package/dist/runtime.d.ts +37 -0
- package/dist/runtime.js +170 -0
- package/dist/utils/safe-cast.d.ts +32 -0
- package/dist/utils/safe-cast.js +76 -0
- package/package.json +5 -1
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Scans an existing CodeRAG index (LanceDB) to extract entity information
|
|
3
|
+
* used for auto-generating benchmark queries with ground truth.
|
|
4
|
+
*
|
|
5
|
+
* All functions are pure where possible, taking data as input rather than
|
|
6
|
+
* connecting to stores directly.
|
|
7
|
+
*/
|
|
8
|
+
import { ok, err } from 'neverthrow';
|
|
9
|
+
import { safeString, safeArray } from '../utils/safe-cast.js';
|
|
10
|
+
/**
 * Error raised when raw index rows cannot be parsed into scanned entities.
 * Wrapped in a neverthrow `err(...)` by `parseIndexRows` rather than thrown.
 */
export class IndexScanError extends Error {
    constructor(message) {
        super(message);
        // Explicit name so logs and error reports identify the subclass.
        this.name = 'IndexScanError';
    }
}
|
|
16
|
+
/** Every chunk type a row's metadata may legitimately declare. */
const CHUNK_TYPES = [
    'function', 'method', 'class', 'module', 'interface',
    'type_alias', 'config_block', 'import_block', 'doc',
];
/** Coerce an arbitrary value to a known chunk type, defaulting to 'function'. */
function toChunkType(value) {
    return CHUNK_TYPES.includes(value) ? value : 'function';
}
|
|
24
|
+
/** Narrow an unknown value to an array of strings; non-string items are dropped. */
function toStringArray(value) {
    return safeArray(value, []).filter((item) => typeof item === 'string');
}
|
|
28
|
+
/**
 * Convert raw index rows (from LanceDBStore.getAll()) into ScannedEntity objects.
 * This is a pure function that operates on already-fetched data.
 *
 * Rows with absent/null metadata are tolerated: every field falls back to its
 * safe default instead of aborting the whole parse with an IndexScanError.
 *
 * @param rows Array of `{ id, metadata }` objects scanned from the index.
 * @returns `ok(IndexScanResult)` with entities plus lookup maps keyed by
 *          chunk ID, entity name, and file path; `err(IndexScanError)` only
 *          on an unexpected failure.
 */
export function parseIndexRows(rows) {
    // Append `value` to the array stored under `key`, creating it on first use.
    const addToMultiMap = (map, key, value) => {
        const existing = map.get(key);
        if (existing) {
            existing.push(value);
        }
        else {
            map.set(key, [value]);
        }
    };
    try {
        const entities = [];
        const entityMap = new Map();
        const nameToChunkIds = new Map();
        const fileToChunkIds = new Map();
        for (const row of rows) {
            // Guard: a row with missing metadata yields an entity of defaults
            // rather than throwing and failing the entire scan.
            const meta = row.metadata ?? {};
            const name = safeString(meta['name'], '');
            const chunkType = toChunkType(safeString(meta['chunk_type'], 'function'));
            const filePath = safeString(meta['file_path'], '');
            const language = safeString(meta['language'], 'unknown');
            const nlSummary = safeString(meta['nl_summary'], '');
            const imports = toStringArray(meta['imports']);
            const exports = toStringArray(meta['exports']);
            const declarations = toStringArray(meta['declarations']);
            const entity = {
                chunkId: row.id,
                name,
                chunkType,
                filePath,
                language,
                nlSummary,
                imports,
                exports,
                declarations,
            };
            entities.push(entity);
            entityMap.set(row.id, entity);
            // Index by name (skip empty names)
            if (name.length > 0) {
                addToMultiMap(nameToChunkIds, name, row.id);
            }
            // Index by file path
            if (filePath.length > 0) {
                addToMultiMap(fileToChunkIds, filePath, row.id);
            }
        }
        return ok({
            entities,
            totalChunks: entities.length,
            entityMap,
            nameToChunkIds,
            fileToChunkIds,
        });
    }
    catch (error) {
        const message = error instanceof Error ? error.message : 'Unknown error';
        return err(new IndexScanError(`Failed to parse index rows: ${message}`));
    }
}
|
|
95
|
+
/**
 * Build a caller map from graph edges.
 * Maps a target chunkId to the source chunkIds that reference it through
 * 'calls', 'references', or 'imports' edges; other edge types are ignored.
 */
export function buildCallerMap(edges) {
    const callerMap = new Map();
    for (const { type, source, target } of edges) {
        const isCallerEdge = type === 'calls' || type === 'references' || type === 'imports';
        if (!isCallerEdge) {
            continue;
        }
        const sources = callerMap.get(target);
        if (sources === undefined) {
            callerMap.set(target, [source]);
        }
        else {
            sources.push(source);
        }
    }
    return callerMap;
}
|
|
114
|
+
/**
 * Build a test file map: maps source file paths to test file chunk IDs.
 * Heuristic: a file at `foo.test.ts` or `foo.spec.ts` is the test for `foo.ts`.
 *
 * @param fileToChunkIds Map of file path to the chunk IDs indexed from it.
 * @returns Map of derived source file path to the chunk IDs of its test file(s).
 */
export function buildTestMap(fileToChunkIds) {
    const testMap = new Map();
    for (const [filePath, chunkIds] of fileToChunkIds) {
        const isTestFile = /\.(test|spec)\.[^.]+$/.test(filePath);
        if (isTestFile) {
            // Derive the source file path. The replace is anchored to the final
            // extension (mirroring the detection regex) so a `.test.`/`.spec.`
            // segment earlier in the path — e.g. a directory named
            // `utils.test.helpers/` — is never mangled.
            const sourceFilePath = filePath.replace(/\.(test|spec)\.([^.]+)$/, '.$2');
            const existing = testMap.get(sourceFilePath);
            if (existing) {
                existing.push(...chunkIds);
            }
            else {
                testMap.set(sourceFilePath, [...chunkIds]);
            }
        }
    }
    return testMap;
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
// Public surface of the benchmarks module: type-only and runtime re-exports
// from the scanner, query generator, and evaluator submodules.
export type { ScannedEntity, IndexScanResult, } from './index-scanner.js';
export { IndexScanError, parseIndexRows, buildCallerMap, buildTestMap, } from './index-scanner.js';
export type { BenchmarkQueryType, GeneratedQuery, QueryGeneratorOptions, } from './query-generator.js';
export { generateQueries, generateFindByNameQueries, generateFindByDescriptionQueries, generateFindCallersQueries, generateFindTestsQueries, generateFindImportsQueries, } from './query-generator.js';
export type { QueryEvalResult, QueryMetrics, AggregateEvalMetrics, QueryTypeBreakdown, BenchmarkReport, BenchmarkMetadata, SearchFn, BenchmarkProgressFn, } from './benchmark-evaluator.js';
export { BenchmarkEvalError, computeQueryMetrics, computeAggregateMetrics, computeQueryTypeBreakdown, runBenchmark, formatBenchmarkSummary, } from './benchmark-evaluator.js';
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
// Auto-generated benchmark module — barrel export
// (runtime values only; type-only exports live in the matching .d.ts)
export { IndexScanError, parseIndexRows, buildCallerMap, buildTestMap, } from './index-scanner.js';
export { generateQueries, generateFindByNameQueries, generateFindByDescriptionQueries, generateFindCallersQueries, generateFindTestsQueries, generateFindImportsQueries, } from './query-generator.js';
export { BenchmarkEvalError, computeQueryMetrics, computeAggregateMetrics, computeQueryTypeBreakdown, runBenchmark, formatBenchmarkSummary, } from './benchmark-evaluator.js';
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/**
 * Auto-generates benchmark queries with ground-truth from scanned index data.
 *
 * Query types:
 * - find-by-name: "Where is the X function/class/interface?"
 * - find-by-description: Uses NL summary as query text
 * - find-callers: "What calls X?" or "What references X?"
 * - find-tests: "Tests for X" or "Test file for foo.ts"
 * - find-imports: "What does X import?" or "What imports X?"
 *
 * Each query has a ground-truth set of expected chunk IDs.
 */
import type { ScannedEntity, IndexScanResult } from './index-scanner.js';
import type { GraphEdge } from '../graph/dependency-graph.js';
/** The type of benchmark query generated. */
export type BenchmarkQueryType = 'find-by-name' | 'find-by-description' | 'find-callers' | 'find-tests' | 'find-imports';
/** A single auto-generated benchmark query with ground truth. */
export interface GeneratedQuery {
    /** Natural-language query text fed to the search system under test. */
    readonly query: string;
    /** Chunk IDs a perfect retrieval would return for this query. */
    readonly expectedChunkIds: readonly string[];
    /** Which generation strategy produced this query. */
    readonly queryType: BenchmarkQueryType;
    /** Source entity that inspired this query. */
    readonly sourceEntityId: string;
}
/** Options for query generation. */
export interface QueryGeneratorOptions {
    /** Total number of queries to generate (default: 100). */
    readonly maxQueries: number;
    /** Distribution of query types as fractions (must sum to 1.0). */
    readonly distribution?: Readonly<Record<BenchmarkQueryType, number>>;
}
/**
 * Generate find-by-name queries.
 * Query: "Where is the <name> <type>?" or "<name> <type>"
 * Ground truth: chunk ID of the entity itself.
 */
export declare function generateFindByNameQueries(entities: readonly ScannedEntity[], count: number, random: () => number): GeneratedQuery[];
/**
 * Generate find-by-description queries.
 * Query: The NL summary of the entity.
 * Ground truth: chunk ID of the entity itself.
 */
export declare function generateFindByDescriptionQueries(entities: readonly ScannedEntity[], count: number, random: () => number): GeneratedQuery[];
/**
 * Generate find-callers queries.
 * Query: "What calls <name>?" or "callers of <name>"
 * Ground truth: chunk IDs of callers from the dependency graph.
 */
export declare function generateFindCallersQueries(entities: readonly ScannedEntity[], callerMap: ReadonlyMap<string, readonly string[]>, count: number, random: () => number): GeneratedQuery[];
/**
 * Generate find-tests queries.
 * Query: "tests for <name>" or "test file for <filePath>"
 * Ground truth: chunk IDs in the corresponding test file.
 */
export declare function generateFindTestsQueries(entities: readonly ScannedEntity[], testMap: ReadonlyMap<string, readonly string[]>, count: number, random: () => number): GeneratedQuery[];
/**
 * Generate find-imports queries.
 * Query: "imports of <name>" or "what does <name> import"
 * Ground truth: chunk IDs of imported modules (resolved via name map).
 */
export declare function generateFindImportsQueries(entities: readonly ScannedEntity[], edges: readonly GraphEdge[], count: number, random: () => number): GeneratedQuery[];
/**
 * Generate a complete benchmark dataset from scanned index data.
 *
 * Distributes queries across types according to the configured distribution,
 * using a deterministic seeded RNG for reproducibility.
 *
 * @param seed RNG seed; identical seeds produce identical datasets (default 42).
 */
export declare function generateQueries(scanResult: IndexScanResult, edges: readonly GraphEdge[], callerMap: ReadonlyMap<string, readonly string[]>, testMap: ReadonlyMap<string, readonly string[]>, options: QueryGeneratorOptions, seed?: number): readonly GeneratedQuery[];
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Auto-generates benchmark queries with ground-truth from scanned index data.
|
|
3
|
+
*
|
|
4
|
+
* Query types:
|
|
5
|
+
* - find-by-name: "Where is the X function/class/interface?"
|
|
6
|
+
* - find-by-description: Uses NL summary as query text
|
|
7
|
+
* - find-callers: "What calls X?" or "What references X?"
|
|
8
|
+
* - find-tests: "Tests for X" or "Test file for foo.ts"
|
|
9
|
+
* - find-imports: "What does X import?" or "What imports X?"
|
|
10
|
+
*
|
|
11
|
+
* Each query has a ground-truth set of expected chunk IDs.
|
|
12
|
+
*/
|
|
13
|
+
/**
 * Default fraction of the benchmark devoted to each query type.
 * Fractions sum to 1.0; overridable via QueryGeneratorOptions.distribution.
 */
const DEFAULT_DISTRIBUTION = {
    'find-by-name': 0.30,
    'find-by-description': 0.25,
    'find-callers': 0.15,
    'find-tests': 0.15,
    'find-imports': 0.15,
};
/** Types eligible for name-based queries (skip import_block, config_block). */
const NAME_QUERY_TYPES = new Set([
    'function', 'method', 'class', 'interface', 'type_alias', 'module',
]);
|
|
24
|
+
/**
 * Deterministic seeded pseudo-random number generator (mulberry32).
 * Returns a closure producing floats in [0, 1); identical seeds yield
 * identical sequences, which keeps benchmark datasets reproducible.
 */
function createSeededRandom(seed) {
    let s = seed | 0;
    return function next() {
        s = (s + 0x6D2B79F5) | 0;
        let z = Math.imul(s ^ (s >>> 15), s | 1);
        z ^= z + Math.imul(z ^ (z >>> 7), z | 61);
        return ((z ^ (z >>> 14)) >>> 0) / 0x100000000;
    };
}
|
|
37
|
+
/**
 * Shuffle an array deterministically using Fisher-Yates driven by the
 * supplied RNG. The input array is never mutated; a shuffled copy is returned.
 */
function shuffleDeterministic(items, random) {
    const out = items.slice();
    for (let idx = out.length - 1; idx > 0; idx -= 1) {
        const pick = Math.floor(random() * (idx + 1));
        [out[idx], out[pick]] = [out[pick], out[idx]];
    }
    return out;
}
|
|
50
|
+
/**
 * Generate find-by-name queries.
 * Query text: "<name> <type>" (e.g. "parseIndexRows function"); type_alias
 * entities are labelled "type". Ground truth: the entity's own chunk ID.
 */
export function generateFindByNameQueries(entities, count, random) {
    const candidates = entities.filter((entity) => entity.name.length > 0 && NAME_QUERY_TYPES.has(entity.chunkType));
    if (candidates.length === 0) {
        return [];
    }
    return shuffleDeterministic(candidates, random)
        .slice(0, count)
        .map((entity) => ({
            query: `${entity.name} ${entity.chunkType === 'type_alias' ? 'type' : entity.chunkType}`,
            expectedChunkIds: [entity.chunkId],
            queryType: 'find-by-name',
            sourceEntityId: entity.chunkId,
        }));
}
|
|
71
|
+
/**
 * Generate find-by-description queries.
 * Query text: the entity's NL summary (only summaries longer than 20 chars
 * are used so the query carries enough signal).
 * Ground truth: the entity's own chunk ID.
 */
export function generateFindByDescriptionQueries(entities, count, random) {
    const candidates = entities.filter((entity) => entity.nlSummary.length > 20 && NAME_QUERY_TYPES.has(entity.chunkType));
    if (candidates.length === 0) {
        return [];
    }
    const picked = shuffleDeterministic(candidates, random).slice(0, count);
    return picked.map((entity) => ({
        query: entity.nlSummary,
        expectedChunkIds: [entity.chunkId],
        queryType: 'find-by-description',
        sourceEntityId: entity.chunkId,
    }));
}
|
|
89
|
+
/**
 * Generate find-callers queries ("callers of <name>").
 * Only named entities with at least one caller edge are eligible.
 * Ground truth: the caller chunk IDs plus the entity's own chunk ID.
 */
export function generateFindCallersQueries(entities, callerMap, count, random) {
    const hasCallers = (entity) => (callerMap.get(entity.chunkId)?.length ?? 0) > 0;
    const candidates = entities.filter((entity) => entity.name.length > 0 && hasCallers(entity));
    if (candidates.length === 0) {
        return [];
    }
    const picked = shuffleDeterministic(candidates, random).slice(0, count);
    return picked.map((entity) => {
        const callers = callerMap.get(entity.chunkId) ?? [];
        return {
            query: `callers of ${entity.name}`,
            expectedChunkIds: [...callers, entity.chunkId],
            queryType: 'find-callers',
            sourceEntityId: entity.chunkId,
        };
    });
}
|
|
111
|
+
/**
 * Generate find-tests queries ("tests for <name>").
 * Only named, name-queryable entities whose file has a mapped test file
 * are eligible.
 * Ground truth: chunk IDs in the test file plus the entity's own chunk ID.
 */
export function generateFindTestsQueries(entities, testMap, count, random) {
    const candidates = entities.filter((entity) => {
        if (entity.name.length === 0 || !NAME_QUERY_TYPES.has(entity.chunkType)) {
            return false;
        }
        return (testMap.get(entity.filePath)?.length ?? 0) > 0;
    });
    if (candidates.length === 0) {
        return [];
    }
    const picked = shuffleDeterministic(candidates, random).slice(0, count);
    return picked.map((entity) => ({
        query: `tests for ${entity.name}`,
        expectedChunkIds: [...(testMap.get(entity.filePath) ?? []), entity.chunkId],
        queryType: 'find-tests',
        sourceEntityId: entity.chunkId,
    }));
}
|
|
135
|
+
/**
 * Generate find-imports queries ("imports of <name>").
 * Import targets are collected from 'imports' graph edges; only named,
 * name-queryable entities that import at least one module are eligible.
 * Ground truth: the imported chunk IDs plus the entity's own chunk ID.
 */
export function generateFindImportsQueries(entities, edges, count, random) {
    // source chunkId -> list of imported target chunkIds
    const importTargets = new Map();
    for (const edge of edges) {
        if (edge.type !== 'imports') {
            continue;
        }
        const targets = importTargets.get(edge.source);
        if (targets === undefined) {
            importTargets.set(edge.source, [edge.target]);
        }
        else {
            targets.push(edge.target);
        }
    }
    const candidates = entities.filter((entity) => entity.name.length > 0 &&
        NAME_QUERY_TYPES.has(entity.chunkType) &&
        (importTargets.get(entity.chunkId)?.length ?? 0) > 0);
    if (candidates.length === 0) {
        return [];
    }
    const picked = shuffleDeterministic(candidates, random).slice(0, count);
    return picked.map((entity) => ({
        query: `imports of ${entity.name}`,
        expectedChunkIds: [...(importTargets.get(entity.chunkId) ?? []), entity.chunkId],
        queryType: 'find-imports',
        sourceEntityId: entity.chunkId,
    }));
}
|
|
171
|
+
/**
 * Generate a complete benchmark dataset from scanned index data.
 *
 * Queries are distributed across the five query types according to the
 * configured distribution, generated with a deterministic seeded RNG, then
 * shuffled and trimmed to `maxQueries`. The same seed always yields the
 * same dataset.
 */
export function generateQueries(scanResult, edges, callerMap, testMap, options, seed = 42) {
    const random = createSeededRandom(seed);
    const distribution = options.distribution ?? DEFAULT_DISTRIBUTION;
    const { maxQueries } = options;
    // Per-type target count, rounded from the fractional distribution.
    const countFor = (type) => Math.round(maxQueries * distribution[type]);
    const { entities } = scanResult;
    // NOTE: generator call order matters — each call advances the shared RNG,
    // so reordering would change the (still deterministic) output.
    const combined = [
        ...generateFindByNameQueries(entities, countFor('find-by-name'), random),
        ...generateFindByDescriptionQueries(entities, countFor('find-by-description'), random),
        ...generateFindCallersQueries(entities, callerMap, countFor('find-callers'), random),
        ...generateFindTestsQueries(entities, testMap, countFor('find-tests'), random),
        ...generateFindImportsQueries(entities, edges, countFor('find-imports'), random),
    ];
    // Interleave query types for fair evaluation, then trim rounding overshoot.
    return shuffleDeterministic(combined, random).slice(0, maxQueries);
}
|
|
@@ -3,6 +3,7 @@ import { join } from 'node:path';
|
|
|
3
3
|
import { ok, err } from 'neverthrow';
|
|
4
4
|
import { parse } from 'yaml';
|
|
5
5
|
import { z } from 'zod';
|
|
6
|
+
import { safeString, safeRecord } from '../utils/safe-cast.js';
|
|
6
7
|
export class ConfigError extends Error {
|
|
7
8
|
constructor(message) {
|
|
8
9
|
super(message);
|
|
@@ -156,8 +157,10 @@ export function interpolateEnvVars(obj) {
|
|
|
156
157
|
return result;
|
|
157
158
|
}
|
|
158
159
|
if (obj !== null && typeof obj === 'object') {
|
|
160
|
+
// Runtime guard above ensures obj is a non-null object (not an array — handled earlier)
|
|
161
|
+
const record = safeRecord(obj, {});
|
|
159
162
|
const result = {};
|
|
160
|
-
for (const [key, value] of Object.entries(
|
|
163
|
+
for (const [key, value] of Object.entries(record)) {
|
|
161
164
|
const interpolated = interpolateEnvVars(value);
|
|
162
165
|
if (interpolated instanceof ConfigError)
|
|
163
166
|
return interpolated;
|
|
@@ -181,8 +184,10 @@ function normalizeEmbeddingConfig(embeddingPartial) {
|
|
|
181
184
|
const defaults = { ...DEFAULT_CONFIG.embedding };
|
|
182
185
|
const merged = { ...defaults, ...embeddingPartial };
|
|
183
186
|
// Support snake_case key from YAML: openai_compatible -> openaiCompatible
|
|
184
|
-
const
|
|
185
|
-
|
|
187
|
+
const openaiCompatRaw = merged['openaiCompatible'] ?? merged['openai_compatible'];
|
|
188
|
+
const openaiCompat = openaiCompatRaw !== undefined && openaiCompatRaw !== null
|
|
189
|
+
? safeRecord(openaiCompatRaw, {})
|
|
190
|
+
: undefined;
|
|
186
191
|
// Remove the snake_case variant so only the camelCase one remains
|
|
187
192
|
delete merged['openai_compatible'];
|
|
188
193
|
if (openaiCompat) {
|
|
@@ -208,35 +213,42 @@ function normalizeEmbeddingConfig(embeddingPartial) {
|
|
|
208
213
|
}
|
|
209
214
|
return merged;
|
|
210
215
|
}
|
|
216
|
+
/** Extract a sub-record from a config object, returning undefined if not a valid record. */
function optionalRecord(value) {
    const isRecord = value !== null && typeof value === 'object' && !Array.isArray(value);
    return isRecord ? safeRecord(value) : undefined;
}
|
|
211
223
|
function applyDefaults(partial) {
|
|
212
224
|
return {
|
|
213
|
-
version: partial['version']
|
|
225
|
+
version: safeString(partial['version'], DEFAULT_CONFIG.version),
|
|
214
226
|
project: {
|
|
215
227
|
...DEFAULT_CONFIG.project,
|
|
216
|
-
...partial['project'],
|
|
228
|
+
...optionalRecord(partial['project']),
|
|
217
229
|
},
|
|
218
230
|
ingestion: {
|
|
219
231
|
...DEFAULT_CONFIG.ingestion,
|
|
220
|
-
...partial['ingestion'],
|
|
232
|
+
...optionalRecord(partial['ingestion']),
|
|
221
233
|
},
|
|
222
|
-
embedding: normalizeEmbeddingConfig(partial['embedding']),
|
|
234
|
+
embedding: normalizeEmbeddingConfig(optionalRecord(partial['embedding'])),
|
|
223
235
|
llm: {
|
|
224
236
|
...DEFAULT_CONFIG.llm,
|
|
225
|
-
...partial['llm'],
|
|
237
|
+
...optionalRecord(partial['llm']),
|
|
226
238
|
},
|
|
227
239
|
search: {
|
|
228
240
|
...DEFAULT_CONFIG.search,
|
|
229
|
-
...partial['search'],
|
|
241
|
+
...optionalRecord(partial['search']),
|
|
230
242
|
},
|
|
231
243
|
storage: {
|
|
232
244
|
...DEFAULT_CONFIG.storage,
|
|
233
|
-
...partial['storage'],
|
|
245
|
+
...optionalRecord(partial['storage']),
|
|
234
246
|
},
|
|
235
247
|
...(partial['reranker'] !== undefined
|
|
236
248
|
? {
|
|
237
249
|
reranker: {
|
|
238
250
|
...DEFAULT_CONFIG.reranker,
|
|
239
|
-
...partial['reranker'],
|
|
251
|
+
...optionalRecord(partial['reranker']),
|
|
240
252
|
},
|
|
241
253
|
}
|
|
242
254
|
: {}),
|
|
@@ -256,7 +268,7 @@ function deepMerge(target, source) {
|
|
|
256
268
|
const sv = source[key];
|
|
257
269
|
const tv = target[key];
|
|
258
270
|
if (sv !== null && typeof sv === 'object' && !Array.isArray(sv) && tv !== null && typeof tv === 'object' && !Array.isArray(tv)) {
|
|
259
|
-
result[key] = deepMerge(tv, sv);
|
|
271
|
+
result[key] = deepMerge(safeRecord(tv), safeRecord(sv));
|
|
260
272
|
}
|
|
261
273
|
else {
|
|
262
274
|
result[key] = sv;
|
|
@@ -290,7 +302,7 @@ export async function loadConfig(rootDir) {
|
|
|
290
302
|
const localContent = await readFile(localPath, 'utf-8');
|
|
291
303
|
const localParsed = parse(localContent);
|
|
292
304
|
if (localParsed !== null && localParsed !== undefined && typeof localParsed === 'object') {
|
|
293
|
-
parsed = deepMerge(parsed, localParsed);
|
|
305
|
+
parsed = deepMerge(safeRecord(parsed, {}), safeRecord(localParsed, {}));
|
|
294
306
|
}
|
|
295
307
|
}
|
|
296
308
|
catch {
|
|
@@ -301,10 +313,11 @@ export async function loadConfig(rootDir) {
|
|
|
301
313
|
if (interpolated instanceof ConfigError) {
|
|
302
314
|
return err(interpolated);
|
|
303
315
|
}
|
|
304
|
-
const withDefaults = applyDefaults(interpolated);
|
|
316
|
+
const withDefaults = applyDefaults(safeRecord(interpolated, {}));
|
|
305
317
|
const validationResult = codeRAGConfigSchema.safeParse(withDefaults);
|
|
306
318
|
if (!validationResult.success) {
|
|
307
319
|
return err(new ConfigError(`Config validation failed: ${formatZodErrors(validationResult.error)}`));
|
|
308
320
|
}
|
|
321
|
+
// eslint-disable-next-line @typescript-eslint/consistent-type-assertions -- Zod schema structurally matches CodeRAGConfig; safeParse validates all fields
|
|
309
322
|
return ok(validationResult.data);
|
|
310
323
|
}
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
import { ok, err } from 'neverthrow';
|
|
2
2
|
import { EmbedError } from '../types/provider.js';
|
|
3
|
+
import { safeString, safeStringUnion } from '../utils/safe-cast.js';
|
|
3
4
|
const RRF_K = 60;
|
|
5
|
+
const CHUNK_TYPES = [
|
|
6
|
+
'function', 'method', 'class', 'module', 'interface',
|
|
7
|
+
'type_alias', 'config_block', 'import_block', 'doc',
|
|
8
|
+
];
|
|
4
9
|
export class HybridSearch {
|
|
5
10
|
vectorStore;
|
|
6
11
|
bm25Index;
|
|
@@ -86,10 +91,12 @@ export class HybridSearch {
|
|
|
86
91
|
else {
|
|
87
92
|
// Vector-only hit: hydrate from vector store metadata
|
|
88
93
|
const meta = vectorMetadataMap.get(chunkId) ?? {};
|
|
89
|
-
const storedName = meta['name']
|
|
90
|
-
const storedChunkType = meta['chunk_type']
|
|
91
|
-
const storedFilePath = meta['file_path']
|
|
92
|
-
const storedLanguage = meta['language']
|
|
94
|
+
const storedName = safeString(meta['name'], '');
|
|
95
|
+
const storedChunkType = safeStringUnion(meta['chunk_type'], CHUNK_TYPES, 'function');
|
|
96
|
+
const storedFilePath = safeString(meta['file_path'], '');
|
|
97
|
+
const storedLanguage = safeString(meta['language'], 'unknown');
|
|
98
|
+
const storedContent = safeString(meta['content'], '');
|
|
99
|
+
const storedNlSummary = safeString(meta['nl_summary'], '');
|
|
93
100
|
const chunkMetadata = {
|
|
94
101
|
chunkType: storedChunkType,
|
|
95
102
|
name: storedName,
|
|
@@ -99,15 +106,15 @@ export class HybridSearch {
|
|
|
99
106
|
};
|
|
100
107
|
merged.push({
|
|
101
108
|
chunkId,
|
|
102
|
-
content:
|
|
103
|
-
nlSummary:
|
|
109
|
+
content: storedContent,
|
|
110
|
+
nlSummary: storedNlSummary,
|
|
104
111
|
score: fusedScore,
|
|
105
112
|
method: 'hybrid',
|
|
106
113
|
metadata: chunkMetadata,
|
|
107
114
|
chunk: {
|
|
108
115
|
id: chunkId,
|
|
109
|
-
content:
|
|
110
|
-
nlSummary:
|
|
116
|
+
content: storedContent,
|
|
117
|
+
nlSummary: storedNlSummary,
|
|
111
118
|
filePath: storedFilePath,
|
|
112
119
|
startLine: 0,
|
|
113
120
|
endLine: 0,
|
|
@@ -17,5 +17,18 @@ export declare class LanceDBStore implements VectorStore {
|
|
|
17
17
|
}[], StoreError>>;
|
|
18
18
|
delete(ids: string[]): Promise<Result<void, StoreError>>;
|
|
19
19
|
count(): Promise<Result<number, StoreError>>;
|
|
20
|
+
getById(id: string): Promise<Result<{
|
|
21
|
+
id: string;
|
|
22
|
+
metadata: Record<string, unknown>;
|
|
23
|
+
} | undefined, StoreError>>;
|
|
24
|
+
/**
|
|
25
|
+
* Scan all rows from the table.
|
|
26
|
+
* Returns an array of { id, metadata } objects (no vectors).
|
|
27
|
+
* Useful for index analysis and benchmark generation.
|
|
28
|
+
*/
|
|
29
|
+
getAll(limit?: number): Promise<Result<{
|
|
30
|
+
id: string;
|
|
31
|
+
metadata: Record<string, unknown>;
|
|
32
|
+
}[], StoreError>>;
|
|
20
33
|
close(): void;
|
|
21
34
|
}
|