@code-rag/core 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,135 @@
1
+ /**
2
+ * Scans an existing CodeRAG index (LanceDB) to extract entity information
3
+ * used for auto-generating benchmark queries with ground truth.
4
+ *
5
+ * All functions are pure where possible, taking data as input rather than
6
+ * connecting to stores directly.
7
+ */
8
+ import { ok, err } from 'neverthrow';
9
+ import { safeString, safeArray } from '../utils/safe-cast.js';
10
/**
 * Error raised when scanning or parsing an existing CodeRAG index fails.
 * The `name` property is set explicitly so `err.name === 'IndexScanError'`
 * holds even after the error crosses serialization/logging boundaries.
 */
export class IndexScanError extends Error {
    constructor(message) {
        super(message);
        this.name = 'IndexScanError';
    }
}
16
/** Every chunk type the index may store; used to validate raw metadata. */
const CHUNK_TYPES = [
    'function', 'method', 'class', 'module', 'interface',
    'type_alias', 'config_block', 'import_block', 'doc',
];
/**
 * Narrow an arbitrary string to a known chunk type, falling back to
 * 'function' for anything unrecognized.
 */
function toChunkType(value) {
    return CHUNK_TYPES.includes(value) ? value : 'function';
}
24
/** Coerce an unknown metadata value into a string array, dropping non-string items. */
function toStringArray(value) {
    const strings = [];
    for (const item of safeArray(value, [])) {
        if (typeof item === 'string') {
            strings.push(item);
        }
    }
    return strings;
}
28
/**
 * Convert raw index rows (from LanceDBStore.getAll()) into ScannedEntity objects.
 * This is a pure function that operates on already-fetched data — it never
 * connects to a store.
 *
 * Returns ok({ entities, totalChunks, entityMap, nameToChunkIds, fileToChunkIds })
 * on success, or err(IndexScanError) if a row is malformed enough to throw.
 */
export function parseIndexRows(rows) {
    const entities = [];
    const entityMap = new Map();
    const nameToChunkIds = new Map();
    const fileToChunkIds = new Map();
    // Append `id` to the bucket for `key`, creating the bucket on first use.
    const addToBucket = (map, key, id) => {
        const bucket = map.get(key);
        if (bucket) {
            bucket.push(id);
        }
        else {
            map.set(key, [id]);
        }
    };
    try {
        for (const row of rows) {
            const meta = row.metadata;
            const entity = {
                chunkId: row.id,
                name: safeString(meta['name'], ''),
                chunkType: toChunkType(safeString(meta['chunk_type'], 'function')),
                filePath: safeString(meta['file_path'], ''),
                language: safeString(meta['language'], 'unknown'),
                nlSummary: safeString(meta['nl_summary'], ''),
                imports: toStringArray(meta['imports']),
                exports: toStringArray(meta['exports']),
                declarations: toStringArray(meta['declarations']),
            };
            entities.push(entity);
            entityMap.set(entity.chunkId, entity);
            // Secondary lookups; entities with empty names/paths are skipped.
            if (entity.name.length > 0) {
                addToBucket(nameToChunkIds, entity.name, entity.chunkId);
            }
            if (entity.filePath.length > 0) {
                addToBucket(fileToChunkIds, entity.filePath, entity.chunkId);
            }
        }
        return ok({
            entities,
            totalChunks: entities.length,
            entityMap,
            nameToChunkIds,
            fileToChunkIds,
        });
    }
    catch (error) {
        const message = error instanceof Error ? error.message : 'Unknown error';
        return err(new IndexScanError(`Failed to parse index rows: ${message}`));
    }
}
95
/**
 * Build a caller map from graph edges.
 * Maps each target chunkId to the source chunkIds that call, reference,
 * or import it. Edges of any other type are ignored.
 */
export function buildCallerMap(edges) {
    const callerEdgeTypes = new Set(['calls', 'references', 'imports']);
    const callerMap = new Map();
    for (const edge of edges) {
        if (!callerEdgeTypes.has(edge.type)) {
            continue;
        }
        const sources = callerMap.get(edge.target);
        if (sources === undefined) {
            callerMap.set(edge.target, [edge.source]);
        }
        else {
            sources.push(edge.source);
        }
    }
    return callerMap;
}
114
/**
 * Build a test file map: maps source file paths to test file chunk IDs.
 * Heuristic: a file at `foo.test.ts` or `foo.spec.ts` is the test for `foo.ts`.
 */
export function buildTestMap(fileToChunkIds) {
    // Anchored suffix: captures the trailing `.test.<ext>` / `.spec.<ext>`.
    const testSuffix = /\.(test|spec)(\.[^.]+)$/;
    const testMap = new Map();
    for (const [filePath, chunkIds] of fileToChunkIds) {
        const match = testSuffix.exec(filePath);
        if (!match) {
            continue;
        }
        // Derive the source path from the anchored match itself. A first-occurrence
        // replace could strip an earlier `.test.`/`.spec.` than the suffix that was
        // detected (e.g. `a.test.spec.ts` -> `a.spec.ts` instead of `a.test.ts`).
        const sourceFilePath = filePath.slice(0, match.index) + match[2];
        const existing = testMap.get(sourceFilePath);
        if (existing) {
            existing.push(...chunkIds);
        }
        else {
            testMap.set(sourceFilePath, [...chunkIds]);
        }
    }
    return testMap;
}
@@ -0,0 +1,6 @@
1
+ export type { ScannedEntity, IndexScanResult, } from './index-scanner.js';
2
+ export { IndexScanError, parseIndexRows, buildCallerMap, buildTestMap, } from './index-scanner.js';
3
+ export type { BenchmarkQueryType, GeneratedQuery, QueryGeneratorOptions, } from './query-generator.js';
4
+ export { generateQueries, generateFindByNameQueries, generateFindByDescriptionQueries, generateFindCallersQueries, generateFindTestsQueries, generateFindImportsQueries, } from './query-generator.js';
5
+ export type { QueryEvalResult, QueryMetrics, AggregateEvalMetrics, QueryTypeBreakdown, BenchmarkReport, BenchmarkMetadata, SearchFn, BenchmarkProgressFn, } from './benchmark-evaluator.js';
6
+ export { BenchmarkEvalError, computeQueryMetrics, computeAggregateMetrics, computeQueryTypeBreakdown, runBenchmark, formatBenchmarkSummary, } from './benchmark-evaluator.js';
@@ -0,0 +1,4 @@
1
+ // Auto-generated benchmark module — barrel export
2
+ export { IndexScanError, parseIndexRows, buildCallerMap, buildTestMap, } from './index-scanner.js';
3
+ export { generateQueries, generateFindByNameQueries, generateFindByDescriptionQueries, generateFindCallersQueries, generateFindTestsQueries, generateFindImportsQueries, } from './query-generator.js';
4
+ export { BenchmarkEvalError, computeQueryMetrics, computeAggregateMetrics, computeQueryTypeBreakdown, runBenchmark, formatBenchmarkSummary, } from './benchmark-evaluator.js';
@@ -0,0 +1,68 @@
1
/**
 * Auto-generates benchmark queries with ground-truth from scanned index data.
 *
 * Query types:
 * - find-by-name: "Where is the X function/class/interface?"
 * - find-by-description: Uses NL summary as query text
 * - find-callers: "What calls X?" or "What references X?"
 * - find-tests: "Tests for X" or "Test file for foo.ts"
 * - find-imports: "What does X import?" or "What imports X?"
 *
 * Each query has a ground-truth set of expected chunk IDs.
 */
import type { ScannedEntity, IndexScanResult } from './index-scanner.js';
import type { GraphEdge } from '../graph/dependency-graph.js';
/** The type of benchmark query generated. */
export type BenchmarkQueryType = 'find-by-name' | 'find-by-description' | 'find-callers' | 'find-tests' | 'find-imports';
/** A single auto-generated benchmark query with ground truth. */
export interface GeneratedQuery {
    /** Natural-language query text to feed to the search engine. */
    readonly query: string;
    /** Chunk IDs a correct search result set is expected to contain. */
    readonly expectedChunkIds: readonly string[];
    /** Which generator produced this query. */
    readonly queryType: BenchmarkQueryType;
    /** Source entity that inspired this query. */
    readonly sourceEntityId: string;
}
/** Options for query generation. */
export interface QueryGeneratorOptions {
    /** Total number of queries to generate (required; callers typically pass ~100). */
    readonly maxQueries: number;
    /** Distribution of query types as fractions (must sum to 1.0); a default split is used when omitted. */
    readonly distribution?: Readonly<Record<BenchmarkQueryType, number>>;
}
/**
 * Generate find-by-name queries.
 * Query: "Where is the <name> <type>?" or "<name> <type>"
 * Ground truth: chunk ID of the entity itself.
 *
 * @param random - deterministic RNG returning values in [0, 1); drives candidate shuffling
 */
export declare function generateFindByNameQueries(entities: readonly ScannedEntity[], count: number, random: () => number): GeneratedQuery[];
/**
 * Generate find-by-description queries.
 * Query: The NL summary of the entity.
 * Ground truth: chunk ID of the entity itself.
 */
export declare function generateFindByDescriptionQueries(entities: readonly ScannedEntity[], count: number, random: () => number): GeneratedQuery[];
/**
 * Generate find-callers queries.
 * Query: "What calls <name>?" or "callers of <name>"
 * Ground truth: chunk IDs of callers from the dependency graph.
 *
 * @param callerMap - target chunkId -> caller chunkIds (see buildCallerMap)
 */
export declare function generateFindCallersQueries(entities: readonly ScannedEntity[], callerMap: ReadonlyMap<string, readonly string[]>, count: number, random: () => number): GeneratedQuery[];
/**
 * Generate find-tests queries.
 * Query: "tests for <name>" or "test file for <filePath>"
 * Ground truth: chunk IDs in the corresponding test file.
 *
 * @param testMap - source file path -> test-file chunk IDs (see buildTestMap)
 */
export declare function generateFindTestsQueries(entities: readonly ScannedEntity[], testMap: ReadonlyMap<string, readonly string[]>, count: number, random: () => number): GeneratedQuery[];
/**
 * Generate find-imports queries.
 * Query: "imports of <name>" or "what does <name> import"
 * Ground truth: chunk IDs of imported modules (resolved via name map).
 */
export declare function generateFindImportsQueries(entities: readonly ScannedEntity[], edges: readonly GraphEdge[], count: number, random: () => number): GeneratedQuery[];
/**
 * Generate a complete benchmark dataset from scanned index data.
 *
 * Distributes queries across types according to the configured distribution,
 * using a deterministic seeded RNG for reproducibility: the same inputs and
 * seed always produce the same query set.
 */
export declare function generateQueries(scanResult: IndexScanResult, edges: readonly GraphEdge[], callerMap: ReadonlyMap<string, readonly string[]>, testMap: ReadonlyMap<string, readonly string[]>, options: QueryGeneratorOptions, seed?: number): readonly GeneratedQuery[];
@@ -0,0 +1,205 @@
1
+ /**
2
+ * Auto-generates benchmark queries with ground-truth from scanned index data.
3
+ *
4
+ * Query types:
5
+ * - find-by-name: "Where is the X function/class/interface?"
6
+ * - find-by-description: Uses NL summary as query text
7
+ * - find-callers: "What calls X?" or "What references X?"
8
+ * - find-tests: "Tests for X" or "Test file for foo.ts"
9
+ * - find-imports: "What does X import?" or "What imports X?"
10
+ *
11
+ * Each query has a ground-truth set of expected chunk IDs.
12
+ */
13
/**
 * Default fraction of the benchmark devoted to each query type.
 * Fractions sum to 1.0; generateQueries multiplies each by maxQueries.
 */
const DEFAULT_DISTRIBUTION = {
    'find-by-name': 0.30,
    'find-by-description': 0.25,
    'find-callers': 0.15,
    'find-tests': 0.15,
    'find-imports': 0.15,
};
/** Types eligible for name-based queries (skip import_block, config_block). */
const NAME_QUERY_TYPES = new Set([
    'function', 'method', 'class', 'interface', 'type_alias', 'module',
]);
24
/**
 * Deterministic seeded pseudo-random number generator (mulberry32),
 * returning floats in [0, 1). Identical seeds yield identical sequences,
 * so benchmark datasets are reproducible across runs.
 */
function createSeededRandom(seed) {
    let s = seed | 0;
    return function next() {
        s = (s + 0x6D2B79F5) | 0;
        let x = Math.imul(s ^ (s >>> 15), 1 | s);
        x = (x + Math.imul(x ^ (x >>> 7), 61 | x)) ^ x;
        return ((x ^ (x >>> 14)) >>> 0) / 4294967296;
    };
}
37
/**
 * Fisher–Yates shuffle driven by the supplied RNG. The input array is left
 * untouched; a shuffled copy is returned. Deterministic whenever `random` is.
 */
function shuffleDeterministic(items, random) {
    const shuffled = Array.from(items);
    for (let i = shuffled.length - 1; i > 0; i--) {
        const j = Math.floor(random() * (i + 1));
        [shuffled[i], shuffled[j]] = [shuffled[j], shuffled[i]];
    }
    return shuffled;
}
50
/**
 * Generate find-by-name queries ("<name> <type>").
 * Only named entities whose type is name-addressable are eligible.
 * Ground truth: the chunk ID of the entity itself.
 */
export function generateFindByNameQueries(entities, count, random) {
    const candidates = entities.filter((entity) => entity.name.length > 0 && NAME_QUERY_TYPES.has(entity.chunkType));
    if (candidates.length === 0) {
        return [];
    }
    const picked = shuffleDeterministic(candidates, random).slice(0, count);
    return picked.map((entity) => {
        // 'type_alias' reads awkwardly in a query; render it as plain 'type'.
        const typeLabel = entity.chunkType === 'type_alias' ? 'type' : entity.chunkType;
        return {
            query: `${entity.name} ${typeLabel}`,
            expectedChunkIds: [entity.chunkId],
            queryType: 'find-by-name',
            sourceEntityId: entity.chunkId,
        };
    });
}
71
/**
 * Generate find-by-description queries.
 * Query: the entity's NL summary, used verbatim as the search text.
 * Ground truth: the chunk ID of the entity itself.
 */
export function generateFindByDescriptionQueries(entities, count, random) {
    // Summaries of 20 chars or fewer are too thin to act as a search query.
    const candidates = entities.filter((entity) => entity.nlSummary.length > 20 && NAME_QUERY_TYPES.has(entity.chunkType));
    if (candidates.length === 0) {
        return [];
    }
    const picked = shuffleDeterministic(candidates, random).slice(0, count);
    const queries = [];
    for (const entity of picked) {
        queries.push({
            query: entity.nlSummary,
            expectedChunkIds: [entity.chunkId],
            queryType: 'find-by-description',
            sourceEntityId: entity.chunkId,
        });
    }
    return queries;
}
89
/**
 * Generate find-callers queries ("callers of <name>").
 * Only entities with at least one recorded caller are eligible.
 * Ground truth: all caller chunk IDs plus the entity itself.
 */
export function generateFindCallersQueries(entities, callerMap, count, random) {
    const callersOf = (chunkId) => callerMap.get(chunkId) ?? [];
    const candidates = entities.filter((entity) => entity.name.length > 0 && callersOf(entity.chunkId).length > 0);
    if (candidates.length === 0) {
        return [];
    }
    const picked = shuffleDeterministic(candidates, random).slice(0, count);
    return picked.map((entity) => ({
        query: `callers of ${entity.name}`,
        expectedChunkIds: [...callersOf(entity.chunkId), entity.chunkId],
        queryType: 'find-callers',
        sourceEntityId: entity.chunkId,
    }));
}
111
/**
 * Generate find-tests queries ("tests for <name>").
 * Only entities whose file has a matching test file are eligible.
 * Ground truth: chunk IDs inside that test file, plus the entity itself.
 */
export function generateFindTestsQueries(entities, testMap, count, random) {
    const candidates = [];
    for (const entity of entities) {
        if (entity.name.length === 0)
            continue;
        if (!NAME_QUERY_TYPES.has(entity.chunkType))
            continue;
        if ((testMap.get(entity.filePath) ?? []).length === 0)
            continue;
        candidates.push(entity);
    }
    if (candidates.length === 0) {
        return [];
    }
    const picked = shuffleDeterministic(candidates, random).slice(0, count);
    return picked.map((entity) => ({
        query: `tests for ${entity.name}`,
        expectedChunkIds: [...(testMap.get(entity.filePath) ?? []), entity.chunkId],
        queryType: 'find-tests',
        sourceEntityId: entity.chunkId,
    }));
}
135
/**
 * Generate find-imports queries ("imports of <name>").
 * Import targets are collected from the graph's 'imports' edges.
 * Ground truth: imported chunk IDs plus the entity itself.
 */
export function generateFindImportsQueries(entities, edges, count, random) {
    // source chunkId -> chunkIds it imports (from 'imports' edges only)
    const importTargets = new Map();
    for (const edge of edges) {
        if (edge.type !== 'imports') {
            continue;
        }
        const bucket = importTargets.get(edge.source);
        if (bucket === undefined) {
            importTargets.set(edge.source, [edge.target]);
        }
        else {
            bucket.push(edge.target);
        }
    }
    const candidates = entities.filter((entity) => entity.name.length > 0 &&
        NAME_QUERY_TYPES.has(entity.chunkType) &&
        (importTargets.get(entity.chunkId) ?? []).length > 0);
    if (candidates.length === 0) {
        return [];
    }
    const picked = shuffleDeterministic(candidates, random).slice(0, count);
    return picked.map((entity) => ({
        query: `imports of ${entity.name}`,
        expectedChunkIds: [...(importTargets.get(entity.chunkId) ?? []), entity.chunkId],
        queryType: 'find-imports',
        sourceEntityId: entity.chunkId,
    }));
}
171
/**
 * Generate a complete benchmark dataset from scanned index data.
 *
 * Splits the maxQueries budget across query types per the configured
 * distribution, generates each slice, then shuffles the combined set
 * (so evaluation order doesn't correlate with query type) and trims to
 * maxQueries. A seeded RNG makes the whole pipeline reproducible.
 */
export function generateQueries(scanResult, edges, callerMap, testMap, options, seed = 42) {
    const random = createSeededRandom(seed);
    const distribution = options.distribution ?? DEFAULT_DISTRIBUTION;
    const { maxQueries } = options;
    // Budget for one query type, rounded to the nearest whole query.
    const countFor = (queryType) => Math.round(maxQueries * distribution[queryType]);
    const { entities } = scanResult;
    // Generator call order matters: each one consumes the shared RNG stream.
    const allQueries = [
        ...generateFindByNameQueries(entities, countFor('find-by-name'), random),
        ...generateFindByDescriptionQueries(entities, countFor('find-by-description'), random),
        ...generateFindCallersQueries(entities, callerMap, countFor('find-callers'), random),
        ...generateFindTestsQueries(entities, testMap, countFor('find-tests'), random),
        ...generateFindImportsQueries(entities, edges, countFor('find-imports'), random),
    ];
    // Rounding may overshoot the budget; the final slice enforces it.
    return shuffleDeterministic(allQueries, random).slice(0, maxQueries);
}
@@ -3,6 +3,7 @@ import { join } from 'node:path';
3
3
  import { ok, err } from 'neverthrow';
4
4
  import { parse } from 'yaml';
5
5
  import { z } from 'zod';
6
+ import { safeString, safeRecord } from '../utils/safe-cast.js';
6
7
  export class ConfigError extends Error {
7
8
  constructor(message) {
8
9
  super(message);
@@ -156,8 +157,10 @@ export function interpolateEnvVars(obj) {
156
157
  return result;
157
158
  }
158
159
  if (obj !== null && typeof obj === 'object') {
160
+ // Runtime guard above ensures obj is a non-null object (not an array — handled earlier)
161
+ const record = safeRecord(obj, {});
159
162
  const result = {};
160
- for (const [key, value] of Object.entries(obj)) {
163
+ for (const [key, value] of Object.entries(record)) {
161
164
  const interpolated = interpolateEnvVars(value);
162
165
  if (interpolated instanceof ConfigError)
163
166
  return interpolated;
@@ -181,8 +184,10 @@ function normalizeEmbeddingConfig(embeddingPartial) {
181
184
  const defaults = { ...DEFAULT_CONFIG.embedding };
182
185
  const merged = { ...defaults, ...embeddingPartial };
183
186
  // Support snake_case key from YAML: openai_compatible -> openaiCompatible
184
- const openaiCompat = merged['openaiCompatible'] ??
185
- merged['openai_compatible'];
187
+ const openaiCompatRaw = merged['openaiCompatible'] ?? merged['openai_compatible'];
188
+ const openaiCompat = openaiCompatRaw !== undefined && openaiCompatRaw !== null
189
+ ? safeRecord(openaiCompatRaw, {})
190
+ : undefined;
186
191
  // Remove the snake_case variant so only the camelCase one remains
187
192
  delete merged['openai_compatible'];
188
193
  if (openaiCompat) {
@@ -208,35 +213,42 @@ function normalizeEmbeddingConfig(embeddingPartial) {
208
213
  }
209
214
  return merged;
210
215
  }
216
/** Extract a sub-record from a config object, returning undefined if not a valid record. */
function optionalRecord(value) {
    // Arrays are objects too, so they must be excluded explicitly.
    const isPlainObject = value !== null && typeof value === 'object' && !Array.isArray(value);
    return isPlainObject ? safeRecord(value) : undefined;
}
211
223
  function applyDefaults(partial) {
212
224
  return {
213
- version: partial['version'] ?? DEFAULT_CONFIG.version,
225
+ version: safeString(partial['version'], DEFAULT_CONFIG.version),
214
226
  project: {
215
227
  ...DEFAULT_CONFIG.project,
216
- ...partial['project'],
228
+ ...optionalRecord(partial['project']),
217
229
  },
218
230
  ingestion: {
219
231
  ...DEFAULT_CONFIG.ingestion,
220
- ...partial['ingestion'],
232
+ ...optionalRecord(partial['ingestion']),
221
233
  },
222
- embedding: normalizeEmbeddingConfig(partial['embedding']),
234
+ embedding: normalizeEmbeddingConfig(optionalRecord(partial['embedding'])),
223
235
  llm: {
224
236
  ...DEFAULT_CONFIG.llm,
225
- ...partial['llm'],
237
+ ...optionalRecord(partial['llm']),
226
238
  },
227
239
  search: {
228
240
  ...DEFAULT_CONFIG.search,
229
- ...partial['search'],
241
+ ...optionalRecord(partial['search']),
230
242
  },
231
243
  storage: {
232
244
  ...DEFAULT_CONFIG.storage,
233
- ...partial['storage'],
245
+ ...optionalRecord(partial['storage']),
234
246
  },
235
247
  ...(partial['reranker'] !== undefined
236
248
  ? {
237
249
  reranker: {
238
250
  ...DEFAULT_CONFIG.reranker,
239
- ...partial['reranker'],
251
+ ...optionalRecord(partial['reranker']),
240
252
  },
241
253
  }
242
254
  : {}),
@@ -256,7 +268,7 @@ function deepMerge(target, source) {
256
268
  const sv = source[key];
257
269
  const tv = target[key];
258
270
  if (sv !== null && typeof sv === 'object' && !Array.isArray(sv) && tv !== null && typeof tv === 'object' && !Array.isArray(tv)) {
259
- result[key] = deepMerge(tv, sv);
271
+ result[key] = deepMerge(safeRecord(tv), safeRecord(sv));
260
272
  }
261
273
  else {
262
274
  result[key] = sv;
@@ -290,7 +302,7 @@ export async function loadConfig(rootDir) {
290
302
  const localContent = await readFile(localPath, 'utf-8');
291
303
  const localParsed = parse(localContent);
292
304
  if (localParsed !== null && localParsed !== undefined && typeof localParsed === 'object') {
293
- parsed = deepMerge(parsed, localParsed);
305
+ parsed = deepMerge(safeRecord(parsed, {}), safeRecord(localParsed, {}));
294
306
  }
295
307
  }
296
308
  catch {
@@ -301,10 +313,11 @@ export async function loadConfig(rootDir) {
301
313
  if (interpolated instanceof ConfigError) {
302
314
  return err(interpolated);
303
315
  }
304
- const withDefaults = applyDefaults(interpolated);
316
+ const withDefaults = applyDefaults(safeRecord(interpolated, {}));
305
317
  const validationResult = codeRAGConfigSchema.safeParse(withDefaults);
306
318
  if (!validationResult.success) {
307
319
  return err(new ConfigError(`Config validation failed: ${formatZodErrors(validationResult.error)}`));
308
320
  }
321
+ // eslint-disable-next-line @typescript-eslint/consistent-type-assertions -- Zod schema structurally matches CodeRAGConfig; safeParse validates all fields
309
322
  return ok(validationResult.data);
310
323
  }
@@ -1,6 +1,11 @@
1
1
  import { ok, err } from 'neverthrow';
2
2
  import { EmbedError } from '../types/provider.js';
3
+ import { safeString, safeStringUnion } from '../utils/safe-cast.js';
3
4
  const RRF_K = 60;
5
+ const CHUNK_TYPES = [
6
+ 'function', 'method', 'class', 'module', 'interface',
7
+ 'type_alias', 'config_block', 'import_block', 'doc',
8
+ ];
4
9
  export class HybridSearch {
5
10
  vectorStore;
6
11
  bm25Index;
@@ -86,10 +91,12 @@ export class HybridSearch {
86
91
  else {
87
92
  // Vector-only hit: hydrate from vector store metadata
88
93
  const meta = vectorMetadataMap.get(chunkId) ?? {};
89
- const storedName = meta['name'] ?? '';
90
- const storedChunkType = meta['chunk_type'] ?? 'function';
91
- const storedFilePath = meta['file_path'] ?? '';
92
- const storedLanguage = meta['language'] ?? 'unknown';
94
+ const storedName = safeString(meta['name'], '');
95
+ const storedChunkType = safeStringUnion(meta['chunk_type'], CHUNK_TYPES, 'function');
96
+ const storedFilePath = safeString(meta['file_path'], '');
97
+ const storedLanguage = safeString(meta['language'], 'unknown');
98
+ const storedContent = safeString(meta['content'], '');
99
+ const storedNlSummary = safeString(meta['nl_summary'], '');
93
100
  const chunkMetadata = {
94
101
  chunkType: storedChunkType,
95
102
  name: storedName,
@@ -99,15 +106,15 @@ export class HybridSearch {
99
106
  };
100
107
  merged.push({
101
108
  chunkId,
102
- content: meta['content'] ?? '',
103
- nlSummary: meta['nl_summary'] ?? '',
109
+ content: storedContent,
110
+ nlSummary: storedNlSummary,
104
111
  score: fusedScore,
105
112
  method: 'hybrid',
106
113
  metadata: chunkMetadata,
107
114
  chunk: {
108
115
  id: chunkId,
109
- content: meta['content'] ?? '',
110
- nlSummary: meta['nl_summary'] ?? '',
116
+ content: storedContent,
117
+ nlSummary: storedNlSummary,
111
118
  filePath: storedFilePath,
112
119
  startLine: 0,
113
120
  endLine: 0,
@@ -17,5 +17,18 @@ export declare class LanceDBStore implements VectorStore {
17
17
  }[], StoreError>>;
18
18
  delete(ids: string[]): Promise<Result<void, StoreError>>;
19
19
  count(): Promise<Result<number, StoreError>>;
20
+ getById(id: string): Promise<Result<{
21
+ id: string;
22
+ metadata: Record<string, unknown>;
23
+ } | undefined, StoreError>>;
24
+ /**
25
+ * Scan all rows from the table.
26
+ * Returns an array of { id, metadata } objects (no vectors).
27
+ * Useful for index analysis and benchmark generation.
28
+ */
29
+ getAll(limit?: number): Promise<Result<{
30
+ id: string;
31
+ metadata: Record<string, unknown>;
32
+ }[], StoreError>>;
20
33
  close(): void;
21
34
  }