@axplusb/kepler 1.0.10 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@axplusb/kepler",
3
- "version": "1.0.10",
3
+ "version": "2.0.0",
4
4
  "description": "Kepler — AI coding agent with operating brief, preflight planning, and sub-agents. SWE-bench Lite evaluated.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -40,5 +40,8 @@
40
40
  "type": "git",
41
41
  "url": "git+https://github.com/raviakasapu/codekepler-npm.git"
42
42
  },
43
- "dependencies": {}
43
+ "dependencies": {
44
+ "tree-sitter-wasms": "^0.1.13",
45
+ "web-tree-sitter": "^0.26.9"
46
+ }
44
47
  }
@@ -4,12 +4,14 @@
4
4
  */
5
5
 
6
6
  import { BM25Index } from './bm25.mjs';
7
+ import { SymbolIndexer } from './symbol-indexer.mjs';
7
8
  import * as fs from 'node:fs';
8
9
  import * as path from 'node:path';
9
10
  import { indexDir as getIndexDir } from '../core/paths.mjs';
10
11
 
11
12
  const IGNORED_DIRS = new Set(['.git', 'node_modules', '.kepler', '__pycache__', '.venv', 'venv', 'dist', 'build', '.next']);
12
13
  const CODE_EXTS = new Set(['.js', '.mjs', '.ts', '.tsx', '.py', '.go', '.rs', '.java', '.rb', '.php', '.c', '.cpp', '.h', '.css', '.html', '.json', '.yaml', '.yml', '.toml', '.md', '.sh']);
14
+ const SYMBOL_EXTS = new Set(['.py', '.js', '.mjs', '.ts', '.tsx', '.jsx', '.go', '.rs']);
13
15
  const MAX_FILE_SIZE = 100_000; // 100KB
14
16
  const CHUNK_LINES = 50;
15
17
  const CHUNK_OVERLAP = 10;
@@ -19,20 +21,33 @@ export class ContextRetriever {
19
21
  this.projectDir = projectDir;
20
22
  this.indexDir = getIndexDir(projectDir);
21
23
  this.index = null;
24
+ this.symbolIndexer = null;
22
25
  this.chunkTexts = new Map(); // id → original text content
23
26
  }
24
27
 
25
- /** Build or rebuild the search index. */
28
+ /** Build or rebuild the search index (BM25 chunks + symbol index). */
26
29
  async buildIndex() {
27
30
  const files = this._scanFiles(this.projectDir);
28
31
  const documents = [];
29
32
 
33
+ // Symbol indexer for AST-based search
34
+ this.symbolIndexer = new SymbolIndexer();
35
+ await this.symbolIndexer.init();
36
+
30
37
  for (const filePath of files) {
31
38
  try {
32
39
  const content = fs.readFileSync(filePath, 'utf-8');
33
40
  const relPath = path.relative(this.projectDir, filePath);
41
+
42
+ // BM25 chunks (existing behavior)
34
43
  const chunks = this._chunkFile(content, relPath);
35
44
  documents.push(...chunks);
45
+
46
+ // Symbol extraction for code files
47
+ const ext = path.extname(filePath).toLowerCase();
48
+ if (SYMBOL_EXTS.has(ext)) {
49
+ await this.symbolIndexer.indexFile(relPath, content);
50
+ }
36
51
  } catch { /* skip unreadable files */ }
37
52
  }
38
53
 
@@ -45,12 +60,13 @@ export class ContextRetriever {
45
60
  this.chunkTexts.set(doc.id, doc.text);
46
61
  }
47
62
 
48
- // Persist index + chunk texts
63
+ // Persist
49
64
  if (!fs.existsSync(this.indexDir)) fs.mkdirSync(this.indexDir, { recursive: true });
50
65
  fs.writeFileSync(path.join(this.indexDir, 'bm25.json'), JSON.stringify(this.index.toJSON()));
51
66
  fs.writeFileSync(path.join(this.indexDir, 'chunks.json'), JSON.stringify(Object.fromEntries(this.chunkTexts)));
67
+ fs.writeFileSync(path.join(this.indexDir, 'symbols.json'), JSON.stringify(this.symbolIndexer.toJSON()));
52
68
 
53
- return { fileCount: files.length, chunkCount: documents.length };
69
+ return { fileCount: files.length, chunkCount: documents.length, symbolCount: this.symbolIndexer.symbolCount };
54
70
  }
55
71
 
56
72
  /**
@@ -120,22 +136,44 @@ export class ContextRetriever {
120
136
  loadIndex() {
121
137
  const indexPath = path.join(this.indexDir, 'bm25.json');
122
138
  const chunksPath = path.join(this.indexDir, 'chunks.json');
139
+ const symbolsPath = path.join(this.indexDir, 'symbols.json');
123
140
  if (!fs.existsSync(indexPath)) return false;
124
141
  try {
125
142
  const data = JSON.parse(fs.readFileSync(indexPath, 'utf-8'));
126
143
  this.index = BM25Index.fromJSON(data);
127
144
 
128
- // Load chunk texts if available
129
145
  if (fs.existsSync(chunksPath)) {
130
146
  const chunks = JSON.parse(fs.readFileSync(chunksPath, 'utf-8'));
131
147
  this.chunkTexts = new Map(Object.entries(chunks));
132
148
  }
149
+
150
+ if (fs.existsSync(symbolsPath)) {
151
+ const symData = JSON.parse(fs.readFileSync(symbolsPath, 'utf-8'));
152
+ this.symbolIndexer = SymbolIndexer.fromJSON(symData);
153
+ }
133
154
  return true;
134
155
  } catch {
135
156
  return false;
136
157
  }
137
158
  }
138
159
 
160
+ /**
161
+ * Search symbols (functions, classes, methods) by query.
162
+ * Returns structured results with file:line, signature, parent class.
163
+ */
164
+ searchSymbols(query, topK = 5) {
165
+ if (!this.symbolIndexer) return [];
166
+ return this.symbolIndexer.search(query, topK);
167
+ }
168
+
169
+ /**
170
+ * Format symbol search results for the agent.
171
+ */
172
+ formatSymbolResults(results) {
173
+ if (!this.symbolIndexer || !results.length) return '';
174
+ return this.symbolIndexer.formatResults(results);
175
+ }
176
+
139
177
  /** Retrieve relevant context chunks for a query, with full text. */
140
178
  retrieve(query, topK = 10) {
141
179
  if (!this.index) {
@@ -0,0 +1,375 @@
1
+ /**
2
+ * Symbol Indexer — AST-based code search using tree-sitter.
3
+ *
4
+ * Parses source files into symbols (functions, classes, methods) with
5
+ * signatures and line numbers. Indexes symbols in BM25 for search.
6
+ *
7
+ * Memory efficient: stores symbol signatures (~50 chars) not file chunks
8
+ * (~2000 chars). One tree-sitter parse per file, O(n) on file size.
9
+ *
10
+ * Usage:
11
+ * const indexer = new SymbolIndexer();
12
+ * await indexer.init(); // load WASM grammars once
13
+ * indexer.indexFile('/path/to/file.py', content);
14
+ * const results = indexer.search('find_ordering_name');
15
+ */
16
+
17
import * as fs from 'node:fs';
import * as path from 'node:path';
import { fileURLToPath } from 'node:url';
import { BM25Index } from './bm25.mjs';
20
+
21
// Filesystem path of the grammar directory that sits next to this module.
// fileURLToPath is used instead of URL#pathname because pathname stays
// percent-encoded (e.g. spaces become %20) and keeps a leading slash before
// Windows drive letters, so it is not a usable path for node:fs.
const GRAMMAR_DIR = fileURLToPath(new URL('./grammars/', import.meta.url));

// Lower-cased file extension → grammar name used to build the
// `tree-sitter-<name>.wasm` file name and to pick the AST walker.
// Extensions missing from this table are handled by the regex fallback only.
const LANG_MAP = {
  '.py': 'python',
  '.js': 'javascript',
  '.mjs': 'javascript',
  '.jsx': 'javascript',
  '.ts': 'typescript',
  '.tsx': 'typescript',
};
31
+
32
+ /**
33
+ * @typedef {Object} Symbol
34
+ * @property {string} name
35
+ * @property {string} kind - 'function' | 'class' | 'method'
36
+ * @property {string} file - relative path
37
+ * @property {number} line
38
+ * @property {number} endLine
39
+ * @property {string} signature - e.g., "def find_ordering_name(self, name, opts)"
40
+ * @property {string} [parent] - parent class name if method
41
+ * @property {string} [docstring] - first line of docstring
42
+ */
43
+
44
/**
 * Extracts named symbols (classes, functions, methods) from source text and
 * indexes them in a BM25 index for keyword search.
 *
 * Files are parsed with tree-sitter when the WASM runtime and a grammar for
 * the file's extension could be loaded; otherwise a line-based regex
 * extractor is used.
 */
export class SymbolIndexer {
  constructor() {
    this._Parser = null;          // parser instance, created by init()
    this._TreeSitter = null;      // parser class resolved by init()
    this._Language = null;        // object exposing the static Language.load()
    this._languages = {};         // ext → loaded Language
    this._symbols = [];           // all extracted symbols
    this._symbolMap = new Map();  // id → Symbol
    this._bm25 = new BM25Index();
    this._initialized = false;
  }

  /**
   * Load the tree-sitter WASM runtime. Grammars are loaded later, on demand,
   * by `_getLanguage`.
   *
   * Never throws: when the runtime cannot be loaded the indexer stays
   * uninitialized and `indexFile` uses the regex extractor.
   *
   * @returns {Promise<void>}
   */
  async init() {
    if (this._initialized) return;
    try {
      const mod = await import('web-tree-sitter');
      // web-tree-sitter >= 0.25 exposes named `Parser` / `Language` exports;
      // older releases default-export the parser class with `Language`
      // attached to it after init(). Support both layouts.
      const ParserClass = mod.Parser ?? mod.default;
      if (typeof ParserClass?.init !== 'function') {
        throw new TypeError('web-tree-sitter: no Parser export found');
      }
      await ParserClass.init();
      this._Parser = new ParserClass();
      this._TreeSitter = ParserClass;
      this._Language = mod.Language ?? ParserClass.Language ?? null;
      this._initialized = true;
    } catch {
      // Fallback: tree-sitter not available, use regex parser
      this._Parser = null;
      this._TreeSitter = null;
      this._Language = null;
      this._initialized = false;
    }
  }

  /**
   * Resolve (and cache) the tree-sitter Language for a file extension.
   *
   * @param {string} ext - lower-cased extension including the dot
   * @returns {Promise<object|null>} the loaded language, or null when the
   *   extension is unmapped, the runtime is unavailable, or no grammar file
   *   could be loaded
   */
  async _getLanguage(ext) {
    if (this._languages[ext]) return this._languages[ext];
    const langName = LANG_MAP[ext];
    const loader = this._Language ?? this._TreeSitter?.Language;
    if (!langName || typeof loader?.load !== 'function') return null;

    const wasmFile = `tree-sitter-${langName}.wasm`;

    // Bundled grammar directory first
    const wasmPaths = [path.join(GRAMMAR_DIR, wasmFile)];

    // Also check node_modules. fileURLToPath (not URL#pathname) so the result
    // is a decoded, platform-correct filesystem path.
    try {
      wasmPaths.push(fileURLToPath(
        new URL(`../../node_modules/tree-sitter-wasms/out/${wasmFile}`, import.meta.url),
      ));
    } catch { /* ignore */ }

    for (const p of wasmPaths) {
      try {
        if (fs.existsSync(p)) {
          const lang = await loader.load(p);
          this._languages[ext] = lang;
          return lang;
        }
      } catch { /* try next */ }
    }
    return null;
  }

  /**
   * Index a single file. Extracts symbols and adds to BM25.
   * @param {string} relPath - relative path (used as ID)
   * @param {string} content - file content
   * @returns {Promise<void>}
   */
  async indexFile(relPath, content) {
    const ext = path.extname(relPath).toLowerCase();
    const lang = await this._getLanguage(ext);

    let symbols = null;
    if (lang && this._Parser) {
      this._Parser.setLanguage(lang);
      const tree = this._Parser.parse(content);
      if (tree) {
        try {
          symbols = this._extractSymbols(tree.rootNode, relPath, ext);
        } finally {
          // The tree lives in WASM memory; release it even if extraction throws.
          tree.delete();
        }
      }
    }
    // No grammar, no parser, or the parser produced no tree.
    if (!symbols) symbols = this._regexExtract(relPath, content, ext);

    for (const sym of symbols) {
      const id = `${sym.file}:${sym.line}:${sym.name}`;
      this._symbols.push(sym);
      this._symbolMap.set(id, sym);

      // BM25 document: name + signature + parent + docstring
      const text = [
        sym.name,
        sym.signature || '',
        sym.parent ? `${sym.parent}.${sym.name}` : '',
        sym.docstring || '',
        sym.file,
      ].join(' ');
      this._bm25.addDocument(id, text);
    }
  }

  /**
   * Search for symbols matching a query.
   * @param {string} query
   * @param {number} [topK=10]
   * @returns {Array<{symbol: Symbol, score: number, id: string}>} hits whose
   *   id is still present in the symbol map
   */
  search(query, topK = 10) {
    const results = this._bm25.search(query, topK);
    return results.map(r => ({
      symbol: this._symbolMap.get(r.id),
      score: r.score,
      id: r.id,
    })).filter(r => r.symbol);
  }

  /**
   * Format search results for the agent, one `file:line signature "doc"` line
   * per result.
   * @param {Array<{symbol: Symbol}>} results
   * @returns {string} newline-joined lines, or '' for an empty list
   */
  formatResults(results) {
    if (!results.length) return '';
    return results.map(r => {
      const s = r.symbol;
      const parent = s.parent ? `${s.parent}.` : '';
      const doc = s.docstring ? ` "${s.docstring}"` : '';
      return `${s.file}:${s.line} ${parent}${s.signature || s.name}${doc}`;
    }).join('\n');
  }

  /** Number of symbols collected so far. */
  get symbolCount() { return this._symbols.length; }

  // ── Tree-sitter extraction ──

  /**
   * Dispatch to the AST walker matching the file's language.
   * @param {object} node - root node of the parsed tree
   * @param {string} file - relative path recorded on each symbol
   * @param {string} ext - lower-cased extension including the dot
   * @returns {Symbol[]}
   */
  _extractSymbols(node, file, ext) {
    const symbols = [];
    const langName = LANG_MAP[ext];

    if (langName === 'python') {
      this._walkPython(node, file, symbols, null);
    } else if (langName === 'javascript' || langName === 'typescript') {
      this._walkJS(node, file, symbols, null);
    }
    return symbols;
  }

  /**
   * Collect Python classes and functions below `node` into `symbols`.
   * Function bodies are not descended into; class bodies are, with
   * `parentClass` set so their functions are recorded as methods.
   */
  _walkPython(node, file, symbols, parentClass) {
    for (let i = 0; i < node.childCount; i++) {
      const child = node.child(i);
      if (!child) continue; // child() may yield null
      const type = child.type;

      if (type === 'class_definition') {
        const name = child.childForFieldName('name')?.text || '';
        const bases = child.childForFieldName('superclasses')?.text || '';
        symbols.push({
          name, kind: 'class', file,
          line: child.startPosition.row + 1,
          endLine: child.endPosition.row + 1,
          signature: `class ${name}${bases ? `(${bases})` : ''}`,
          docstring: this._pyDocstring(child),
        });
        // Recurse into class body for methods
        const body = child.childForFieldName('body');
        if (body) this._walkPython(body, file, symbols, name);
      } else if (type === 'function_definition') {
        const name = child.childForFieldName('name')?.text || '';
        const params = child.childForFieldName('parameters')?.text || '()';
        const returnType = child.childForFieldName('return_type')?.text || '';
        const sig = `def ${name}${params}${returnType ? ' -> ' + returnType : ''}`;
        symbols.push({
          name,
          kind: parentClass ? 'method' : 'function',
          file,
          line: child.startPosition.row + 1,
          endLine: child.endPosition.row + 1,
          signature: sig,
          parent: parentClass || undefined,
          docstring: this._pyDocstring(child),
        });
      } else if (type === 'decorated_definition') {
        // Unwrap decorator to get the actual definition: walking the wrapper
        // reaches the inner class/function with the same parent context.
        for (let j = 0; j < child.childCount; j++) {
          const inner = child.child(j);
          if (inner && (inner.type === 'function_definition' || inner.type === 'class_definition')) {
            this._walkPython(child, file, symbols, parentClass);
            break;
          }
        }
      } else if (!parentClass && child.childCount > 0) {
        // Recurse for module-level statements
        this._walkPython(child, file, symbols, parentClass);
      }
    }
  }

  /**
   * First non-empty line of a Python definition's docstring.
   * @param {object} defNode - class_definition or function_definition node
   * @returns {string} at most 120 characters, or '' when the body does not
   *   start with a string expression
   */
  _pyDocstring(defNode) {
    const body = defNode.childForFieldName('body');
    if (!body || body.childCount === 0) return '';
    const first = body.child(0);
    if (first?.type === 'expression_statement') {
      const expr = first.child(0);
      if (expr?.type === 'string' || expr?.type === 'concatenated_string') {
        const raw = expr.text;
        // Drop an optional string prefix (r, u, b, f and two-letter
        // combinations) together with the opening quotes, then the closing quotes.
        const content = raw
          .replace(/^[rRuUbBfF]{0,2}['"`]{1,3}/, '')
          .replace(/['"`]{1,3}$/, '');
        // Docstrings often start on the line after the opening quotes, so
        // take the first line that actually has text.
        const firstLine = content.split('\n').map(l => l.trim()).find(Boolean) || '';
        return firstLine.slice(0, 120);
      }
    }
    return '';
  }

  /**
   * Collect JS/TS classes, functions, methods and function-valued variable
   * declarations below `node` into `symbols`.
   */
  _walkJS(node, file, symbols, parentClass) {
    for (let i = 0; i < node.childCount; i++) {
      const child = node.child(i);
      if (!child) continue; // child() may yield null
      const type = child.type;

      if (type === 'class_declaration' || type === 'class') {
        const name = child.childForFieldName('name')?.text || '';
        symbols.push({
          name, kind: 'class', file,
          line: child.startPosition.row + 1,
          endLine: child.endPosition.row + 1,
          signature: `class ${name}`,
        });
        const body = child.childForFieldName('body');
        if (body) this._walkJS(body, file, symbols, name);
      } else if (type === 'function_declaration' || type === 'method_definition') {
        const name = child.childForFieldName('name')?.text || '';
        const params = child.childForFieldName('parameters')?.text || '()';
        symbols.push({
          name,
          kind: parentClass ? 'method' : 'function',
          file,
          line: child.startPosition.row + 1,
          endLine: child.endPosition.row + 1,
          signature: `${parentClass ? '' : 'function '}${name}${params}`,
          parent: parentClass || undefined,
        });
      } else if (type === 'export_statement' || type === 'lexical_declaration') {
        this._walkJS(child, file, symbols, parentClass);
      } else {
        if (type === 'variable_declarator') {
          // `const f = () => {}` / `const f = function () {}`
          const fnSymbol = this._jsFunctionVariable(child, file);
          if (fnSymbol) symbols.push(fnSymbol);
        }
        if (child.childCount > 0 && !parentClass) {
          this._walkJS(child, file, symbols, parentClass);
        }
      }
    }
  }

  /**
   * Build a function symbol for a variable declarator whose value is an arrow
   * function or function expression.
   * @param {object} declarator - variable_declarator node
   * @param {string} file - relative path recorded on the symbol
   * @returns {Symbol|null} null when the declarator does not bind a plain
   *   identifier to a function value
   */
  _jsFunctionVariable(declarator, file) {
    const nameNode = declarator.childForFieldName('name');
    const value = declarator.childForFieldName('value');
    if (!value || nameNode?.type !== 'identifier') return null;

    const isArrow = value.type === 'arrow_function';
    // 'function' is the node name older grammar releases use for function expressions.
    const isFnExpr = value.type === 'function_expression' || value.type === 'function';
    if (!isArrow && !isFnExpr) return null;

    // Arrow functions with one unparenthesized parameter use the `parameter` field.
    const single = value.childForFieldName('parameter')?.text;
    const params = value.childForFieldName('parameters')?.text || (single ? `(${single})` : '()');
    const name = nameNode.text;
    return {
      name,
      kind: 'function',
      file,
      line: declarator.startPosition.row + 1,
      endLine: declarator.endPosition.row + 1,
      signature: isArrow ? `${name} = ${params} =>` : `${name} = function${params}`,
    };
  }

  // ── Regex fallback (no tree-sitter) ──

  /**
   * Line-based symbol extraction for Python and JS/TS sources. Only
   * single-line headers are recognised, and symbols carry no `endLine`.
   *
   * Python ownership is tracked with a stack of enclosing `class` / `def`
   * headers keyed by indentation, so any indentation width works and a `def`
   * nested inside a function is not reported as a method.
   *
   * @param {string} file - relative path recorded on each symbol
   * @param {string} content - file content
   * @param {string} ext - lower-cased extension including the dot
   * @returns {Symbol[]}
   */
  _regexExtract(file, content, ext) {
    const symbols = [];
    const lines = content.split('\n');
    const isPython = ext === '.py';
    const isJs = ['.js', '.mjs', '.ts', '.tsx', '.jsx'].includes(ext);
    // Enclosing Python headers: className is null for `def` scopes.
    const scopes = [];

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      const trimmed = line.trim();
      if (!trimmed) continue; // blank lines never match and never close a scope
      const lineNum = i + 1;
      const indent = line.length - line.trimStart().length;

      // Python
      if (isPython) {
        if (!trimmed.startsWith('#')) {
          // A code line at or left of a header's indentation ends that scope.
          while (scopes.length && scopes[scopes.length - 1].indent >= indent) scopes.pop();
        }
        const classMatch = trimmed.match(/^class\s+(\w+)(?:\(([^)]*)\))?/);
        if (classMatch) {
          const className = classMatch[1];
          symbols.push({
            name: className, kind: 'class', file, line: lineNum,
            signature: `class ${className}${classMatch[2] ? `(${classMatch[2]})` : ''}`,
          });
          scopes.push({ indent, className });
          continue;
        }
        const fnMatch = trimmed.match(/^(?:async\s+)?def\s+(\w+)\s*\(([^)]*)\)/);
        if (fnMatch) {
          // A method is a def whose innermost enclosing header is a class.
          const owner = scopes.length ? scopes[scopes.length - 1].className : null;
          symbols.push({
            name: fnMatch[1],
            kind: owner ? 'method' : 'function',
            file, line: lineNum,
            signature: `def ${fnMatch[1]}(${fnMatch[2]})`,
            parent: owner || undefined,
          });
          scopes.push({ indent, className: null });
          continue;
        }
      }

      // JS/TS
      if (isJs) {
        const fnMatch = trimmed.match(/^(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\(([^)]*)\)/);
        if (fnMatch) {
          symbols.push({ name: fnMatch[1], kind: 'function', file, line: lineNum, signature: `function ${fnMatch[1]}(${fnMatch[2]})` });
        }
        const classMatch = trimmed.match(/^(?:export\s+)?class\s+(\w+)/);
        if (classMatch) {
          symbols.push({ name: classMatch[1], kind: 'class', file, line: lineNum, signature: `class ${classMatch[1]}` });
        }
      }
    }
    return symbols;
  }

  // ── Serialization ──

  /**
   * Plain-object snapshot of the symbol list and the BM25 index.
   * @returns {{symbols: Symbol[], bm25: *}}
   */
  toJSON() {
    return {
      symbols: this._symbols,
      bm25: this._bm25.toJSON(),
    };
  }

  /**
   * Rebuild an indexer from a `toJSON()` snapshot. The result is search-ready
   * without tree-sitter; because it is flagged as initialized, a later
   * `init()` call on it returns immediately.
   * @param {{symbols?: Symbol[], bm25: *}} data
   * @returns {SymbolIndexer}
   */
  static fromJSON(data) {
    const indexer = new SymbolIndexer();
    indexer._initialized = true; // don't need tree-sitter for search
    indexer._symbols = data.symbols || [];
    indexer._bm25 = BM25Index.fromJSON(data.bm25);
    for (const sym of indexer._symbols) {
      const id = `${sym.file}:${sym.line}:${sym.name}`;
      indexer._symbolMap.set(id, sym);
    }
    return indexer;
  }
}