codebasesearch 0.1.23 → 0.1.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.prd CHANGED
@@ -1,78 +1 @@
1
- {
2
- "project": "code-search",
3
- "created": "2026-03-12",
4
- "objective": "Profile and improve code-search speed and result quality",
5
- "items": [
6
- {
7
- "id": "fix-dedup-buildtextindex",
8
- "subject": "Remove duplicate buildTextIndex from search.js",
9
- "status": "pending",
10
- "description": "search.js has a private copy of buildTextIndex and tokenize/extractSymbols/isCodeFile that duplicates text-search.js. Import the exported buildTextIndex from text-search.js instead.",
11
- "category": "refactor",
12
- "effort": "small",
13
- "blocking": ["fix-score-normalization", "fix-hybrid-weights"],
14
- "blockedBy": []
15
- },
16
- {
17
- "id": "fix-chunk-size",
18
- "subject": "Reduce chunk size from 300 to 60 lines for better semantic granularity",
19
- "status": "pending",
20
- "description": "scanner.js uses 300-line chunks. Embeddings work best on 50-100 line chunks. Reduce to 60-line chunks with 15-line overlap for better vector search quality.",
21
- "category": "feature",
22
- "effort": "small",
23
- "blocking": [],
24
- "blockedBy": []
25
- },
26
- {
27
- "id": "fix-score-normalization",
28
- "subject": "Fix text search score normalization so top result is always 1.0",
29
- "status": "pending",
30
- "description": "Text scores divide raw by 100 but scores can exceed 100. Use dynamic max-score scaling. Lower hasGoodTextResults threshold from 0.5 to 0.3.",
31
- "category": "bug",
32
- "effort": "small",
33
- "blocking": [],
34
- "blockedBy": ["fix-dedup-buildtextindex"]
35
- },
36
- {
37
- "id": "fix-hybrid-weights",
38
- "subject": "Boost text-only exact-match results in hybrid merge",
39
- "status": "pending",
40
- "description": "Text-only results are capped at 20% weight. Give high-scoring text-only results a floor finalScore of 0.4.",
41
- "category": "feature",
42
- "effort": "small",
43
- "blocking": [],
44
- "blockedBy": ["fix-dedup-buildtextindex"]
45
- },
46
- {
47
- "id": "fix-vector-cache-key",
48
- "subject": "Strengthen vector search cache key to 20 dimensions",
49
- "status": "pending",
50
- "description": "Cache key uses only first 5 embedding dims. Use 20 dims for near-zero collision rate.",
51
- "category": "bug",
52
- "effort": "small",
53
- "blocking": [],
54
- "blockedBy": []
55
- },
56
- {
57
- "id": "remove-dead-meanpooling",
58
- "subject": "Remove dead meanPooling function from embeddings.js",
59
- "status": "pending",
60
- "description": "meanPooling is defined but never called. Remove dead code.",
61
- "category": "refactor",
62
- "effort": "small",
63
- "blocking": [],
64
- "blockedBy": []
65
- },
66
- {
67
- "id": "verify-and-commit",
68
- "subject": "Verify improvements and commit all changes",
69
- "status": "pending",
70
- "description": "Run end-to-end search logic test inline. Commit and push all changes.",
71
- "category": "infra",
72
- "effort": "small",
73
- "blocking": [],
74
- "blockedBy": ["fix-dedup-buildtextindex", "fix-chunk-size", "fix-score-normalization", "fix-hybrid-weights", "fix-vector-cache-key", "remove-dead-meanpooling"]
75
- }
76
- ],
77
- "completed": []
78
- }
1
+ {}
package/mcp.js CHANGED
@@ -1,6 +1,5 @@
1
1
  #!/usr/bin/env node
2
2
 
3
- // MUST patch sharp before any other imports
4
3
  import fs from 'fs';
5
4
  import path from 'path';
6
5
  import { fileURLToPath } from 'url';
@@ -26,147 +25,76 @@ import { join } from 'path';
26
25
  import { existsSync, readFileSync, appendFileSync, writeFileSync } from 'fs';
27
26
  import { supervisor } from './src/supervisor.js';
28
27
 
29
- async function ensureIgnoreEntry(rootPath) {
28
+ function ensureIgnoreEntry(rootPath) {
30
29
  const gitignorePath = join(rootPath, '.gitignore');
31
30
  const entry = '.code-search/';
32
-
33
31
  try {
34
32
  if (existsSync(gitignorePath)) {
35
33
  const content = readFileSync(gitignorePath, 'utf8');
36
- if (!content.includes(entry)) {
37
- appendFileSync(gitignorePath, `\n${entry}`);
38
- }
34
+ if (!content.includes(entry)) appendFileSync(gitignorePath, `\n${entry}`);
39
35
  } else {
40
36
  writeFileSync(gitignorePath, `${entry}\n`);
41
37
  }
42
- } catch (e) {
43
- // Ignore write errors
44
- }
38
+ } catch (e) {}
45
39
  }
46
40
 
47
- const server = new Server(
48
- {
49
- name: 'code-search-mcp',
50
- version: '0.1.0',
51
- },
52
- {
53
- capabilities: {
54
- tools: {},
55
- },
56
- }
57
- );
41
+ function formatResults(result, query) {
42
+ if (result.resultsCount === 0) return `No results found for: "${query}"`;
43
+ const plural = result.resultsCount !== 1 ? 's' : '';
44
+ const header = `Found ${result.resultsCount} result${plural} for: "${query}"\n\n`;
45
+ const body = result.results.map((r) => {
46
+ const pathPart = r.relativePath || r.absolutePath;
47
+ const lineCount = r.totalLines ? ` [${r.totalLines}L]` : '';
48
+ const ctx = r.enclosingContext ? ` (in: ${r.enclosingContext})` : '';
49
+ const rHeader = `${r.rank}. ${pathPart}${lineCount}:${r.lines}${ctx} (score: ${r.score}%)`;
50
+ const rBody = r.snippet.split('\n').map((line) => ` ${line}`).join('\n');
51
+ return `${rHeader}\n${rBody}`;
52
+ }).join('\n\n');
53
+ return header + body;
54
+ }
55
+
56
+ function errResponse(msg) {
57
+ return { content: [{ type: 'text', text: msg }], isError: true };
58
+ }
58
59
 
59
- server.setRequestHandler(ListToolsRequestSchema, async () => {
60
- return {
61
- tools: [
62
- {
63
- name: 'search',
64
- description:
65
- 'Search through a code repository using semantic search with Jina embeddings. Automatically indexes the repository before searching.',
66
- inputSchema: {
67
- type: 'object',
68
- properties: {
69
- repository_path: {
70
- type: 'string',
71
- description:
72
- 'Absolute or relative path to the repository to search in (defaults to current directory)',
73
- },
74
- query: {
75
- type: 'string',
76
- description:
77
- 'Natural language search query (e.g., "authentication middleware", "database connection")',
78
- },
79
- },
80
- required: ['query'],
60
+ function okResponse(text) {
61
+ return { content: [{ type: 'text', text }] };
62
+ }
63
+
64
+ const server = new Server({ name: 'code-search-mcp', version: '0.1.0' }, { capabilities: { tools: {} } });
65
+
66
+ server.setRequestHandler(ListToolsRequestSchema, async () => ({
67
+ tools: [
68
+ {
69
+ name: 'search',
70
+ description: 'Search through a code repository. Automatically indexes before searching.',
71
+ inputSchema: {
72
+ type: 'object',
73
+ properties: {
74
+ repository_path: { type: 'string', description: 'Path to repository (defaults to current directory)' },
75
+ query: { type: 'string', description: 'Natural language search query' },
81
76
  },
77
+ required: ['query'],
82
78
  },
83
- ],
84
- };
85
- });
79
+ },
80
+ ],
81
+ }));
86
82
 
87
83
  server.setRequestHandler(CallToolRequestSchema, async (request) => {
88
84
  const { name, arguments: args } = request.params;
89
-
90
- if (name !== 'search') {
91
- return {
92
- content: [
93
- {
94
- type: 'text',
95
- text: `Unknown tool: ${name}`,
96
- },
97
- ],
98
- isError: true,
99
- };
100
- }
101
-
102
85
  const query = args?.query;
103
- const repositoryPath = args?.repository_path || cwd();
104
-
105
- if (!query || typeof query !== 'string') {
106
- return {
107
- content: [
108
- {
109
- type: 'text',
110
- text: 'Error: query is required and must be a string',
111
- },
112
- ],
113
- isError: true,
114
- };
115
- }
116
86
 
117
- try {
118
- await ensureIgnoreEntry(repositoryPath);
119
- const result = await supervisor.sendRequest({
120
- type: 'search',
121
- query,
122
- repositoryPath,
123
- });
124
-
125
- if (result.error) {
126
- return {
127
- content: [
128
- {
129
- type: 'text',
130
- text: `Error: ${result.error}`,
131
- },
132
- ],
133
- isError: true,
134
- };
135
- }
87
+ if (name !== 'search') return errResponse(`Unknown tool: ${name}`);
88
+ if (!query || typeof query !== 'string') return errResponse('Error: query is required and must be a string');
136
89
 
137
- const repoPath = typeof repositoryPath === 'string' ? repositoryPath : null;
138
- const text =
139
- result.resultsCount === 0
140
- ? `No results found for query: "${query}"`
141
- : `Found ${result.resultsCount} result${result.resultsCount !== 1 ? 's' : ''} for query: "${query}"\n\n${result.results
142
- .map((r) => {
143
- const pathPart = r.relativePath || r.absolutePath;
144
- const lineCount = r.totalLines ? ` [${r.totalLines}L]` : '';
145
- const ctx = r.enclosingContext ? ` (in: ${r.enclosingContext})` : '';
146
- const header = `${r.rank}. ${pathPart}${lineCount}:${r.lines}${ctx} (score: ${r.score}%)`;
147
- const body = r.snippet.split('\n').map((line) => ` ${line}`).join('\n');
148
- return `${header}\n${body}`;
149
- })
150
- .join('\n\n')}`;
151
-
152
- return {
153
- content: [
154
- {
155
- type: 'text',
156
- text,
157
- },
158
- ],
159
- };
90
+ try {
91
+ const repositoryPath = args?.repository_path || cwd();
92
+ ensureIgnoreEntry(repositoryPath);
93
+ const result = await supervisor.sendRequest({ type: 'search', query, repositoryPath });
94
+ if (result.error) return errResponse(`Error: ${result.error}`);
95
+ return okResponse(formatResults(result, query));
160
96
  } catch (error) {
161
- return {
162
- content: [
163
- {
164
- type: 'text',
165
- text: `Error: ${error.message}`,
166
- },
167
- ],
168
- isError: true,
169
- };
97
+ return errResponse(`Error: ${error.message}`);
170
98
  }
171
99
  });
172
100
 
@@ -176,25 +104,13 @@ export async function startMcpServer() {
176
104
  }
177
105
 
178
106
  const isMain = process.argv[1] && (
179
- process.argv[1] === fileURLToPath(import.meta.url) ||
180
- process.argv[1].endsWith('mcp.js') ||
107
+ process.argv[1] === fileURLToPath(import.meta.url) ||
108
+ process.argv[1].endsWith('mcp.js') ||
181
109
  process.argv[1].endsWith('code-search-mcp')
182
110
  );
183
111
 
184
- if (isMain) {
185
- main().catch((error) => {
186
- console.error('Server error:', error);
187
- });
188
- }
189
-
190
- process.on('uncaughtException', (error) => {
191
- console.error('Uncaught exception:', error);
192
- });
112
+ process.on('uncaughtException', (error) => console.error('Uncaught exception:', error));
113
+ process.on('unhandledRejection', (reason) => console.error('Unhandled rejection:', reason));
193
114
 
194
- process.on('unhandledRejection', (reason) => {
195
- console.error('Unhandled rejection:', reason);
196
- });
197
-
198
- async function main() {
199
- await startMcpServer();
200
- }
115
+ async function main() { await startMcpServer(); }
116
+ if (isMain) main().catch((error) => console.error('Server error:', error));
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "codebasesearch",
3
- "version": "0.1.23",
3
+ "version": "0.1.25",
4
4
  "description": "Ultra-simple code search tool with Jina embeddings, LanceDB, and MCP protocol support",
5
5
  "type": "module",
6
6
  "bin": {
@@ -0,0 +1,248 @@
1
+ Performance profiling across codebases...
2
+ This will take several minutes.
3
+
4
+
5
+ ▶ Profiling xbot...
6
+ Scanned: 7 chunks in 8ms
7
+ Vector store initialized
8
+ Loading embeddings model (this may take a moment on first run)...
9
+ dtype not specified for "model". Using the default dtype (fp32) for this device (cpu).
10
+ Indexed 7 chunks
11
+ Embeddings: 7/7 chunks in 1530ms
12
+ Searching for: "function class method"
13
+ Text search: 7 results in 88ms
14
+
15
+ ▶ Profiling fsbrowse...
16
+ Scanned: 6 chunks in 7ms
17
+ Vector store initialized
18
+ Indexed 6 chunks
19
+ Embeddings: 6/6 chunks in 1126ms
20
+ Searching for: "function class method"
21
+ Text search: 6 results in 26ms
22
+
23
+ ▶ Profiling agentauth...
24
+ Scanned: 11 chunks in 2ms
25
+ Vector store initialized
26
+ Indexed 11 chunks
27
+ Embeddings: 11/11 chunks in 1957ms
28
+ Searching for: "function class method"
29
+ Text search: 10 results in 40ms
30
+
31
+ ▶ Profiling webtalk...
32
+ Scanned: 40 chunks in 28ms
33
+ Vector store initialized
34
+ Indexed 32 chunks
35
+ Indexed 8 chunks
36
+ Embeddings: 40/40 chunks in 6680ms
37
+ Searching for: "function class method"
38
+ Text search: 10 results in 80ms
39
+
40
+ ▶ Profiling plugforge...
41
+ Scanned: 66 chunks in 11ms
42
+ Vector store initialized
43
+ Indexed 32 chunks
44
+ Indexed 32 chunks
45
+ Indexed 2 chunks
46
+ Embeddings: 66/66 chunks in 10923ms
47
+ Searching for: "function class method"
48
+ Text search: 10 results in 83ms
49
+
50
+ ▶ Profiling agentgui...
51
+ Scanned: 83 chunks in 10ms
52
+ Vector store initialized
53
+ Indexed 32 chunks
54
+ Indexed 32 chunks
55
+ Indexed 19 chunks
56
+ Embeddings: 83/83 chunks in 16535ms
57
+ Searching for: "function class method"
58
+ Text search: 10 results in 146ms
59
+
60
+ ▶ Profiling seqos...
61
+ Scanned: 76 chunks in 9ms
62
+ Vector store initialized
63
+ Indexed 32 chunks
64
+ Indexed 32 chunks
65
+ Indexed 12 chunks
66
+ Embeddings: 76/76 chunks in 13730ms
67
+ Searching for: "function class method"
68
+ Text search: 10 results in 113ms
69
+
70
+ ▶ Profiling docmcp...
71
+ Scanned: 74 chunks in 53ms
72
+ Vector store initialized
73
+ Indexed 32 chunks
74
+ Indexed 32 chunks
75
+ Indexed 10 chunks
76
+ Embeddings: 74/74 chunks in 13526ms
77
+ Searching for: "function class method"
78
+ Text search: 10 results in 95ms
79
+
80
+ ▶ Profiling zellous...
81
+ Scanned: 77 chunks in 54ms
82
+ Vector store initialized
83
+ Indexed 32 chunks
84
+ Indexed 32 chunks
85
+ Indexed 13 chunks
86
+ Embeddings: 77/77 chunks in 13463ms
87
+ Searching for: "function class method"
88
+ Text search: 10 results in 120ms
89
+
90
+ ▶ Profiling teatree...
91
+ Scanned: 93 chunks in 10ms
92
+ Vector store initialized
93
+ Indexed 32 chunks
94
+ Indexed 32 chunks
95
+ Indexed 29 chunks
96
+ Embeddings: 93/93 chunks in 15931ms
97
+ Searching for: "function class method"
98
+ Text search: 10 results in 82ms
99
+
100
+ ▶ Profiling moonlanding...
101
+ Scanned: 323 chunks in 35ms
102
+ Vector store initialized
103
+ Indexed 32 chunks
104
+ Indexed 32 chunks
105
+ Indexed 32 chunks
106
+ Indexed 32 chunks
107
+ Indexed 32 chunks
108
+ Indexed 32 chunks
109
+ Indexed 32 chunks
110
+ Indexed 32 chunks
111
+ Indexed 32 chunks
112
+ Indexed 32 chunks
113
+ Indexed 3 chunks
114
+ Embeddings: 323/323 chunks in 69519ms
115
+ Searching for: "function class method"
116
+ Text search: 10 results in 672ms
117
+
118
+ ▶ Profiling pp...
119
+ Scanned: 1041 chunks in 461ms
120
+ Vector store initialized
121
+ Indexed 32 chunks
122
+ Indexed 32 chunks
123
+ Indexed 32 chunks
124
+ Indexed 32 chunks
125
+ Indexed 32 chunks
126
+ Indexed 32 chunks
127
+ Indexed 32 chunks
128
+ Indexed 32 chunks
129
+ Indexed 32 chunks
130
+ Indexed 32 chunks
131
+ Indexed 32 chunks
132
+ Indexed 32 chunks
133
+ Indexed 32 chunks
134
+ Indexed 32 chunks
135
+ Indexed 32 chunks
136
+ Indexed 32 chunks
137
+ Indexed 32 chunks
138
+ Indexed 32 chunks
139
+ Indexed 32 chunks
140
+ Indexed 32 chunks
141
+ Indexed 32 chunks
142
+ Indexed 32 chunks
143
+ Indexed 32 chunks
144
+ Indexed 32 chunks
145
+ Indexed 32 chunks
146
+ Indexed 32 chunks
147
+ Indexed 32 chunks
148
+ Indexed 32 chunks
149
+ Indexed 32 chunks
150
+ Indexed 32 chunks
151
+ Indexed 32 chunks
152
+ Indexed 32 chunks
153
+ Indexed 17 chunks
154
+ Embeddings: 1041/1041 chunks in 286137ms
155
+ Searching for: "function class method"
156
+ Text search: 10 results in 1312ms
157
+
158
+ ================================================================================
159
+ PERFORMANCE PROFILE
160
+ ================================================================================
161
+
162
+ scan:
163
+ Total: 687.90ms
164
+ Avg: 57.32ms
165
+ Min: 2.05ms
166
+ Max: 461.34ms
167
+ Count: 12
168
+ Per-chunk: 8.189ms/chunk
169
+
170
+ embeddings:
171
+ Total: 451057.24ms
172
+ Avg: 37588.10ms
173
+ Min: 1125.83ms
174
+ Max: 286136.65ms
175
+ Count: 12
176
+ Per-chunk: 5369.729ms/chunk
177
+
178
+ search-text:
179
+ Total: 2856.57ms
180
+ Avg: 238.05ms
181
+ Min: 26.30ms
182
+ Max: 1311.63ms
183
+ Count: 12
184
+ Per-chunk: 34.007ms/chunk
185
+
186
+ --------------------------------------------------------------------------------
187
+ CODEBASE TIMING
188
+ --------------------------------------------------------------------------------
189
+
190
+ xbot (7 chunks): 1625ms
191
+ scan: 8ms (0.5%)
192
+ embeddings: 1530ms (94.1%)
193
+ search-text: 88ms (5.4%)
194
+
195
+ fsbrowse (6 chunks): 1159ms
196
+ scan: 7ms (0.6%)
197
+ embeddings: 1126ms (97.1%)
198
+ search-text: 26ms (2.3%)
199
+
200
+ agentauth (11 chunks): 1999ms
201
+ scan: 2ms (0.1%)
202
+ embeddings: 1957ms (97.9%)
203
+ search-text: 40ms (2.0%)
204
+
205
+ webtalk (40 chunks): 6788ms
206
+ scan: 28ms (0.4%)
207
+ embeddings: 6680ms (98.4%)
208
+ search-text: 80ms (1.2%)
209
+
210
+ plugforge (66 chunks): 11018ms
211
+ scan: 11ms (0.1%)
212
+ embeddings: 10923ms (99.1%)
213
+ search-text: 83ms (0.8%)
214
+
215
+ agentgui (83 chunks): 16691ms
216
+ scan: 10ms (0.1%)
217
+ embeddings: 16535ms (99.1%)
218
+ search-text: 146ms (0.9%)
219
+
220
+ seqos (76 chunks): 13852ms
221
+ scan: 9ms (0.1%)
222
+ embeddings: 13730ms (99.1%)
223
+ search-text: 113ms (0.8%)
224
+
225
+ docmcp (74 chunks): 13674ms
226
+ scan: 53ms (0.4%)
227
+ embeddings: 13526ms (98.9%)
228
+ search-text: 95ms (0.7%)
229
+
230
+ zellous (77 chunks): 13637ms
231
+ scan: 54ms (0.4%)
232
+ embeddings: 13463ms (98.7%)
233
+ search-text: 120ms (0.9%)
234
+
235
+ teatree (93 chunks): 16023ms
236
+ scan: 10ms (0.1%)
237
+ embeddings: 15931ms (99.4%)
238
+ search-text: 82ms (0.5%)
239
+
240
+ moonlanding (323 chunks): 70226ms
241
+ scan: 35ms (0.0%)
242
+ embeddings: 69519ms (99.0%)
243
+ search-text: 672ms (1.0%)
244
+
245
+ pp (1041 chunks): 287910ms
246
+ scan: 461ms (0.2%)
247
+ embeddings: 286137ms (99.4%)
248
+ search-text: 1312ms (0.5%)
package/profile.js ADDED
@@ -0,0 +1,152 @@
1
+ #!/usr/bin/env node
2
+
3
+ import { existsSync } from 'fs';
4
+ import { loadIgnorePatterns } from './src/ignore-parser.js';
5
+ import { scanRepository } from './src/scanner.js';
6
+ import { buildTextIndex, searchText } from './src/text-search.js';
7
+
8
+ const CODEBASES = [
9
+ '~/workspace/agentauth',
10
+ '~/workspace/agentgui',
11
+ '~/workspace/docmcp',
12
+ '~/workspace/friday-staging',
13
+ '~/workspace/fsbrowse',
14
+ '~/workspace/gmweb',
15
+ '~/workspace/hookie',
16
+ '~/workspace/invoic',
17
+ '~/workspace/mcp-thorns',
18
+ '~/workspace/models',
19
+ '~/workspace/moonlanding',
20
+ '~/workspace/myworkreview-staging',
21
+ '~/workspace/opencode-source',
22
+ '~/workspace/plugforge',
23
+ '~/workspace/pp',
24
+ '~/workspace/proxypilot-setup',
25
+ '~/workspace/seqos',
26
+ '~/workspace/sttttsmodels',
27
+ '~/workspace/teatree',
28
+ '~/workspace/webtalk',
29
+ '~/workspace/webtalk-repo',
30
+ '~/workspace/xbot',
31
+ '~/workspace/zellous',
32
+ ];
33
+
34
+ // Realistic queries that represent actual usage
35
+ const TEST_QUERIES = [
36
+ 'authentication',
37
+ 'database connection',
38
+ 'error handling',
39
+ 'HTTP request',
40
+ 'user session',
41
+ ];
42
+
43
+ class Profiler {
44
+ constructor() {
45
+ this.marks = {};
46
+ this.measurements = [];
47
+ }
48
+
49
+ start(name) {
50
+ this.marks[name] = performance.now();
51
+ }
52
+
53
+ end(name, extra = null) {
54
+ if (!this.marks[name]) return 0;
55
+ const duration = performance.now() - this.marks[name];
56
+ this.measurements.push({ name, duration, extra });
57
+ delete this.marks[name];
58
+ return duration;
59
+ }
60
+
61
+ report(results) {
62
+ console.log('\n' + '='.repeat(80));
63
+ console.log('PERFORMANCE SUMMARY');
64
+ console.log('='.repeat(80));
65
+
66
+ const scanTimes = this.measurements.filter(m => m.name.startsWith('scan:'));
67
+ const indexTimes = this.measurements.filter(m => m.name.startsWith('index:'));
68
+ const searchTimes = this.measurements.filter(m => m.name.startsWith('search:'));
69
+
70
+ const avg = arr => arr.reduce((a, b) => a + b, 0) / arr.length;
71
+ const fmt = ms => ms.toFixed(1) + 'ms';
72
+
73
+ console.log(`\nScan: avg ${fmt(avg(scanTimes.map(m => m.duration)))} max ${fmt(Math.max(...scanTimes.map(m => m.duration)))}`);
74
+ console.log(`Index: avg ${fmt(avg(indexTimes.map(m => m.duration)))} max ${fmt(Math.max(...indexTimes.map(m => m.duration)))}`);
75
+ console.log(`Search: avg ${fmt(avg(searchTimes.map(m => m.duration)))} max ${fmt(Math.max(...searchTimes.map(m => m.duration)))}`);
76
+
77
+ console.log('\n' + '-'.repeat(80));
78
+ console.log('PER-CODEBASE RESULTS');
79
+ console.log('-'.repeat(80));
80
+
81
+ for (const r of results) {
82
+ if (r.skipped) {
83
+ console.log(`\n${r.label}: skipped (not found)`);
84
+ continue;
85
+ }
86
+ const scanT = scanTimes.find(m => m.name === `scan:${r.label}`)?.duration || 0;
87
+ const indexT = indexTimes.find(m => m.name === `index:${r.label}`)?.duration || 0;
88
+ console.log(`\n${r.label} (${r.chunks} chunks): scan ${fmt(scanT)} | index ${fmt(indexT)}`);
89
+ for (const q of r.queries) {
90
+ const top = q.results[0];
91
+ const topStr = top ? `${top.file_path}:${top.line_start} (${(top.score * 100).toFixed(0)}%)` : 'no results';
92
+ console.log(` "${q.query}" → ${q.count} results in ${fmt(q.time)} | top: ${topStr}`);
93
+ }
94
+ }
95
+ }
96
+ }
97
+
98
+ async function profileCodebase(codebasePath, profiler) {
99
+ const rootPath = codebasePath.replace('~', process.env.HOME);
100
+ const label = codebasePath.split('/').pop();
101
+
102
+ if (!existsSync(rootPath)) {
103
+ console.log(` ${label}: not found, skipping`);
104
+ return { label, skipped: true };
105
+ }
106
+
107
+ process.stdout.write(`▶ ${label}... `);
108
+
109
+ profiler.start(`scan:${label}`);
110
+ const ignorePatterns = loadIgnorePatterns(rootPath);
111
+ const chunks = scanRepository(rootPath, ignorePatterns);
112
+ profiler.end(`scan:${label}`, chunks.length);
113
+
114
+ if (chunks.length === 0) {
115
+ console.log('0 chunks');
116
+ return { label, chunks: 0, queries: [] };
117
+ }
118
+
119
+ profiler.start(`index:${label}`);
120
+ const indexData = buildTextIndex(chunks);
121
+ profiler.end(`index:${label}`, chunks.length);
122
+
123
+ const queryResults = [];
124
+ for (const query of TEST_QUERIES) {
125
+ const t0 = performance.now();
126
+ const results = searchText(query, chunks, indexData);
127
+ const elapsed = performance.now() - t0;
128
+ profiler.measurements.push({ name: `search:${label}:${query}`, duration: elapsed });
129
+ queryResults.push({ query, count: results.length, time: elapsed, results: results.slice(0, 1) });
130
+ }
131
+
132
+ console.log(`${chunks.length} chunks, ${queryResults.map(q => q.time.toFixed(0) + 'ms').join('/')}`);
133
+ return { label, chunks: chunks.length, queries: queryResults };
134
+ }
135
+
136
+ async function main() {
137
+ console.log('Profiling search across codebases...\n');
138
+
139
+ const profiler = new Profiler();
140
+ const results = [];
141
+
142
+ for (const codebase of CODEBASES) {
143
+ results.push(await profileCodebase(codebase, profiler));
144
+ }
145
+
146
+ profiler.report(results);
147
+ }
148
+
149
+ main().catch(err => {
150
+ console.error('Profile error:', err.message);
151
+ process.exit(1);
152
+ });
@@ -197,6 +197,8 @@ const IGNORED_DIRECTORIES = new Set([
197
197
  'node_modules', 'bower_components', 'jspm_packages', 'web_modules',
198
198
  // Version control
199
199
  '.git', '.svn', '.hg', '.bzr',
200
+ // Tool config (AI assistants, editors)
201
+ '.claude', '.cursor', '.aider',
200
202
  // IDE
201
203
  '.vscode', '.idea', '.vs', '.atom',
202
204
  // Build outputs (unambiguous names only)
@@ -1,6 +1,6 @@
1
1
  import { parentPort } from 'worker_threads';
2
2
  import { resolve, relative } from 'path';
3
- import { existsSync, readFileSync } from 'fs';
3
+ import { existsSync, readFileSync, statSync } from 'fs';
4
4
  import { loadIgnorePatterns } from './ignore-parser.js';
5
5
  import { scanRepository } from './scanner.js';
6
6
  import { buildTextIndex, searchText } from './text-search.js';
@@ -21,22 +21,34 @@ function findEnclosingContext(content, lineStart) {
21
21
  }
22
22
 
23
23
  function getFileTotalLines(absoluteFilePath) {
24
+ if (fileLineCountCache.has(absoluteFilePath)) {
25
+ return fileLineCountCache.get(absoluteFilePath);
26
+ }
24
27
  try {
25
28
  const content = readFileSync(absoluteFilePath, 'utf8');
26
- return content.split('\n').length;
29
+ const count = content.split('\n').length;
30
+ fileLineCountCache.set(absoluteFilePath, count);
31
+ return count;
27
32
  } catch {
28
33
  return null;
29
34
  }
30
35
  }
31
36
 
32
37
  let indexCache = new Map();
38
+ // Cache file line counts to avoid repeated disk reads on every search
39
+ const fileLineCountCache = new Map();
33
40
 
34
41
  async function initializeIndex(repositoryPath) {
35
42
  const absolutePath = resolve(repositoryPath);
36
- const cacheKey = absolutePath;
37
43
 
38
- if (indexCache.has(cacheKey)) {
39
- return indexCache.get(cacheKey);
44
+ const cached = indexCache.get(absolutePath);
45
+ if (cached) {
46
+ try {
47
+ const dirStat = statSync(absolutePath);
48
+ if (dirStat.mtimeMs <= cached.indexedAt) return cached;
49
+ } catch {
50
+ return cached;
51
+ }
40
52
  }
41
53
 
42
54
  try {
@@ -48,8 +60,8 @@ async function initializeIndex(repositoryPath) {
48
60
  }
49
61
 
50
62
  const indexData = buildTextIndex(chunks);
51
- const result = { chunks, indexData };
52
- indexCache.set(cacheKey, result);
63
+ const result = { chunks, indexData, indexedAt: Date.now() };
64
+ indexCache.set(absolutePath, result);
53
65
 
54
66
  return result;
55
67
  } catch (error) {
@@ -71,7 +83,17 @@ async function performSearch(repositoryPath, query) {
71
83
  return { error: indexData.error, results: [] };
72
84
  }
73
85
 
74
- const results = searchText(query, indexData.chunks, indexData.indexData);
86
+ const rawResults = searchText(query, indexData.chunks, indexData.indexData);
87
+
88
+ // Deduplicate: keep best-scoring chunk per file, then take top results
89
+ const bestPerFile = new Map();
90
+ for (const r of rawResults) {
91
+ const existing = bestPerFile.get(r.file_path);
92
+ if (!existing || r.score > existing.score) {
93
+ bestPerFile.set(r.file_path, r);
94
+ }
95
+ }
96
+ const results = Array.from(bestPerFile.values()).sort((a, b) => b.score - a.score);
75
97
 
76
98
  return {
77
99
  query,
@@ -1,35 +1,72 @@
1
1
  export function buildTextIndex(chunks) {
2
2
  const index = new Map();
3
- const chunkMetadata = [];
4
-
5
- chunks.forEach((chunk, idx) => {
6
- const tokens = tokenize(chunk.content);
7
- const fileNameTokens = tokenize(chunk.file_path);
8
- const symbols = extractSymbols(chunk.content);
9
- const frequency = new Map();
10
-
11
- tokens.forEach(token => {
12
- frequency.set(token, (frequency.get(token) || 0) + 1);
13
- if (!index.has(token)) {
14
- index.set(token, new Set());
15
- }
16
- index.get(token).add(idx);
17
- });
3
+ const chunkMetadata = new Array(chunks.length);
4
+
5
+ for (let idx = 0; idx < chunks.length; idx++) {
6
+ const chunk = chunks[idx];
7
+ const frequency = tokenizeToFrequency(chunk.content, index, idx);
8
+ const fileNameTokens = new Set(tokenize(chunk.file_path));
9
+ const symbols = new Set(extractSymbols(chunk.content));
18
10
 
19
11
  chunkMetadata[idx] = {
20
- tokens,
21
12
  fileNameTokens,
22
13
  symbols,
23
14
  frequency,
24
15
  isCode: isCodeFile(chunk.file_path),
16
+ contentLower: chunk.content.toLowerCase(),
25
17
  };
26
- });
18
+ }
19
+
20
+ // Precompute IDF for each token: log((N+1)/(df+1))
21
+ const N = chunks.length;
22
+ const idf = new Map();
23
+ for (const [token, docSet] of index) {
24
+ idf.set(token, Math.log((N + 1) / (docSet.size + 1)) + 1);
25
+ }
26
+
27
+ return { index, chunkMetadata, idf };
28
+ }
29
+
30
+ function tokenizeToFrequency(text, index, chunkIdx) {
31
+ const frequency = new Map();
32
+
33
+ for (const word of text.split(/\s+/)) {
34
+ if (word.length === 0) continue;
35
+
36
+ const hasUpperCase = word !== word.toLowerCase();
37
+ if (hasUpperCase) {
38
+ const camelTokens = word.match(/[A-Z]?[a-z]+|[A-Z]+(?=[A-Z][a-z]|\d|\W|$)|[0-9]+/g);
39
+ if (camelTokens) {
40
+ for (const t of camelTokens) {
41
+ if (t.length > 1) addToken(t.toLowerCase(), frequency, index, chunkIdx);
42
+ }
43
+ }
44
+ }
45
+
46
+ const cleaned = word.replace(/[^\w]/g, '').toLowerCase();
47
+ if (cleaned.length > 1) {
48
+ addToken(cleaned, frequency, index, chunkIdx);
49
+ if (word.includes('-') || word.includes('_') || word.includes('.')) {
50
+ for (const part of word.split(/[-_.]/)) {
51
+ const partCleaned = part.replace(/[^\w]/g, '').toLowerCase();
52
+ if (partCleaned.length > 1 && partCleaned !== cleaned) addToken(partCleaned, frequency, index, chunkIdx);
53
+ }
54
+ }
55
+ }
56
+ }
27
57
 
28
- return { index, chunkMetadata };
58
+ return frequency;
59
+ }
60
+
61
+ function addToken(token, frequency, index, chunkIdx) {
62
+ frequency.set(token, (frequency.get(token) || 0) + 1);
63
+ let docSet = index.get(token);
64
+ if (!docSet) { docSet = new Set(); index.set(token, docSet); }
65
+ docSet.add(chunkIdx);
29
66
  }
30
67
 
31
68
  export function searchText(query, chunks, indexData) {
32
- const { index, chunkMetadata } = indexData;
69
+ const { index, chunkMetadata, idf } = indexData;
33
70
  const queryTokens = tokenize(query);
34
71
  const querySymbols = extractSymbols(query);
35
72
  const chunkScores = new Map();
@@ -47,36 +84,53 @@ export function searchText(query, chunks, indexData) {
47
84
  }
48
85
  });
49
86
 
50
- for (const idx of candidates) {
87
+ const queryLower = query.toLowerCase();
88
+
89
+ let scoringCandidates = candidates;
90
+ if (candidates.size > 500) {
91
+ const ranked = Array.from(candidates).sort((a, b) => {
92
+ let aSum = 0, bSum = 0;
93
+ for (const token of queryTokens) {
94
+ if (index.has(token)) {
95
+ if (index.get(token).has(a)) aSum += idf.get(token) || 1;
96
+ if (index.get(token).has(b)) bSum += idf.get(token) || 1;
97
+ }
98
+ }
99
+ return bSum - aSum;
100
+ });
101
+ scoringCandidates = new Set(ranked.slice(0, 500));
102
+ }
103
+
104
+ for (const idx of scoringCandidates) {
51
105
  const chunk = chunks[idx];
52
106
  const meta = chunkMetadata[idx];
53
107
  let score = 0;
54
108
 
55
- // Exact phrase match - highest priority (saves embedding cost)
56
- if (chunk.content.toLowerCase().includes(query.toLowerCase())) {
109
+ if (queryTokens.length > 1 && meta.contentLower.includes(queryLower)) {
57
110
  score += 30;
58
111
  }
59
112
 
60
113
  // Symbol match in content - function/class named after query terms
61
114
  querySymbols.forEach(symbol => {
62
- if (meta.symbols.includes(symbol)) score += 10;
115
+ if (meta.symbols.has(symbol)) score += 10;
63
116
  });
64
117
 
65
118
  // Filename token match - strong signal that this file is about the query topic
66
119
  let fileNameMatches = 0;
67
120
  queryTokens.forEach(token => {
68
- if (meta.fileNameTokens.includes(token)) fileNameMatches++;
121
+ if (meta.fileNameTokens.has(token)) fileNameMatches++;
69
122
  });
70
123
  if (fileNameMatches > 0) {
71
124
  score += fileNameMatches * 10;
72
125
  }
73
126
 
74
- // Token frequency scoring
127
+ // TF-IDF scoring: reward rare tokens that appear in this chunk
75
128
  queryTokens.forEach(token => {
76
129
  if (index.has(token) && index.get(token).has(idx)) {
77
- const freq = meta.frequency.get(token) || 1;
130
+ const tf = Math.min(meta.frequency.get(token) || 1, 5);
131
+ const tokenIdf = idf ? (idf.get(token) || 1) : 1;
78
132
  const lengthBoost = token.length > 4 ? 1.5 : 1;
79
- score += lengthBoost * Math.min(freq, 5);
133
+ score += lengthBoost * tf * tokenIdf;
80
134
  }
81
135
  });
82
136