codebasesearch 0.1.23 → 0.1.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.prd +1 -78
- package/mcp.js +56 -140
- package/package.json +1 -1
- package/profile-results.txt +248 -0
- package/profile.js +152 -0
- package/src/ignore-parser.js +2 -0
- package/src/search-worker.js +30 -8
- package/src/text-search.js +81 -27
package/.prd
CHANGED
|
@@ -1,78 +1 @@
|
|
|
1
|
-
{
|
|
2
|
-
"project": "code-search",
|
|
3
|
-
"created": "2026-03-12",
|
|
4
|
-
"objective": "Profile and improve code-search speed and result quality",
|
|
5
|
-
"items": [
|
|
6
|
-
{
|
|
7
|
-
"id": "fix-dedup-buildtextindex",
|
|
8
|
-
"subject": "Remove duplicate buildTextIndex from search.js",
|
|
9
|
-
"status": "pending",
|
|
10
|
-
"description": "search.js has a private copy of buildTextIndex and tokenize/extractSymbols/isCodeFile that duplicates text-search.js. Import the exported buildTextIndex from text-search.js instead.",
|
|
11
|
-
"category": "refactor",
|
|
12
|
-
"effort": "small",
|
|
13
|
-
"blocking": ["fix-score-normalization", "fix-hybrid-weights"],
|
|
14
|
-
"blockedBy": []
|
|
15
|
-
},
|
|
16
|
-
{
|
|
17
|
-
"id": "fix-chunk-size",
|
|
18
|
-
"subject": "Reduce chunk size from 300 to 60 lines for better semantic granularity",
|
|
19
|
-
"status": "pending",
|
|
20
|
-
"description": "scanner.js uses 300-line chunks. Embeddings work best on 50-100 line chunks. Reduce to 60-line chunks with 15-line overlap for better vector search quality.",
|
|
21
|
-
"category": "feature",
|
|
22
|
-
"effort": "small",
|
|
23
|
-
"blocking": [],
|
|
24
|
-
"blockedBy": []
|
|
25
|
-
},
|
|
26
|
-
{
|
|
27
|
-
"id": "fix-score-normalization",
|
|
28
|
-
"subject": "Fix text search score normalization so top result is always 1.0",
|
|
29
|
-
"status": "pending",
|
|
30
|
-
"description": "Text scores divide raw by 100 but scores can exceed 100. Use dynamic max-score scaling. Lower hasGoodTextResults threshold from 0.5 to 0.3.",
|
|
31
|
-
"category": "bug",
|
|
32
|
-
"effort": "small",
|
|
33
|
-
"blocking": [],
|
|
34
|
-
"blockedBy": ["fix-dedup-buildtextindex"]
|
|
35
|
-
},
|
|
36
|
-
{
|
|
37
|
-
"id": "fix-hybrid-weights",
|
|
38
|
-
"subject": "Boost text-only exact-match results in hybrid merge",
|
|
39
|
-
"status": "pending",
|
|
40
|
-
"description": "Text-only results are capped at 20% weight. Give high-scoring text-only results a floor finalScore of 0.4.",
|
|
41
|
-
"category": "feature",
|
|
42
|
-
"effort": "small",
|
|
43
|
-
"blocking": [],
|
|
44
|
-
"blockedBy": ["fix-dedup-buildtextindex"]
|
|
45
|
-
},
|
|
46
|
-
{
|
|
47
|
-
"id": "fix-vector-cache-key",
|
|
48
|
-
"subject": "Strengthen vector search cache key to 20 dimensions",
|
|
49
|
-
"status": "pending",
|
|
50
|
-
"description": "Cache key uses only first 5 embedding dims. Use 20 dims for near-zero collision rate.",
|
|
51
|
-
"category": "bug",
|
|
52
|
-
"effort": "small",
|
|
53
|
-
"blocking": [],
|
|
54
|
-
"blockedBy": []
|
|
55
|
-
},
|
|
56
|
-
{
|
|
57
|
-
"id": "remove-dead-meanpooling",
|
|
58
|
-
"subject": "Remove dead meanPooling function from embeddings.js",
|
|
59
|
-
"status": "pending",
|
|
60
|
-
"description": "meanPooling is defined but never called. Remove dead code.",
|
|
61
|
-
"category": "refactor",
|
|
62
|
-
"effort": "small",
|
|
63
|
-
"blocking": [],
|
|
64
|
-
"blockedBy": []
|
|
65
|
-
},
|
|
66
|
-
{
|
|
67
|
-
"id": "verify-and-commit",
|
|
68
|
-
"subject": "Verify improvements and commit all changes",
|
|
69
|
-
"status": "pending",
|
|
70
|
-
"description": "Run end-to-end search logic test inline. Commit and push all changes.",
|
|
71
|
-
"category": "infra",
|
|
72
|
-
"effort": "small",
|
|
73
|
-
"blocking": [],
|
|
74
|
-
"blockedBy": ["fix-dedup-buildtextindex", "fix-chunk-size", "fix-score-normalization", "fix-hybrid-weights", "fix-vector-cache-key", "remove-dead-meanpooling"]
|
|
75
|
-
}
|
|
76
|
-
],
|
|
77
|
-
"completed": []
|
|
78
|
-
}
|
|
1
|
+
{}
|
package/mcp.js
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
|
-
// MUST patch sharp before any other imports
|
|
4
3
|
import fs from 'fs';
|
|
5
4
|
import path from 'path';
|
|
6
5
|
import { fileURLToPath } from 'url';
|
|
@@ -26,147 +25,76 @@ import { join } from 'path';
|
|
|
26
25
|
import { existsSync, readFileSync, appendFileSync, writeFileSync } from 'fs';
|
|
27
26
|
import { supervisor } from './src/supervisor.js';
|
|
28
27
|
|
|
29
|
-
|
|
28
|
+
/**
 * Make sure the repository's `.gitignore` lists the `.code-search/` directory.
 *
 * Creates `.gitignore` when it does not exist; otherwise appends the entry
 * unless some line already ignores that directory. This is best-effort: any
 * filesystem error (read-only checkout, missing directory, ...) is swallowed
 * so a search request never fails because of it.
 *
 * @param {string} rootPath - Directory whose `.gitignore` is checked.
 * @returns {void}
 */
function ensureIgnoreEntry(rootPath) {
  const gitignorePath = join(rootPath, '.gitignore');
  const entry = '.code-search/';
  try {
    if (!existsSync(gitignorePath)) {
      writeFileSync(gitignorePath, `${entry}\n`);
      return;
    }

    const content = readFileSync(gitignorePath, 'utf8');

    // Compare whole lines rather than substrings: a comment that merely
    // mentions ".code-search/" or a longer path must not count as ignoring it.
    const equivalents = new Set([entry, '.code-search', '/.code-search/', '/.code-search']);
    const alreadyIgnored = content
      .split(/\r?\n/)
      .some((line) => equivalents.has(line.trim()));
    if (alreadyIgnored) return;

    // Start on a fresh line only when the file does not already end with one,
    // and always terminate the entry so later appends stay on their own line.
    const separator = content.length === 0 || content.endsWith('\n') ? '' : '\n';
    appendFileSync(gitignorePath, `${separator}${entry}\n`);
  } catch {
    // Deliberately ignored: failing to update .gitignore must not break search.
  }
}
|
|
46
40
|
|
|
47
|
-
|
|
48
|
-
{
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
}
|
|
56
|
-
|
|
57
|
-
|
|
41
|
+
/**
 * Render a search response as the plain-text report sent back to the client.
 *
 * @param {object} result - Search response; reads `resultsCount` and `results`.
 *   Each result contributes `rank`, `relativePath`/`absolutePath`,
 *   `totalLines`, `lines`, `enclosingContext`, `score` and `snippet`.
 * @param {string} query - The query text echoed in the report header.
 * @returns {string} A "Found N result(s)" report, or a "No results" message.
 */
function formatResults(result, query) {
  // Tolerate a response without a results array instead of throwing.
  const results = Array.isArray(result.results) ? result.results : [];
  // Fall back to the array length when the worker did not send a count.
  const total = result.resultsCount ?? results.length;
  if (total === 0) return `No results found for: "${query}"`;

  const plural = total !== 1 ? 's' : '';
  const header = `Found ${total} result${plural} for: "${query}"\n\n`;

  const body = results.map((r) => {
    const pathPart = r.relativePath || r.absolutePath;
    const lineCount = r.totalLines ? ` [${r.totalLines}L]` : '';
    const ctx = r.enclosingContext ? ` (in: ${r.enclosingContext})` : '';
    const rHeader = `${r.rank}. ${pathPart}${lineCount}:${r.lines}${ctx} (score: ${r.score}%)`;

    // One result without a snippet must not break the whole report.
    if (typeof r.snippet !== 'string') return rHeader;

    const rBody = r.snippet.split('\n').map((line) => ` ${line}`).join('\n');
    return `${rHeader}\n${rBody}`;
  }).join('\n\n');

  return header + body;
}
|
|
55
|
+
|
|
56
|
+
function errResponse(msg) {
|
|
57
|
+
return { content: [{ type: 'text', text: msg }], isError: true };
|
|
58
|
+
}
|
|
58
59
|
|
|
59
|
-
|
|
60
|
-
return {
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
type: 'string',
|
|
76
|
-
description:
|
|
77
|
-
'Natural language search query (e.g., "authentication middleware", "database connection")',
|
|
78
|
-
},
|
|
79
|
-
},
|
|
80
|
-
required: ['query'],
|
|
60
|
+
function okResponse(text) {
|
|
61
|
+
return { content: [{ type: 'text', text }] };
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
const server = new Server({ name: 'code-search-mcp', version: '0.1.0' }, { capabilities: { tools: {} } });
|
|
65
|
+
|
|
66
|
+
server.setRequestHandler(ListToolsRequestSchema, async () => ({
|
|
67
|
+
tools: [
|
|
68
|
+
{
|
|
69
|
+
name: 'search',
|
|
70
|
+
description: 'Search through a code repository. Automatically indexes before searching.',
|
|
71
|
+
inputSchema: {
|
|
72
|
+
type: 'object',
|
|
73
|
+
properties: {
|
|
74
|
+
repository_path: { type: 'string', description: 'Path to repository (defaults to current directory)' },
|
|
75
|
+
query: { type: 'string', description: 'Natural language search query' },
|
|
81
76
|
},
|
|
77
|
+
required: ['query'],
|
|
82
78
|
},
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
});
|
|
79
|
+
},
|
|
80
|
+
],
|
|
81
|
+
}));
|
|
86
82
|
|
|
87
83
|
server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
88
84
|
const { name, arguments: args } = request.params;
|
|
89
|
-
|
|
90
|
-
if (name !== 'search') {
|
|
91
|
-
return {
|
|
92
|
-
content: [
|
|
93
|
-
{
|
|
94
|
-
type: 'text',
|
|
95
|
-
text: `Unknown tool: ${name}`,
|
|
96
|
-
},
|
|
97
|
-
],
|
|
98
|
-
isError: true,
|
|
99
|
-
};
|
|
100
|
-
}
|
|
101
|
-
|
|
102
85
|
const query = args?.query;
|
|
103
|
-
const repositoryPath = args?.repository_path || cwd();
|
|
104
|
-
|
|
105
|
-
if (!query || typeof query !== 'string') {
|
|
106
|
-
return {
|
|
107
|
-
content: [
|
|
108
|
-
{
|
|
109
|
-
type: 'text',
|
|
110
|
-
text: 'Error: query is required and must be a string',
|
|
111
|
-
},
|
|
112
|
-
],
|
|
113
|
-
isError: true,
|
|
114
|
-
};
|
|
115
|
-
}
|
|
116
86
|
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
const result = await supervisor.sendRequest({
|
|
120
|
-
type: 'search',
|
|
121
|
-
query,
|
|
122
|
-
repositoryPath,
|
|
123
|
-
});
|
|
124
|
-
|
|
125
|
-
if (result.error) {
|
|
126
|
-
return {
|
|
127
|
-
content: [
|
|
128
|
-
{
|
|
129
|
-
type: 'text',
|
|
130
|
-
text: `Error: ${result.error}`,
|
|
131
|
-
},
|
|
132
|
-
],
|
|
133
|
-
isError: true,
|
|
134
|
-
};
|
|
135
|
-
}
|
|
87
|
+
if (name !== 'search') return errResponse(`Unknown tool: ${name}`);
|
|
88
|
+
if (!query || typeof query !== 'string') return errResponse('Error: query is required and must be a string');
|
|
136
89
|
|
|
137
|
-
|
|
138
|
-
const
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
const pathPart = r.relativePath || r.absolutePath;
|
|
144
|
-
const lineCount = r.totalLines ? ` [${r.totalLines}L]` : '';
|
|
145
|
-
const ctx = r.enclosingContext ? ` (in: ${r.enclosingContext})` : '';
|
|
146
|
-
const header = `${r.rank}. ${pathPart}${lineCount}:${r.lines}${ctx} (score: ${r.score}%)`;
|
|
147
|
-
const body = r.snippet.split('\n').map((line) => ` ${line}`).join('\n');
|
|
148
|
-
return `${header}\n${body}`;
|
|
149
|
-
})
|
|
150
|
-
.join('\n\n')}`;
|
|
151
|
-
|
|
152
|
-
return {
|
|
153
|
-
content: [
|
|
154
|
-
{
|
|
155
|
-
type: 'text',
|
|
156
|
-
text,
|
|
157
|
-
},
|
|
158
|
-
],
|
|
159
|
-
};
|
|
90
|
+
try {
|
|
91
|
+
const repositoryPath = args?.repository_path || cwd();
|
|
92
|
+
ensureIgnoreEntry(repositoryPath);
|
|
93
|
+
const result = await supervisor.sendRequest({ type: 'search', query, repositoryPath });
|
|
94
|
+
if (result.error) return errResponse(`Error: ${result.error}`);
|
|
95
|
+
return okResponse(formatResults(result, query));
|
|
160
96
|
} catch (error) {
|
|
161
|
-
return {
|
|
162
|
-
content: [
|
|
163
|
-
{
|
|
164
|
-
type: 'text',
|
|
165
|
-
text: `Error: ${error.message}`,
|
|
166
|
-
},
|
|
167
|
-
],
|
|
168
|
-
isError: true,
|
|
169
|
-
};
|
|
97
|
+
return errResponse(`Error: ${error.message}`);
|
|
170
98
|
}
|
|
171
99
|
});
|
|
172
100
|
|
|
@@ -176,25 +104,13 @@ export async function startMcpServer() {
|
|
|
176
104
|
}
|
|
177
105
|
|
|
178
106
|
const isMain = process.argv[1] && (
|
|
179
|
-
process.argv[1] === fileURLToPath(import.meta.url) ||
|
|
180
|
-
process.argv[1].endsWith('mcp.js') ||
|
|
107
|
+
process.argv[1] === fileURLToPath(import.meta.url) ||
|
|
108
|
+
process.argv[1].endsWith('mcp.js') ||
|
|
181
109
|
process.argv[1].endsWith('code-search-mcp')
|
|
182
110
|
);
|
|
183
111
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
console.error('Server error:', error);
|
|
187
|
-
});
|
|
188
|
-
}
|
|
189
|
-
|
|
190
|
-
process.on('uncaughtException', (error) => {
|
|
191
|
-
console.error('Uncaught exception:', error);
|
|
192
|
-
});
|
|
112
|
+
process.on('uncaughtException', (error) => console.error('Uncaught exception:', error));
|
|
113
|
+
process.on('unhandledRejection', (reason) => console.error('Unhandled rejection:', reason));
|
|
193
114
|
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
});
|
|
197
|
-
|
|
198
|
-
async function main() {
|
|
199
|
-
await startMcpServer();
|
|
200
|
-
}
|
|
115
|
+
async function main() { await startMcpServer(); }
|
|
116
|
+
if (isMain) main().catch((error) => console.error('Server error:', error));
|
package/package.json
CHANGED
|
package/profile-results.txt
ADDED
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
Performance profiling across codebases...
|
|
2
|
+
This will take several minutes.
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
▶ Profiling xbot...
|
|
6
|
+
Scanned: 7 chunks in 8ms
|
|
7
|
+
Vector store initialized
|
|
8
|
+
Loading embeddings model (this may take a moment on first run)...
|
|
9
|
+
dtype not specified for "model". Using the default dtype (fp32) for this device (cpu).
|
|
10
|
+
Indexed 7 chunks
|
|
11
|
+
Embeddings: 7/7 chunks in 1530ms
|
|
12
|
+
Searching for: "function class method"
|
|
13
|
+
Text search: 7 results in 88ms
|
|
14
|
+
|
|
15
|
+
▶ Profiling fsbrowse...
|
|
16
|
+
Scanned: 6 chunks in 7ms
|
|
17
|
+
Vector store initialized
|
|
18
|
+
Indexed 6 chunks
|
|
19
|
+
Embeddings: 6/6 chunks in 1126ms
|
|
20
|
+
Searching for: "function class method"
|
|
21
|
+
Text search: 6 results in 26ms
|
|
22
|
+
|
|
23
|
+
▶ Profiling agentauth...
|
|
24
|
+
Scanned: 11 chunks in 2ms
|
|
25
|
+
Vector store initialized
|
|
26
|
+
Indexed 11 chunks
|
|
27
|
+
Embeddings: 11/11 chunks in 1957ms
|
|
28
|
+
Searching for: "function class method"
|
|
29
|
+
Text search: 10 results in 40ms
|
|
30
|
+
|
|
31
|
+
▶ Profiling webtalk...
|
|
32
|
+
Scanned: 40 chunks in 28ms
|
|
33
|
+
Vector store initialized
|
|
34
|
+
Indexed 32 chunks
|
|
35
|
+
Indexed 8 chunks
|
|
36
|
+
Embeddings: 40/40 chunks in 6680ms
|
|
37
|
+
Searching for: "function class method"
|
|
38
|
+
Text search: 10 results in 80ms
|
|
39
|
+
|
|
40
|
+
▶ Profiling plugforge...
|
|
41
|
+
Scanned: 66 chunks in 11ms
|
|
42
|
+
Vector store initialized
|
|
43
|
+
Indexed 32 chunks
|
|
44
|
+
Indexed 32 chunks
|
|
45
|
+
Indexed 2 chunks
|
|
46
|
+
Embeddings: 66/66 chunks in 10923ms
|
|
47
|
+
Searching for: "function class method"
|
|
48
|
+
Text search: 10 results in 83ms
|
|
49
|
+
|
|
50
|
+
▶ Profiling agentgui...
|
|
51
|
+
Scanned: 83 chunks in 10ms
|
|
52
|
+
Vector store initialized
|
|
53
|
+
Indexed 32 chunks
|
|
54
|
+
Indexed 32 chunks
|
|
55
|
+
Indexed 19 chunks
|
|
56
|
+
Embeddings: 83/83 chunks in 16535ms
|
|
57
|
+
Searching for: "function class method"
|
|
58
|
+
Text search: 10 results in 146ms
|
|
59
|
+
|
|
60
|
+
▶ Profiling seqos...
|
|
61
|
+
Scanned: 76 chunks in 9ms
|
|
62
|
+
Vector store initialized
|
|
63
|
+
Indexed 32 chunks
|
|
64
|
+
Indexed 32 chunks
|
|
65
|
+
Indexed 12 chunks
|
|
66
|
+
Embeddings: 76/76 chunks in 13730ms
|
|
67
|
+
Searching for: "function class method"
|
|
68
|
+
Text search: 10 results in 113ms
|
|
69
|
+
|
|
70
|
+
▶ Profiling docmcp...
|
|
71
|
+
Scanned: 74 chunks in 53ms
|
|
72
|
+
Vector store initialized
|
|
73
|
+
Indexed 32 chunks
|
|
74
|
+
Indexed 32 chunks
|
|
75
|
+
Indexed 10 chunks
|
|
76
|
+
Embeddings: 74/74 chunks in 13526ms
|
|
77
|
+
Searching for: "function class method"
|
|
78
|
+
Text search: 10 results in 95ms
|
|
79
|
+
|
|
80
|
+
▶ Profiling zellous...
|
|
81
|
+
Scanned: 77 chunks in 54ms
|
|
82
|
+
Vector store initialized
|
|
83
|
+
Indexed 32 chunks
|
|
84
|
+
Indexed 32 chunks
|
|
85
|
+
Indexed 13 chunks
|
|
86
|
+
Embeddings: 77/77 chunks in 13463ms
|
|
87
|
+
Searching for: "function class method"
|
|
88
|
+
Text search: 10 results in 120ms
|
|
89
|
+
|
|
90
|
+
▶ Profiling teatree...
|
|
91
|
+
Scanned: 93 chunks in 10ms
|
|
92
|
+
Vector store initialized
|
|
93
|
+
Indexed 32 chunks
|
|
94
|
+
Indexed 32 chunks
|
|
95
|
+
Indexed 29 chunks
|
|
96
|
+
Embeddings: 93/93 chunks in 15931ms
|
|
97
|
+
Searching for: "function class method"
|
|
98
|
+
Text search: 10 results in 82ms
|
|
99
|
+
|
|
100
|
+
▶ Profiling moonlanding...
|
|
101
|
+
Scanned: 323 chunks in 35ms
|
|
102
|
+
Vector store initialized
|
|
103
|
+
Indexed 32 chunks
|
|
104
|
+
Indexed 32 chunks
|
|
105
|
+
Indexed 32 chunks
|
|
106
|
+
Indexed 32 chunks
|
|
107
|
+
Indexed 32 chunks
|
|
108
|
+
Indexed 32 chunks
|
|
109
|
+
Indexed 32 chunks
|
|
110
|
+
Indexed 32 chunks
|
|
111
|
+
Indexed 32 chunks
|
|
112
|
+
Indexed 32 chunks
|
|
113
|
+
Indexed 3 chunks
|
|
114
|
+
Embeddings: 323/323 chunks in 69519ms
|
|
115
|
+
Searching for: "function class method"
|
|
116
|
+
Text search: 10 results in 672ms
|
|
117
|
+
|
|
118
|
+
▶ Profiling pp...
|
|
119
|
+
Scanned: 1041 chunks in 461ms
|
|
120
|
+
Vector store initialized
|
|
121
|
+
Indexed 32 chunks
|
|
122
|
+
Indexed 32 chunks
|
|
123
|
+
Indexed 32 chunks
|
|
124
|
+
Indexed 32 chunks
|
|
125
|
+
Indexed 32 chunks
|
|
126
|
+
Indexed 32 chunks
|
|
127
|
+
Indexed 32 chunks
|
|
128
|
+
Indexed 32 chunks
|
|
129
|
+
Indexed 32 chunks
|
|
130
|
+
Indexed 32 chunks
|
|
131
|
+
Indexed 32 chunks
|
|
132
|
+
Indexed 32 chunks
|
|
133
|
+
Indexed 32 chunks
|
|
134
|
+
Indexed 32 chunks
|
|
135
|
+
Indexed 32 chunks
|
|
136
|
+
Indexed 32 chunks
|
|
137
|
+
Indexed 32 chunks
|
|
138
|
+
Indexed 32 chunks
|
|
139
|
+
Indexed 32 chunks
|
|
140
|
+
Indexed 32 chunks
|
|
141
|
+
Indexed 32 chunks
|
|
142
|
+
Indexed 32 chunks
|
|
143
|
+
Indexed 32 chunks
|
|
144
|
+
Indexed 32 chunks
|
|
145
|
+
Indexed 32 chunks
|
|
146
|
+
Indexed 32 chunks
|
|
147
|
+
Indexed 32 chunks
|
|
148
|
+
Indexed 32 chunks
|
|
149
|
+
Indexed 32 chunks
|
|
150
|
+
Indexed 32 chunks
|
|
151
|
+
Indexed 32 chunks
|
|
152
|
+
Indexed 32 chunks
|
|
153
|
+
Indexed 17 chunks
|
|
154
|
+
Embeddings: 1041/1041 chunks in 286137ms
|
|
155
|
+
Searching for: "function class method"
|
|
156
|
+
Text search: 10 results in 1312ms
|
|
157
|
+
|
|
158
|
+
================================================================================
|
|
159
|
+
PERFORMANCE PROFILE
|
|
160
|
+
================================================================================
|
|
161
|
+
|
|
162
|
+
scan:
|
|
163
|
+
Total: 687.90ms
|
|
164
|
+
Avg: 57.32ms
|
|
165
|
+
Min: 2.05ms
|
|
166
|
+
Max: 461.34ms
|
|
167
|
+
Count: 12
|
|
168
|
+
Per-chunk: 8.189ms/chunk
|
|
169
|
+
|
|
170
|
+
embeddings:
|
|
171
|
+
Total: 451057.24ms
|
|
172
|
+
Avg: 37588.10ms
|
|
173
|
+
Min: 1125.83ms
|
|
174
|
+
Max: 286136.65ms
|
|
175
|
+
Count: 12
|
|
176
|
+
Per-chunk: 5369.729ms/chunk
|
|
177
|
+
|
|
178
|
+
search-text:
|
|
179
|
+
Total: 2856.57ms
|
|
180
|
+
Avg: 238.05ms
|
|
181
|
+
Min: 26.30ms
|
|
182
|
+
Max: 1311.63ms
|
|
183
|
+
Count: 12
|
|
184
|
+
Per-chunk: 34.007ms/chunk
|
|
185
|
+
|
|
186
|
+
--------------------------------------------------------------------------------
|
|
187
|
+
CODEBASE TIMING
|
|
188
|
+
--------------------------------------------------------------------------------
|
|
189
|
+
|
|
190
|
+
xbot (7 chunks): 1625ms
|
|
191
|
+
scan: 8ms (0.5%)
|
|
192
|
+
embeddings: 1530ms (94.1%)
|
|
193
|
+
search-text: 88ms (5.4%)
|
|
194
|
+
|
|
195
|
+
fsbrowse (6 chunks): 1159ms
|
|
196
|
+
scan: 7ms (0.6%)
|
|
197
|
+
embeddings: 1126ms (97.1%)
|
|
198
|
+
search-text: 26ms (2.3%)
|
|
199
|
+
|
|
200
|
+
agentauth (11 chunks): 1999ms
|
|
201
|
+
scan: 2ms (0.1%)
|
|
202
|
+
embeddings: 1957ms (97.9%)
|
|
203
|
+
search-text: 40ms (2.0%)
|
|
204
|
+
|
|
205
|
+
webtalk (40 chunks): 6788ms
|
|
206
|
+
scan: 28ms (0.4%)
|
|
207
|
+
embeddings: 6680ms (98.4%)
|
|
208
|
+
search-text: 80ms (1.2%)
|
|
209
|
+
|
|
210
|
+
plugforge (66 chunks): 11018ms
|
|
211
|
+
scan: 11ms (0.1%)
|
|
212
|
+
embeddings: 10923ms (99.1%)
|
|
213
|
+
search-text: 83ms (0.8%)
|
|
214
|
+
|
|
215
|
+
agentgui (83 chunks): 16691ms
|
|
216
|
+
scan: 10ms (0.1%)
|
|
217
|
+
embeddings: 16535ms (99.1%)
|
|
218
|
+
search-text: 146ms (0.9%)
|
|
219
|
+
|
|
220
|
+
seqos (76 chunks): 13852ms
|
|
221
|
+
scan: 9ms (0.1%)
|
|
222
|
+
embeddings: 13730ms (99.1%)
|
|
223
|
+
search-text: 113ms (0.8%)
|
|
224
|
+
|
|
225
|
+
docmcp (74 chunks): 13674ms
|
|
226
|
+
scan: 53ms (0.4%)
|
|
227
|
+
embeddings: 13526ms (98.9%)
|
|
228
|
+
search-text: 95ms (0.7%)
|
|
229
|
+
|
|
230
|
+
zellous (77 chunks): 13637ms
|
|
231
|
+
scan: 54ms (0.4%)
|
|
232
|
+
embeddings: 13463ms (98.7%)
|
|
233
|
+
search-text: 120ms (0.9%)
|
|
234
|
+
|
|
235
|
+
teatree (93 chunks): 16023ms
|
|
236
|
+
scan: 10ms (0.1%)
|
|
237
|
+
embeddings: 15931ms (99.4%)
|
|
238
|
+
search-text: 82ms (0.5%)
|
|
239
|
+
|
|
240
|
+
moonlanding (323 chunks): 70226ms
|
|
241
|
+
scan: 35ms (0.0%)
|
|
242
|
+
embeddings: 69519ms (99.0%)
|
|
243
|
+
search-text: 672ms (1.0%)
|
|
244
|
+
|
|
245
|
+
pp (1041 chunks): 287910ms
|
|
246
|
+
scan: 461ms (0.2%)
|
|
247
|
+
embeddings: 286137ms (99.4%)
|
|
248
|
+
search-text: 1312ms (0.5%)
|
package/profile.js
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { existsSync } from 'fs';
|
|
4
|
+
import { loadIgnorePatterns } from './src/ignore-parser.js';
|
|
5
|
+
import { scanRepository } from './src/scanner.js';
|
|
6
|
+
import { buildTextIndex, searchText } from './src/text-search.js';
|
|
7
|
+
|
|
8
|
+
const CODEBASES = [
|
|
9
|
+
'~/workspace/agentauth',
|
|
10
|
+
'~/workspace/agentgui',
|
|
11
|
+
'~/workspace/docmcp',
|
|
12
|
+
'~/workspace/friday-staging',
|
|
13
|
+
'~/workspace/fsbrowse',
|
|
14
|
+
'~/workspace/gmweb',
|
|
15
|
+
'~/workspace/hookie',
|
|
16
|
+
'~/workspace/invoic',
|
|
17
|
+
'~/workspace/mcp-thorns',
|
|
18
|
+
'~/workspace/models',
|
|
19
|
+
'~/workspace/moonlanding',
|
|
20
|
+
'~/workspace/myworkreview-staging',
|
|
21
|
+
'~/workspace/opencode-source',
|
|
22
|
+
'~/workspace/plugforge',
|
|
23
|
+
'~/workspace/pp',
|
|
24
|
+
'~/workspace/proxypilot-setup',
|
|
25
|
+
'~/workspace/seqos',
|
|
26
|
+
'~/workspace/sttttsmodels',
|
|
27
|
+
'~/workspace/teatree',
|
|
28
|
+
'~/workspace/webtalk',
|
|
29
|
+
'~/workspace/webtalk-repo',
|
|
30
|
+
'~/workspace/xbot',
|
|
31
|
+
'~/workspace/zellous',
|
|
32
|
+
];
|
|
33
|
+
|
|
34
|
+
// Realistic queries that represent actual usage
|
|
35
|
+
const TEST_QUERIES = [
|
|
36
|
+
'authentication',
|
|
37
|
+
'database connection',
|
|
38
|
+
'error handling',
|
|
39
|
+
'HTTP request',
|
|
40
|
+
'user session',
|
|
41
|
+
];
|
|
42
|
+
|
|
43
|
+
/**
 * Minimal wall-clock profiler: named start/end marks plus a console report.
 *
 * Measurement names are expected to be prefixed with `scan:`, `index:` or
 * `search:`; `report` groups them by that prefix.
 */
class Profiler {
  constructor() {
    // name -> performance.now() timestamp recorded by start()
    this.marks = {};
    // Completed measurements as { name, duration, extra }
    this.measurements = [];
  }

  /**
   * Record the start time of a named measurement.
   * @param {string} name - Measurement name.
   */
  start(name) {
    this.marks[name] = performance.now();
  }

  /**
   * Finish a named measurement and store it.
   * @param {string} name - Name previously passed to start().
   * @param {*} [extra=null] - Arbitrary payload stored with the measurement.
   * @returns {number} Elapsed milliseconds, or 0 when start() was never called.
   */
  end(name, extra = null) {
    const startedAt = this.marks[name];
    // Check for absence explicitly rather than relying on truthiness.
    if (startedAt === undefined) return 0;
    const duration = performance.now() - startedAt;
    this.measurements.push({ name, duration, extra });
    delete this.marks[name];
    return duration;
  }

  /**
   * Print a summary of all measurements followed by per-codebase details.
   * @param {Array<object>} results - Entries shaped either as
   *   `{ label, skipped: true }` or `{ label, chunks, queries }`.
   */
  report(results) {
    console.log('\n' + '='.repeat(80));
    console.log('PERFORMANCE SUMMARY');
    console.log('='.repeat(80));

    const byPrefix = (prefix) => this.measurements.filter((m) => m.name.startsWith(prefix));
    const scanTimes = byPrefix('scan:');
    const indexTimes = byPrefix('index:');
    const searchTimes = byPrefix('search:');

    const fmt = (ms) => ms.toFixed(1) + 'ms';

    // An empty category (e.g. every codebase was skipped) would otherwise
    // print "NaNms" for the average and "-Infinityms" for the maximum.
    const summarize = (label, entries) => {
      if (entries.length === 0) return `${label}: no measurements`;
      const durations = entries.map((m) => m.duration);
      const total = durations.reduce((a, b) => a + b, 0);
      return `${label}: avg ${fmt(total / durations.length)} max ${fmt(Math.max(...durations))}`;
    };

    console.log(`\n${summarize('Scan', scanTimes)}`);
    console.log(summarize('Index', indexTimes));
    console.log(summarize('Search', searchTimes));

    console.log('\n' + '-'.repeat(80));
    console.log('PER-CODEBASE RESULTS');
    console.log('-'.repeat(80));

    for (const r of results) {
      if (r.skipped) {
        console.log(`\n${r.label}: skipped (not found)`);
        continue;
      }
      const scanT = scanTimes.find((m) => m.name === `scan:${r.label}`)?.duration || 0;
      const indexT = indexTimes.find((m) => m.name === `index:${r.label}`)?.duration || 0;
      console.log(`\n${r.label} (${r.chunks} chunks): scan ${fmt(scanT)} | index ${fmt(indexT)}`);
      for (const q of r.queries ?? []) {
        const top = q.results[0];
        const topStr = top ? `${top.file_path}:${top.line_start} (${(top.score * 100).toFixed(0)}%)` : 'no results';
        console.log(` "${q.query}" → ${q.count} results in ${fmt(q.time)} | top: ${topStr}`);
      }
    }
  }
}
|
|
97
|
+
|
|
98
|
+
/**
 * Scan, index and query a single codebase, recording timings on `profiler`.
 *
 * @param {string} codebasePath - Path to the codebase; a leading `~` is
 *   expanded to the current user's home directory.
 * @param {Profiler} profiler - Receives `scan:<label>`, `index:<label>` and
 *   `search:<label>:<query>` measurements.
 * @returns {Promise<object>} `{ label, skipped: true }` when the directory is
 *   missing, otherwise `{ label, chunks, queries }`.
 */
async function profileCodebase(codebasePath, profiler) {
  // Expand "~" only when it is the leading path segment, and only when a home
  // directory is known: a "~" elsewhere in the path is left alone, and an
  // unset HOME no longer produces a literal "undefined/..." path.
  const home = process.env.HOME ?? process.env.USERPROFILE;
  const isHomeRelative = codebasePath === '~' || codebasePath.startsWith('~/');
  const rootPath = isHomeRelative && home ? `${home}${codebasePath.slice(1)}` : codebasePath;
  // Drop trailing slashes so "~/workspace/app/" is still labelled "app".
  const label = codebasePath.replace(/\/+$/, '').split('/').pop();

  if (!existsSync(rootPath)) {
    console.log(` ${label}: not found, skipping`);
    return { label, skipped: true };
  }

  process.stdout.write(`▶ ${label}... `);

  profiler.start(`scan:${label}`);
  const ignorePatterns = loadIgnorePatterns(rootPath);
  const chunks = scanRepository(rootPath, ignorePatterns);
  profiler.end(`scan:${label}`, chunks.length);

  if (chunks.length === 0) {
    console.log('0 chunks');
    return { label, chunks: 0, queries: [] };
  }

  profiler.start(`index:${label}`);
  const indexData = buildTextIndex(chunks);
  profiler.end(`index:${label}`, chunks.length);

  const queryResults = [];
  for (const query of TEST_QUERIES) {
    // Timed inline because several queries share the same codebase label.
    const t0 = performance.now();
    const results = searchText(query, chunks, indexData);
    const elapsed = performance.now() - t0;
    profiler.measurements.push({ name: `search:${label}:${query}`, duration: elapsed });
    // Keep only the top hit; the report prints nothing else.
    queryResults.push({ query, count: results.length, time: elapsed, results: results.slice(0, 1) });
  }

  console.log(`${chunks.length} chunks, ${queryResults.map((q) => q.time.toFixed(0) + 'ms').join('/')}`);
  return { label, chunks: chunks.length, queries: queryResults };
}
|
|
135
|
+
|
|
136
|
+
async function main() {
|
|
137
|
+
console.log('Profiling search across codebases...\n');
|
|
138
|
+
|
|
139
|
+
const profiler = new Profiler();
|
|
140
|
+
const results = [];
|
|
141
|
+
|
|
142
|
+
for (const codebase of CODEBASES) {
|
|
143
|
+
results.push(await profileCodebase(codebase, profiler));
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
profiler.report(results);
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
main().catch(err => {
|
|
150
|
+
console.error('Profile error:', err.message);
|
|
151
|
+
process.exit(1);
|
|
152
|
+
});
|
package/src/ignore-parser.js
CHANGED
|
@@ -197,6 +197,8 @@ const IGNORED_DIRECTORIES = new Set([
|
|
|
197
197
|
'node_modules', 'bower_components', 'jspm_packages', 'web_modules',
|
|
198
198
|
// Version control
|
|
199
199
|
'.git', '.svn', '.hg', '.bzr',
|
|
200
|
+
// Tool config (AI assistants, editors)
|
|
201
|
+
'.claude', '.cursor', '.aider',
|
|
200
202
|
// IDE
|
|
201
203
|
'.vscode', '.idea', '.vs', '.atom',
|
|
202
204
|
// Build outputs (unambiguous names only)
|
package/src/search-worker.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { parentPort } from 'worker_threads';
|
|
2
2
|
import { resolve, relative } from 'path';
|
|
3
|
-
import { existsSync, readFileSync } from 'fs';
|
|
3
|
+
import { existsSync, readFileSync, statSync } from 'fs';
|
|
4
4
|
import { loadIgnorePatterns } from './ignore-parser.js';
|
|
5
5
|
import { scanRepository } from './scanner.js';
|
|
6
6
|
import { buildTextIndex, searchText } from './text-search.js';
|
|
@@ -21,22 +21,34 @@ function findEnclosingContext(content, lineStart) {
|
|
|
21
21
|
}
|
|
22
22
|
|
|
23
23
|
// path -> "<mtimeMs>:<size>" of the file when its line count was cached; used
// to notice that a cached count has gone stale.
const fileLineCountStamp = new Map();

/**
 * Count the lines of a file, reusing the cached count while the file is
 * unchanged.
 *
 * The cache is validated with a stat call (modification time and size), so a
 * long-running worker does not keep reporting the old total after a file is
 * edited, and returns null once a file has been deleted.
 *
 * @param {string} absoluteFilePath - File to measure.
 * @returns {number|null} Number of '\n'-separated segments in the file, or
 *   null when the file cannot be stat'ed or read.
 */
function getFileTotalLines(absoluteFilePath) {
  try {
    const { mtimeMs, size } = statSync(absoluteFilePath);
    const stamp = `${mtimeMs}:${size}`;

    if (
      fileLineCountStamp.get(absoluteFilePath) === stamp &&
      fileLineCountCache.has(absoluteFilePath)
    ) {
      return fileLineCountCache.get(absoluteFilePath);
    }

    const content = readFileSync(absoluteFilePath, 'utf8');
    const count = content.split('\n').length;
    fileLineCountCache.set(absoluteFilePath, count);
    fileLineCountStamp.set(absoluteFilePath, stamp);
    return count;
  } catch {
    return null;
  }
}
|
|
31
36
|
|
|
32
37
|
let indexCache = new Map();
|
|
38
|
+
// Cache file line counts to avoid repeated disk reads on every search
|
|
39
|
+
const fileLineCountCache = new Map();
|
|
33
40
|
|
|
34
41
|
async function initializeIndex(repositoryPath) {
|
|
35
42
|
const absolutePath = resolve(repositoryPath);
|
|
36
|
-
const cacheKey = absolutePath;
|
|
37
43
|
|
|
38
|
-
|
|
39
|
-
|
|
44
|
+
const cached = indexCache.get(absolutePath);
|
|
45
|
+
if (cached) {
|
|
46
|
+
try {
|
|
47
|
+
const dirStat = statSync(absolutePath);
|
|
48
|
+
if (dirStat.mtimeMs <= cached.indexedAt) return cached;
|
|
49
|
+
} catch {
|
|
50
|
+
return cached;
|
|
51
|
+
}
|
|
40
52
|
}
|
|
41
53
|
|
|
42
54
|
try {
|
|
@@ -48,8 +60,8 @@ async function initializeIndex(repositoryPath) {
|
|
|
48
60
|
}
|
|
49
61
|
|
|
50
62
|
const indexData = buildTextIndex(chunks);
|
|
51
|
-
const result = { chunks, indexData };
|
|
52
|
-
indexCache.set(
|
|
63
|
+
const result = { chunks, indexData, indexedAt: Date.now() };
|
|
64
|
+
indexCache.set(absolutePath, result);
|
|
53
65
|
|
|
54
66
|
return result;
|
|
55
67
|
} catch (error) {
|
|
@@ -71,7 +83,17 @@ async function performSearch(repositoryPath, query) {
|
|
|
71
83
|
return { error: indexData.error, results: [] };
|
|
72
84
|
}
|
|
73
85
|
|
|
74
|
-
const
|
|
86
|
+
const rawResults = searchText(query, indexData.chunks, indexData.indexData);
|
|
87
|
+
|
|
88
|
+
// Deduplicate: keep best-scoring chunk per file, then take top results
|
|
89
|
+
const bestPerFile = new Map();
|
|
90
|
+
for (const r of rawResults) {
|
|
91
|
+
const existing = bestPerFile.get(r.file_path);
|
|
92
|
+
if (!existing || r.score > existing.score) {
|
|
93
|
+
bestPerFile.set(r.file_path, r);
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
const results = Array.from(bestPerFile.values()).sort((a, b) => b.score - a.score);
|
|
75
97
|
|
|
76
98
|
return {
|
|
77
99
|
query,
|
package/src/text-search.js
CHANGED
|
@@ -1,35 +1,72 @@
|
|
|
1
1
|
export function buildTextIndex(chunks) {
|
|
2
2
|
const index = new Map();
|
|
3
|
-
const chunkMetadata =
|
|
4
|
-
|
|
5
|
-
chunks.
|
|
6
|
-
const
|
|
7
|
-
const
|
|
8
|
-
const
|
|
9
|
-
const
|
|
10
|
-
|
|
11
|
-
tokens.forEach(token => {
|
|
12
|
-
frequency.set(token, (frequency.get(token) || 0) + 1);
|
|
13
|
-
if (!index.has(token)) {
|
|
14
|
-
index.set(token, new Set());
|
|
15
|
-
}
|
|
16
|
-
index.get(token).add(idx);
|
|
17
|
-
});
|
|
3
|
+
const chunkMetadata = new Array(chunks.length);
|
|
4
|
+
|
|
5
|
+
for (let idx = 0; idx < chunks.length; idx++) {
|
|
6
|
+
const chunk = chunks[idx];
|
|
7
|
+
const frequency = tokenizeToFrequency(chunk.content, index, idx);
|
|
8
|
+
const fileNameTokens = new Set(tokenize(chunk.file_path));
|
|
9
|
+
const symbols = new Set(extractSymbols(chunk.content));
|
|
18
10
|
|
|
19
11
|
chunkMetadata[idx] = {
|
|
20
|
-
tokens,
|
|
21
12
|
fileNameTokens,
|
|
22
13
|
symbols,
|
|
23
14
|
frequency,
|
|
24
15
|
isCode: isCodeFile(chunk.file_path),
|
|
16
|
+
contentLower: chunk.content.toLowerCase(),
|
|
25
17
|
};
|
|
26
|
-
}
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
// Precompute IDF for each token: log((N+1)/(df+1))
|
|
21
|
+
const N = chunks.length;
|
|
22
|
+
const idf = new Map();
|
|
23
|
+
for (const [token, docSet] of index) {
|
|
24
|
+
idf.set(token, Math.log((N + 1) / (docSet.size + 1)) + 1);
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
return { index, chunkMetadata, idf };
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
/**
 * Tokenize `text` and register every token in both the per-chunk frequency map
 * and the shared inverted index.
 *
 * Each whitespace-separated word yields its camelCase parts, its cleaned
 * lowercase form, and its parts split on '-', '_' and '.'. Every distinct
 * token is counted once per word occurrence, so "Error" and "error" each add
 * exactly 1 to the frequency of "error".
 *
 * @param {string} text - Chunk content to tokenize.
 * @param {Map<string, Set<number>>} index - Inverted index (token -> chunk
 *   indices); mutated in place.
 * @param {number} chunkIdx - Index of the chunk being tokenized.
 * @returns {Map<string, number>} Token frequencies for this chunk.
 */
function tokenizeToFrequency(text, index, chunkIdx) {
  const frequency = new Map();
  const camelPattern = /[A-Z]?[a-z]+|[A-Z]+(?=[A-Z][a-z]|\d|\W|$)|[0-9]+/g;
  // Distinct tokens derived from the current word; reused across words to
  // avoid allocating a Set per word.
  const wordTokens = new Set();

  for (const word of text.split(/\s+/)) {
    if (word.length === 0) continue;
    wordTokens.clear();

    // Only words containing an uppercase letter can have camelCase parts.
    if (word !== word.toLowerCase()) {
      const camelTokens = word.match(camelPattern);
      if (camelTokens) {
        for (const t of camelTokens) {
          if (t.length > 1) wordTokens.add(t.toLowerCase());
        }
      }
    }

    const cleaned = word.replace(/[^\w]/g, '').toLowerCase();
    if (cleaned.length > 1) {
      wordTokens.add(cleaned);
      if (word.includes('-') || word.includes('_') || word.includes('.')) {
        for (const part of word.split(/[-_.]/)) {
          const partCleaned = part.replace(/[^\w]/g, '').toLowerCase();
          if (partCleaned.length > 1) wordTokens.add(partCleaned);
        }
      }
    }

    for (const token of wordTokens) addToken(token, frequency, index, chunkIdx);
  }

  return frequency;
}

/**
 * Count one occurrence of `token` for the chunk and record the chunk in the
 * token's posting set.
 *
 * @param {string} token - Normalized token.
 * @param {Map<string, number>} frequency - Per-chunk frequency map; mutated.
 * @param {Map<string, Set<number>>} index - Inverted index; mutated.
 * @param {number} chunkIdx - Index of the chunk containing the token.
 */
function addToken(token, frequency, index, chunkIdx) {
  frequency.set(token, (frequency.get(token) || 0) + 1);
  let docSet = index.get(token);
  if (!docSet) {
    docSet = new Set();
    index.set(token, docSet);
  }
  docSet.add(chunkIdx);
}
|
|
30
67
|
|
|
31
68
|
export function searchText(query, chunks, indexData) {
|
|
32
|
-
const { index, chunkMetadata } = indexData;
|
|
69
|
+
const { index, chunkMetadata, idf } = indexData;
|
|
33
70
|
const queryTokens = tokenize(query);
|
|
34
71
|
const querySymbols = extractSymbols(query);
|
|
35
72
|
const chunkScores = new Map();
|
|
@@ -47,36 +84,53 @@ export function searchText(query, chunks, indexData) {
|
|
|
47
84
|
}
|
|
48
85
|
});
|
|
49
86
|
|
|
50
|
-
|
|
87
|
+
const queryLower = query.toLowerCase();
|
|
88
|
+
|
|
89
|
+
let scoringCandidates = candidates;
|
|
90
|
+
if (candidates.size > 500) {
|
|
91
|
+
const ranked = Array.from(candidates).sort((a, b) => {
|
|
92
|
+
let aSum = 0, bSum = 0;
|
|
93
|
+
for (const token of queryTokens) {
|
|
94
|
+
if (index.has(token)) {
|
|
95
|
+
if (index.get(token).has(a)) aSum += idf.get(token) || 1;
|
|
96
|
+
if (index.get(token).has(b)) bSum += idf.get(token) || 1;
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
return bSum - aSum;
|
|
100
|
+
});
|
|
101
|
+
scoringCandidates = new Set(ranked.slice(0, 500));
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
for (const idx of scoringCandidates) {
|
|
51
105
|
const chunk = chunks[idx];
|
|
52
106
|
const meta = chunkMetadata[idx];
|
|
53
107
|
let score = 0;
|
|
54
108
|
|
|
55
|
-
|
|
56
|
-
if (chunk.content.toLowerCase().includes(query.toLowerCase())) {
|
|
109
|
+
if (queryTokens.length > 1 && meta.contentLower.includes(queryLower)) {
|
|
57
110
|
score += 30;
|
|
58
111
|
}
|
|
59
112
|
|
|
60
113
|
// Symbol match in content - function/class named after query terms
|
|
61
114
|
querySymbols.forEach(symbol => {
|
|
62
|
-
if (meta.symbols.
|
|
115
|
+
if (meta.symbols.has(symbol)) score += 10;
|
|
63
116
|
});
|
|
64
117
|
|
|
65
118
|
// Filename token match - strong signal that this file is about the query topic
|
|
66
119
|
let fileNameMatches = 0;
|
|
67
120
|
queryTokens.forEach(token => {
|
|
68
|
-
if (meta.fileNameTokens.
|
|
121
|
+
if (meta.fileNameTokens.has(token)) fileNameMatches++;
|
|
69
122
|
});
|
|
70
123
|
if (fileNameMatches > 0) {
|
|
71
124
|
score += fileNameMatches * 10;
|
|
72
125
|
}
|
|
73
126
|
|
|
74
|
-
//
|
|
127
|
+
// TF-IDF scoring: reward rare tokens that appear in this chunk
|
|
75
128
|
queryTokens.forEach(token => {
|
|
76
129
|
if (index.has(token) && index.get(token).has(idx)) {
|
|
77
|
-
const
|
|
130
|
+
const tf = Math.min(meta.frequency.get(token) || 1, 5);
|
|
131
|
+
const tokenIdf = idf ? (idf.get(token) || 1) : 1;
|
|
78
132
|
const lengthBoost = token.length > 4 ? 1.5 : 1;
|
|
79
|
-
score += lengthBoost *
|
|
133
|
+
score += lengthBoost * tf * tokenIdf;
|
|
80
134
|
}
|
|
81
135
|
});
|
|
82
136
|
|