codebasesearch 0.1.10 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/cli.js +15 -14
- package/src/ignore-parser.js +108 -3
- package/src/scanner.js +1 -2
- package/src/search-worker.js +7 -7
- package/src/store.js +20 -8
- package/src/text-search.js +96 -15
package/package.json
CHANGED
package/src/cli.js
CHANGED
|
@@ -74,25 +74,26 @@ export async function run(args) {
|
|
|
74
74
|
// Always reindex to ensure freshness
|
|
75
75
|
console.log('Generating embeddings and indexing...');
|
|
76
76
|
|
|
77
|
-
// Generate embeddings in batches
|
|
77
|
+
// Generate embeddings in batches and upsert immediately to free memory
|
|
78
78
|
const batchSize = 32;
|
|
79
|
-
|
|
80
|
-
const allEmbeddings = [];
|
|
79
|
+
let processedCount = 0;
|
|
81
80
|
|
|
82
|
-
for (let i = 0; i <
|
|
83
|
-
const
|
|
81
|
+
for (let i = 0; i < chunks.length; i += batchSize) {
|
|
82
|
+
const batchChunks = chunks.slice(i, i + batchSize);
|
|
83
|
+
const batchTexts = batchChunks.map(c => c.content);
|
|
84
84
|
const batchEmbeddings = await generateEmbeddings(batchTexts);
|
|
85
|
-
allEmbeddings.push(...batchEmbeddings);
|
|
86
|
-
}
|
|
87
85
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
86
|
+
// Create batch with embeddings
|
|
87
|
+
const batchWithEmbeddings = batchChunks.map((chunk, idx) => ({
|
|
88
|
+
...chunk,
|
|
89
|
+
vector: batchEmbeddings[idx]
|
|
90
|
+
}));
|
|
91
|
+
|
|
92
|
+
// Upsert immediately to free memory
|
|
93
|
+
await upsertChunks(batchWithEmbeddings);
|
|
94
|
+
processedCount += batchWithEmbeddings.length;
|
|
95
|
+
}
|
|
93
96
|
|
|
94
|
-
// Upsert to store
|
|
95
|
-
await upsertChunks(chunksWithEmbeddings);
|
|
96
97
|
console.log('Index created\n');
|
|
97
98
|
|
|
98
99
|
// Execute search
|
package/src/ignore-parser.js
CHANGED
|
@@ -32,7 +32,97 @@ function getHardcodedIgnores() {
|
|
|
32
32
|
'temp', 'tmp', '.tmp', '.DS_Store', 'Thumbs.db',
|
|
33
33
|
'.swp', '.swo', '*.swp', '*.swo', '.tern-port',
|
|
34
34
|
'dist-server', 'out-tsc', '.cache', '.parcel-cache',
|
|
35
|
-
'typings', '.env', '.env.local', '.env.*.local'
|
|
35
|
+
'typings', '.env', '.env.local', '.env.*.local',
|
|
36
|
+
// JSON files - PRIMARY PRIORITY for memory reduction
|
|
37
|
+
'*.json', 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml',
|
|
38
|
+
'Gemfile.lock', 'poetry.lock', 'Pipfile.lock',
|
|
39
|
+
// Lock files
|
|
40
|
+
'*.lock',
|
|
41
|
+
// Build outputs
|
|
42
|
+
'public', 'static', 'site', '_site', '.docusaurus', '.gatsby',
|
|
43
|
+
// Cache/dependency directories
|
|
44
|
+
'.rush', '.lerna', '.nx',
|
|
45
|
+
// IDE/editor configs
|
|
46
|
+
'.cursor', '.replit', '.sublime-project', '.sublime-workspace',
|
|
47
|
+
'*.iml', '.project', '.classpath', '.settings', '*.sublime-*',
|
|
48
|
+
// OS files
|
|
49
|
+
'.Spotlight-V100', '.Trashes', 'ehthumbs.db', '.fseventsd',
|
|
50
|
+
'.TemporaryItems', '.AppleDouble', '.LSOverride', 'desktop.ini',
|
|
51
|
+
// Large data files
|
|
52
|
+
'*.db', '*.sqlite', '*.sqlite3', '*.bak', '*.dump',
|
|
53
|
+
'*.backup', '*.data', '*.orig',
|
|
54
|
+
// Logs and temp
|
|
55
|
+
'*.log', 'logs', 'npm-debug.log', 'yarn-error.log',
|
|
56
|
+
// Test coverage and reports
|
|
57
|
+
'lcov.info', '.coverage', 'test-results',
|
|
58
|
+
// Database related
|
|
59
|
+
'storage', 'fixtures',
|
|
60
|
+
// LLM/Vector related
|
|
61
|
+
'.llamaindex', '.chroma', '.vectorstore', '.embeddings',
|
|
62
|
+
'.langchain', '.autogen', '.semantic-kernel', '.openai-cache',
|
|
63
|
+
'.anthropic-cache', 'embeddings', 'vector-db', 'faiss-index',
|
|
64
|
+
'chromadb', 'pinecone-cache', 'weaviate-data',
|
|
65
|
+
// Compiled output
|
|
66
|
+
'*.min.js', '*.min.css', '*.bundle.js', '*.chunk.js', '*.map',
|
|
67
|
+
// Generated/build artifacts
|
|
68
|
+
'.assets', 'out-tsc', 'cmake_build_debug', 'cmake_build_release',
|
|
69
|
+
// Version managers
|
|
70
|
+
'.rbenv', '.nvm', '.nvmrc',
|
|
71
|
+
// Ruby specific
|
|
72
|
+
'*.gem', '*.rbc', '/pkg', '/spec/reports', '/spec/examples.txt',
|
|
73
|
+
'/test/tmp', '/test/version_tmp', 'lib/bundler/man', '.ruby-version',
|
|
74
|
+
// Go specific
|
|
75
|
+
'go.work',
|
|
76
|
+
// Rust specific
|
|
77
|
+
'Cargo.lock', '**/*.rs.bk', '*.pdb',
|
|
78
|
+
// Java specific
|
|
79
|
+
'*.class', '*.jar', '*.war', '*.ear', '*.nar', '*.nupkg', '*.snupkg',
|
|
80
|
+
// C# specific
|
|
81
|
+
'*.suo', '*.user', '*.userosscache', '*.sln.docstates',
|
|
82
|
+
'project.lock.json', 'project.fragment.lock.json', 'artifacts',
|
|
83
|
+
// C/C++ specific
|
|
84
|
+
'*.o', '*.a', '*.so', '*.exe', '*.obj', '*.dll', '*.dylib',
|
|
85
|
+
'CMakeFiles', 'CMakeCache.txt', '*.cmake',
|
|
86
|
+
// Swift/Xcode specific
|
|
87
|
+
'*.xcodeproj', '*.xcworkspace', '*.moved-aside', '*.pbxuser',
|
|
88
|
+
'*.mode1v3', '*.mode2v3', '*.perspectivev3',
|
|
89
|
+
// Scala/SBT specific
|
|
90
|
+
'lib_managed', 'src_managed', 'project/boot', 'project/plugins/project',
|
|
91
|
+
'.history', '.lib',
|
|
92
|
+
// PHP specific
|
|
93
|
+
'composer.lock', '*.phar',
|
|
94
|
+
// Docker
|
|
95
|
+
'.dockerignore', 'docker-compose.override.yml', '.docker',
|
|
96
|
+
// Documentation build
|
|
97
|
+
'docs/_build', '.vuepress',
|
|
98
|
+
// Testing frameworks
|
|
99
|
+
'jest.config', 'vitest.config', 'pytest.ini', 'tox.ini',
|
|
100
|
+
'__tests__', '__mocks__', 'spec', 'cypress', 'playwright',
|
|
101
|
+
// Monorepo workspace patterns (implicit through directory coverage)
|
|
102
|
+
'.turbo', '.nx',
|
|
103
|
+
// Python package patterns
|
|
104
|
+
'*.py[cod]', '*$py.class', '.Python', 'pip-log.txt',
|
|
105
|
+
'pip-delete-this-directory.txt', '.hypothesis', '.pyre', '.pytype',
|
|
106
|
+
'*.whl',
|
|
107
|
+
// Config/metadata that are typically low-value
|
|
108
|
+
'*.config.js', '*.config.ts', 'webpack.config.js', 'rollup.config.js',
|
|
109
|
+
'vite.config.js', 'tsconfig.json', 'jsconfig.json', 'babel.config',
|
|
110
|
+
'.babelrc', '.eslintrc', '.prettierrc', '.stylelintrc', '.editorconfig',
|
|
111
|
+
'*.local', '*.development', '*.production',
|
|
112
|
+
// Node specific
|
|
113
|
+
'.npm', '.node_repl_history', '*.tsbuildinfo', 'yarn-error.log',
|
|
114
|
+
// Documentation/reference files that don't help with search
|
|
115
|
+
'*.md', '*.txt', '*.rst', '*.adoc', 'docs', 'documentation', 'wiki',
|
|
116
|
+
'CHANGELOG', 'HISTORY', 'NEWS', 'UPGRADING', 'FAQ', 'CONTRIBUTING',
|
|
117
|
+
'SECURITY', 'LICENSE', 'LICENCE', 'COPYRIGHT', 'NOTICE', 'AUTHORS',
|
|
118
|
+
'THIRDPARTY',
|
|
119
|
+
// Test and coverage files
|
|
120
|
+
'*.test', '*.spec', 'test', 'tests', 'htmlcov',
|
|
121
|
+
// Profiling
|
|
122
|
+
'*.prof', '*.cpuprofile', '*.heapprofile',
|
|
123
|
+
// Misc
|
|
124
|
+
'.tern-port', 'firebase-debug.log', 'firestore-debug.log',
|
|
125
|
+
'ui-debug.log', '.firebaserc', '.stackdump'
|
|
36
126
|
]);
|
|
37
127
|
}
|
|
38
128
|
|
|
@@ -110,15 +200,30 @@ export function loadIgnorePatterns(rootPath) {
|
|
|
110
200
|
export function shouldIgnore(filePath, ignorePatterns) {
|
|
111
201
|
const normalizedPath = filePath.replace(/\\/g, '/');
|
|
112
202
|
const pathParts = normalizedPath.split('/');
|
|
203
|
+
const fileName = pathParts[pathParts.length - 1];
|
|
113
204
|
|
|
114
205
|
for (const pattern of ignorePatterns) {
|
|
206
|
+
// Handle path patterns (contain /)
|
|
115
207
|
if (pattern.includes('/')) {
|
|
116
208
|
if (normalizedPath.includes(pattern)) {
|
|
117
209
|
return true;
|
|
118
210
|
}
|
|
119
|
-
}
|
|
211
|
+
}
|
|
212
|
+
// Handle extension patterns (*.ext)
|
|
213
|
+
else if (pattern.startsWith('*.')) {
|
|
214
|
+
const ext = pattern.slice(1);
|
|
215
|
+
if (fileName.endsWith(ext)) {
|
|
216
|
+
return true;
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
// Handle exact file name patterns
|
|
220
|
+
else if (fileName === pattern) {
|
|
221
|
+
return true;
|
|
222
|
+
}
|
|
223
|
+
// Handle directory name patterns (match any path part)
|
|
224
|
+
else {
|
|
120
225
|
for (const part of pathParts) {
|
|
121
|
-
if (part === pattern) {
|
|
226
|
+
if (part === pattern || part.startsWith(pattern + '/')) {
|
|
122
227
|
return true;
|
|
123
228
|
}
|
|
124
229
|
}
|
package/src/scanner.js
CHANGED
|
@@ -23,7 +23,6 @@ const SUPPORTED_EXTENSIONS = new Set([
|
|
|
23
23
|
'.groovy',
|
|
24
24
|
'.gradle',
|
|
25
25
|
'.xml', '.xsd',
|
|
26
|
-
'.json', '.jsonc',
|
|
27
26
|
'.yaml', '.yml',
|
|
28
27
|
'.toml',
|
|
29
28
|
'.html', '.htm',
|
|
@@ -95,7 +94,7 @@ function walkDirectory(dirPath, ignorePatterns, relativePath = '') {
|
|
|
95
94
|
return files;
|
|
96
95
|
}
|
|
97
96
|
|
|
98
|
-
function chunkContent(content, chunkSize = 1000, overlapSize =
|
|
97
|
+
function chunkContent(content, chunkSize = 1000, overlapSize = 100) {
|
|
99
98
|
const lines = content.split('\n');
|
|
100
99
|
const chunks = [];
|
|
101
100
|
|
package/src/search-worker.js
CHANGED
|
@@ -20,16 +20,16 @@ async function initializeIndex(repositoryPath) {
|
|
|
20
20
|
const chunks = scanRepository(absolutePath, ignorePatterns);
|
|
21
21
|
|
|
22
22
|
if (chunks.length === 0) {
|
|
23
|
-
return { error: 'No code chunks found', chunks: [],
|
|
23
|
+
return { error: 'No code chunks found', chunks: [], indexData: null };
|
|
24
24
|
}
|
|
25
25
|
|
|
26
|
-
const
|
|
27
|
-
const
|
|
28
|
-
indexCache.set(cacheKey,
|
|
26
|
+
const indexData = buildTextIndex(chunks);
|
|
27
|
+
const result = { chunks, indexData };
|
|
28
|
+
indexCache.set(cacheKey, result);
|
|
29
29
|
|
|
30
|
-
return
|
|
30
|
+
return result;
|
|
31
31
|
} catch (error) {
|
|
32
|
-
return { error: error.message, chunks: [],
|
|
32
|
+
return { error: error.message, chunks: [], indexData: null };
|
|
33
33
|
}
|
|
34
34
|
}
|
|
35
35
|
|
|
@@ -47,7 +47,7 @@ async function performSearch(repositoryPath, query) {
|
|
|
47
47
|
return { error: indexData.error, results: [] };
|
|
48
48
|
}
|
|
49
49
|
|
|
50
|
-
const results = searchText(query, indexData.chunks, indexData.
|
|
50
|
+
const results = searchText(query, indexData.chunks, indexData.indexData);
|
|
51
51
|
|
|
52
52
|
return {
|
|
53
53
|
query,
|
package/src/store.js
CHANGED
|
@@ -4,6 +4,7 @@ import { mkdirSync, existsSync } from 'fs';
|
|
|
4
4
|
|
|
5
5
|
let dbConnection = null;
|
|
6
6
|
let tableRef = null;
|
|
7
|
+
let isFirstBatch = true;
|
|
7
8
|
|
|
8
9
|
export async function initStore(dbPath) {
|
|
9
10
|
// Ensure directory exists
|
|
@@ -19,6 +20,7 @@ export async function initStore(dbPath) {
|
|
|
19
20
|
uri: dbDir,
|
|
20
21
|
mode: 'overwrite'
|
|
21
22
|
});
|
|
23
|
+
isFirstBatch = true;
|
|
22
24
|
console.error('Vector store initialized');
|
|
23
25
|
return true;
|
|
24
26
|
} catch (e) {
|
|
@@ -68,14 +70,24 @@ export async function upsertChunks(chunks) {
|
|
|
68
70
|
try {
|
|
69
71
|
let table = null;
|
|
70
72
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
73
|
+
if (isFirstBatch) {
|
|
74
|
+
// First batch: try to open existing table, or create new one
|
|
75
|
+
try {
|
|
76
|
+
table = await dbConnection.openTable(tableName);
|
|
77
|
+
await table.overwrite(data);
|
|
78
|
+
} catch (e) {
|
|
79
|
+
table = await dbConnection.createTable(tableName, data);
|
|
80
|
+
}
|
|
81
|
+
isFirstBatch = false;
|
|
82
|
+
} else {
|
|
83
|
+
// Subsequent batches: add to existing table
|
|
84
|
+
try {
|
|
85
|
+
table = await dbConnection.openTable(tableName);
|
|
86
|
+
await table.add(data);
|
|
87
|
+
} catch (e) {
|
|
88
|
+
console.error('Failed to add to table:', e.message);
|
|
89
|
+
throw e;
|
|
90
|
+
}
|
|
79
91
|
}
|
|
80
92
|
|
|
81
93
|
tableRef = table;
|
package/src/text-search.js
CHANGED
|
@@ -1,48 +1,129 @@
|
|
|
1
1
|
/**
 * Build the lookup structures consumed by searchText.
 *
 * For every chunk the content is tokenized and scanned for symbols; each
 * token is recorded both in a token -> chunk-position posting list and in a
 * per-chunk count map.
 *
 * @param {Array<object>} chunks - Entries exposing `content` and `file_path`.
 * @returns {{index: Map<string, number[]>, chunkMetadata: Array<object>}}
 *   `index` maps a token to the positions of the chunks containing it;
 *   `chunkMetadata[i]` holds `tokens`, `symbols`, `frequency` and `isCode`
 *   for `chunks[i]`.
 */
export function buildTextIndex(chunks) {
  const postingsByToken = new Map();
  const perChunk = [];

  chunks.forEach((entry, position) => {
    const entryTokens = tokenize(entry.content);
    const entrySymbols = extractSymbols(entry.content);
    const counts = new Map();

    for (const tok of entryTokens) {
      counts.set(tok, (counts.get(tok) || 0) + 1);

      // Append to the token's posting list, creating it on first sight.
      const postings = postingsByToken.get(tok);
      if (postings) {
        postings.push(position);
      } else {
        postingsByToken.set(tok, [position]);
      }
    }

    perChunk[position] = {
      tokens: entryTokens,
      symbols: entrySymbols,
      frequency: counts,
      isCode: isCodeFile(entry.file_path),
    };
  });

  return { index: postingsByToken, chunkMetadata: perChunk };
}
|
|
16
28
|
|
|
17
|
-
export function searchText(query, chunks,
|
|
29
|
+
/**
 * Score every chunk against a free-text query and return the matches,
 * best first.
 *
 * Scoring per chunk:
 *   - each query token present in the chunk adds `frequency * boost`
 *     (boost is 1.5 for tokens longer than 4 characters, otherwise 1);
 *   - each query symbol found among the chunk's symbols adds 5;
 *   - a verbatim (case-sensitive) occurrence of the query adds 10;
 *   - the total is multiplied by 1.2 when the chunk is flagged as code.
 *
 * @param {string} query - Raw user query.
 * @param {Array<object>} chunks - The chunks the index was built from.
 * @param {{index: Map<string, number[]>, chunkMetadata: Array<object>}|null} indexData
 *   Structures produced by buildTextIndex for the same `chunks`.
 * @returns {Array<object>} Matching chunks, each extended with `score`
 *   (raw score / 100, capped at 1) and `_rawScore`, sorted by `_rawScore`
 *   descending.
 */
export function searchText(query, chunks, indexData) {
  // Without an index there is nothing to search, and a blank query must not
  // match: `content.includes('')` is true for every string, so an empty
  // query used to return the entire repository as "exact matches".
  if (!indexData || typeof query !== 'string' || query.trim().length === 0) {
    return [];
  }

  const { index, chunkMetadata } = indexData;
  const queryTokens = tokenize(query);
  const querySymbols = extractSymbols(query);

  // Convert each query token's posting list to a Set once, so the per-chunk
  // membership test is O(1) instead of rescanning the list for every chunk.
  const postings = queryTokens.map(token => ({
    token,
    boost: token.length > 4 ? 1.5 : 1,
    chunkIds: new Set(index.get(token) ?? []),
  }));

  const chunkScores = new Map();

  chunks.forEach((chunk, idx) => {
    const meta = chunkMetadata[idx];
    let score = 0;

    for (const { token, boost, chunkIds } of postings) {
      if (chunkIds.has(idx)) {
        score += boost * (meta.frequency.get(token) || 1);
      }
    }

    for (const symbol of querySymbols) {
      if (meta.symbols.includes(symbol)) {
        score += 5;
      }
    }

    if (chunk.content.includes(query)) {
      score += 10;
    }

    if (meta.isCode) {
      score *= 1.2;
    }

    // Only positive scores are recorded, so no later filtering is needed.
    if (score > 0) {
      chunkScores.set(idx, score);
    }
  });

  return Array.from(chunkScores.entries())
    .map(([idx, score]) => ({
      ...chunks[idx],
      score: Math.min(score / 100, 1),
      _rawScore: score,
    }))
    .sort((a, b) => b._rawScore - a._rawScore);
}
|
|
43
79
|
|
|
44
80
|
/**
 * Split text into a de-duplicated list of lower-case search tokens.
 *
 * For every whitespace-separated word the following are emitted (tokens of
 * length <= 1 are dropped):
 *   - the word with all non-word characters removed;
 *   - its runs of letters and runs of digits;
 *   - its camelCase / PascalCase / acronym parts (`getUserID` -> get, user, id);
 *   - its pieces after splitting on `-` and `_`.
 *
 * @param {string} text - Source text or query.
 * @returns {string[]} Unique lower-case tokens; empty for non-string input.
 */
function tokenize(text) {
  if (typeof text !== 'string') return [];

  const tokens = new Set();
  const addToken = (candidate) => {
    if (candidate.length > 1) tokens.add(candidate);
  };

  for (const word of text.split(/\s+/)) {
    if (word.length === 0) continue;

    const lowered = word.toLowerCase();

    addToken(lowered.replace(/[^\w]/g, ''));

    // Letter and digit runs of the lower-cased word.
    for (const run of lowered.match(/[a-z]+|[0-9]+/g) ?? []) {
      addToken(run);
    }

    // camelCase parts must be taken from the ORIGINAL casing: once the word
    // is lower-cased the upper-case boundaries are gone and nothing splits.
    // `[A-Z]+(?![a-z])` keeps acronyms such as HTTP together.
    for (const part of word.match(/[A-Z]+(?![a-z])|[A-Z]?[a-z]+|[0-9]+/g) ?? []) {
      addToken(part.toLowerCase());
    }

    for (const part of lowered.split(/[-_]/)) {
      addToken(part);
    }
  }

  return Array.from(tokens);
}
|
|
101
|
+
|
|
102
|
+
/**
 * Collect the names of functions and classes declared in a piece of text.
 *
 * Recognised forms:
 *   - `function name`, `async function name`, `function* name`
 *     (with or without a leading `export`);
 *   - `const name = (` and `const name = async (`;
 *   - `class Name` (with or without a leading `export`).
 *
 * @param {string} text - Source text or query.
 * @returns {string[]} Unique symbol names, lower-cased; empty for
 *   non-string or empty input.
 */
function extractSymbols(text) {
  if (typeof text !== 'string' || text.length === 0) return [];

  // Each pattern captures the declared name in group 1. Reading the capture
  // group directly avoids re-parsing the matched text, which previously
  // looked for a trailing `=` or `(` that a `function foo` match never
  // contains, so function declarations were silently dropped.
  const patterns = [
    /\bfunction(?:\s+|\s*\*\s*)(\w+)/g,
    /const\s+(\w+)\s*=\s*(?:async\s*)?\(/g,
    /class\s+(\w+)/g,
  ];

  const symbols = new Set();
  for (const pattern of patterns) {
    for (const [, name] of text.matchAll(pattern)) {
      symbols.add(name.toLowerCase());
    }
  }

  return Array.from(symbols);
}
|
|
125
|
+
|
|
126
|
+
/**
 * Report whether a path ends in one of the source-code extensions that
 * receive a ranking boost.
 *
 * @param {string} filePath - Path of the file a chunk came from.
 * @returns {boolean} True for a recognised extension (compared
 *   case-insensitively); false otherwise, including when `filePath` is
 *   missing or not a string.
 */
function isCodeFile(filePath) {
  // Return false rather than throw when a chunk has no usable path.
  if (typeof filePath !== 'string') return false;

  const codeExtensions = ['.js', '.ts', '.jsx', '.tsx', '.py', '.java', '.go', '.rs', '.rb'];
  const lowered = filePath.toLowerCase();
  return codeExtensions.some(ext => lowered.endsWith(ext));
}
|