codebasesearch 0.1.11 → 0.1.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "codebasesearch",
3
- "version": "0.1.11",
3
+ "version": "0.1.13",
4
4
  "description": "Ultra-simple code search tool with Jina embeddings, LanceDB, and MCP protocol support",
5
5
  "type": "module",
6
6
  "bin": {
package/src/cli.js CHANGED
@@ -74,25 +74,26 @@ export async function run(args) {
74
74
  // Always reindex to ensure freshness
75
75
  console.log('Generating embeddings and indexing...');
76
76
 
77
- // Generate embeddings in batches
77
+ // Generate embeddings in batches and upsert immediately to free memory
78
78
  const batchSize = 32;
79
- const chunkTexts = chunks.map(c => c.content);
80
- const allEmbeddings = [];
79
+ let processedCount = 0;
81
80
 
82
- for (let i = 0; i < chunkTexts.length; i += batchSize) {
83
- const batchTexts = chunkTexts.slice(i, i + batchSize);
81
+ for (let i = 0; i < chunks.length; i += batchSize) {
82
+ const batchChunks = chunks.slice(i, i + batchSize);
83
+ const batchTexts = batchChunks.map(c => c.content);
84
84
  const batchEmbeddings = await generateEmbeddings(batchTexts);
85
- allEmbeddings.push(...batchEmbeddings);
86
- }
87
85
 
88
- // Create chunks with embeddings
89
- const chunksWithEmbeddings = chunks.map((chunk, idx) => ({
90
- ...chunk,
91
- vector: allEmbeddings[idx]
92
- }));
86
+ // Create batch with embeddings
87
+ const batchWithEmbeddings = batchChunks.map((chunk, idx) => ({
88
+ ...chunk,
89
+ vector: batchEmbeddings[idx]
90
+ }));
91
+
92
+ // Upsert immediately to free memory
93
+ await upsertChunks(batchWithEmbeddings);
94
+ processedCount += batchWithEmbeddings.length;
95
+ }
93
96
 
94
- // Upsert to store
95
- await upsertChunks(chunksWithEmbeddings);
96
97
  console.log('Index created\n');
97
98
 
98
99
  // Execute search
@@ -1,9 +1,87 @@
1
1
  import { readFileSync, existsSync } from 'fs';
2
- import { join, dirname } from 'path';
2
+ import { join, dirname, extname } from 'path';
3
3
  import { fileURLToPath } from 'url';
4
4
 
5
5
  const __dirname = dirname(fileURLToPath(import.meta.url));
6
6
 
7
+ // Whitelist of code file extensions to include
8
+ const CODE_EXTENSIONS = new Set([
9
+ // JavaScript/TypeScript
10
+ '.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs', '.mts', '.cts',
11
+ // Python
12
+ '.py', '.pyw', '.pyi',
13
+ // Java
14
+ '.java',
15
+ // C/C++
16
+ '.c', '.cpp', '.cc', '.cxx', '.h', '.hpp', '.hh', '.hxx',
17
+ // C#
18
+ '.cs',
19
+ // Go
20
+ '.go',
21
+ // Rust
22
+ '.rs',
23
+ // Ruby
24
+ '.rb',
25
+ // PHP
26
+ '.php', '.phtml',
27
+ // Swift
28
+ '.swift',
29
+ // Kotlin
30
+ '.kt', '.kts',
31
+ // Scala
32
+ '.scala', '.sc',
33
+ // Perl
34
+ '.pl', '.pm',
35
+ // Shell/Bash
36
+ '.sh', '.bash', '.zsh', '.fish',
37
+ // PowerShell
38
+ '.ps1', '.psm1', '.psd1',
39
+ // Lua
40
+ '.lua',
41
+ // R
42
+ '.r', '.R',
43
+ // MATLAB/Octave
44
+ '.m', '.mat',
45
+ // Julia
46
+ '.jl',
47
+ // Dart
48
+ '.dart',
49
+ // Elixir
50
+ '.ex', '.exs',
51
+ // Erlang
52
+ '.erl', '.hrl',
53
+ // Haskell
54
+ '.hs', '.lhs',
55
+ // Clojure
56
+ '.clj', '.cljs', '.cljc',
57
+ // Lisp
58
+ '.lisp', '.lsp', '.scm', '.ss', '.rkt',
59
+ // Fortran
60
+ '.f', '.for', '.f90', '.f95', '.f03',
61
+ // Assembly
62
+ '.asm', '.s', '.S',
63
+ // Groovy
64
+ '.groovy', '.gvy',
65
+ // Visual Basic
66
+ '.vb', '.vbs',
67
+ // F#
68
+ '.fs', '.fsx',
69
+ // OCaml
70
+ '.ml', '.mli',
71
+ // Objective-C
72
+ '.m', '.mm',
73
+ // Arduino
74
+ '.ino',
75
+ // Vue SFC
76
+ '.vue',
77
+ // Svelte
78
+ '.svelte',
79
+ // CoffeeScript
80
+ '.coffee',
81
+ // Reason
82
+ '.re', '.rei'
83
+ ]);
84
+
7
85
  function loadDefaultIgnores() {
8
86
  const ignorePath = join(__dirname, '..', '.thornsignore');
9
87
  if (!existsSync(ignorePath)) {
@@ -32,7 +110,97 @@ function getHardcodedIgnores() {
32
110
  'temp', 'tmp', '.tmp', '.DS_Store', 'Thumbs.db',
33
111
  '.swp', '.swo', '*.swp', '*.swo', '.tern-port',
34
112
  'dist-server', 'out-tsc', '.cache', '.parcel-cache',
35
- 'typings', '.env', '.env.local', '.env.*.local'
113
+ 'typings', '.env', '.env.local', '.env.*.local',
114
+ // JSON files - PRIMARY PRIORITY for memory reduction
115
+ '*.json', 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml',
116
+ 'Gemfile.lock', 'poetry.lock', 'Pipfile.lock',
117
+ // Lock files
118
+ '*.lock',
119
+ // Build outputs
120
+ 'public', 'static', 'site', '_site', '.docusaurus', '.gatsby',
121
+ // Cache/dependency directories
122
+ '.rush', '.lerna', '.nx',
123
+ // IDE/editor configs
124
+ '.cursor', '.replit', '.sublime-project', '.sublime-workspace',
125
+ '*.iml', '.project', '.classpath', '.settings', '*.sublime-*',
126
+ // OS files
127
+ '.Spotlight-V100', '.Trashes', 'ehthumbs.db', '.fseventsd',
128
+ '.TemporaryItems', '.AppleDouble', '.LSOverride', 'desktop.ini',
129
+ // Large data files
130
+ '*.db', '*.sqlite', '*.sqlite3', '*.bak', '*.dump',
131
+ '*.backup', '*.data', '*.orig',
132
+ // Logs and temp
133
+ '*.log', 'logs', 'npm-debug.log', 'yarn-error.log',
134
+ // Test coverage and reports
135
+ 'lcov.info', '.coverage', 'test-results',
136
+ // Database related
137
+ 'storage', 'fixtures',
138
+ // LLM/Vector related
139
+ '.llamaindex', '.chroma', '.vectorstore', '.embeddings',
140
+ '.langchain', '.autogen', '.semantic-kernel', '.openai-cache',
141
+ '.anthropic-cache', 'embeddings', 'vector-db', 'faiss-index',
142
+ 'chromadb', 'pinecone-cache', 'weaviate-data',
143
+ // Compiled output
144
+ '*.min.js', '*.min.css', '*.bundle.js', '*.chunk.js', '*.map',
145
+ // Generated/build artifacts
146
+ '.assets', 'out-tsc', 'cmake_build_debug', 'cmake_build_release',
147
+ // Version managers
148
+ '.rbenv', '.nvm', '.nvmrc',
149
+ // Ruby specific
150
+ '*.gem', '*.rbc', '/pkg', '/spec/reports', '/spec/examples.txt',
151
+ '/test/tmp', '/test/version_tmp', 'lib/bundler/man', '.ruby-version',
152
+ // Go specific
153
+ 'go.work',
154
+ // Rust specific
155
+ 'Cargo.lock', '**/*.rs.bk', '*.pdb',
156
+ // Java specific
157
+ '*.class', '*.jar', '*.war', '*.ear', '*.nar', '*.nupkg', '*.snupkg',
158
+ // C# specific
159
+ '*.suo', '*.user', '*.userosscache', '*.sln.docstates',
160
+ 'project.lock.json', 'project.fragment.lock.json', 'artifacts',
161
+ // C/C++ specific
162
+ '*.o', '*.a', '*.so', '*.exe', '*.obj', '*.dll', '*.dylib',
163
+ 'CMakeFiles', 'CMakeCache.txt', '*.cmake',
164
+ // Swift/Xcode specific
165
+ '*.xcodeproj', '*.xcworkspace', '*.moved-aside', '*.pbxuser',
166
+ '*.mode1v3', '*.mode2v3', '*.perspectivev3',
167
+ // Scala/SBT specific
168
+ 'lib_managed', 'src_managed', 'project/boot', 'project/plugins/project',
169
+ '.history', '.lib',
170
+ // PHP specific
171
+ 'composer.lock', '*.phar',
172
+ // Docker
173
+ '.dockerignore', 'docker-compose.override.yml', '.docker',
174
+ // Documentation build
175
+ 'docs/_build', '.vuepress',
176
+ // Testing frameworks
177
+ 'jest.config', 'vitest.config', 'pytest.ini', 'tox.ini',
178
+ '__tests__', '__mocks__', 'spec', 'cypress', 'playwright',
179
+ // Monorepo workspace patterns (implicit through directory coverage)
180
+ '.turbo', '.nx',
181
+ // Python package patterns
182
+ '*.py[cod]', '*$py.class', '.Python', 'pip-log.txt',
183
+ 'pip-delete-this-directory.txt', '.hypothesis', '.pyre', '.pytype',
184
+ '*.whl',
185
+ // Config/metadata that are typically low-value
186
+ '*.config.js', '*.config.ts', 'webpack.config.js', 'rollup.config.js',
187
+ 'vite.config.js', 'tsconfig.json', 'jsconfig.json', 'babel.config',
188
+ '.babelrc', '.eslintrc', '.prettierrc', '.stylelintrc', '.editorconfig',
189
+ '*.local', '*.development', '*.production',
190
+ // Node specific
191
+ '.npm', '.node_repl_history', '*.tsbuildinfo', 'yarn-error.log',
192
+ // Documentation/reference files that don't help with search
193
+ '*.md', '*.txt', '*.rst', '*.adoc', 'docs', 'documentation', 'wiki',
194
+ 'CHANGELOG', 'HISTORY', 'NEWS', 'UPGRADING', 'FAQ', 'CONTRIBUTING',
195
+ 'SECURITY', 'LICENSE', 'LICENCE', 'COPYRIGHT', 'NOTICE', 'AUTHORS',
196
+ 'THIRDPARTY',
197
+ // Test and coverage files
198
+ '*.test', '*.spec', 'test', 'tests', 'htmlcov',
199
+ // Profiling
200
+ '*.prof', '*.cpuprofile', '*.heapprofile',
201
+ // Misc
202
+ '.tern-port', 'firebase-debug.log', 'firestore-debug.log',
203
+ 'ui-debug.log', '.firebaserc', '.stackdump'
36
204
  ]);
37
205
  }
38
206
 
@@ -107,18 +275,139 @@ export function loadIgnorePatterns(rootPath) {
107
275
  return merged;
108
276
  }
109
277
 
278
+ // Directories to always ignore
279
+ const IGNORED_DIRECTORIES = new Set([
280
+ // Dependencies - NEVER include
281
+ 'node_modules', 'bower_components', 'jspm_packages', 'web_modules',
282
+ // Version control
283
+ '.git', '.svn', '.hg', '.bzr', '.vscode', '.idea', '.vs', '.atom', '.sublime-project',
284
+ // Build outputs - comprehensive list
285
+ 'dist', 'dist-server', 'dist-ssr', 'dist-client', 'dist-server',
286
+ 'build', 'built', 'Build', 'BUILD',
287
+ 'out', 'output', 'Output', 'OUT', 'release', 'Release', 'RELEASE',
288
+ 'target', 'Target', 'TARGET',
289
+ 'bin', 'Bin', 'BIN', 'obj', 'Obj', 'OBJ',
290
+ 'public', 'static', 'assets', 'www', 'wwwroot',
291
+ 'site', '_site', '.site', '.docusaurus', '.gatsby', '.vuepress',
292
+ 'storybook-static', '.nuxt', 'nuxt', '.next', 'next',
293
+ 'out-tsc', 'tsc', '.tsc',
294
+ // Cache directories
295
+ '.cache', 'cache', '.parcel-cache', '.vite', 'vite', '.turbo', 'turbo',
296
+ '.npm', '.yarn', '.pnp', '.pnpm-store', '.rush', '.lerna', '.nx',
297
+ // Testing
298
+ 'coverage', '.nyc_output', '.coverage', 'htmlcov', 'test-results',
299
+ 'test', 'tests', 'Test', 'Tests', 'TEST', 'TESTS',
300
+ '__tests__', '__mocks__', '__snapshots__', '__fixtures__',
301
+ 'cypress', 'playwright', 'e2e', 'integration', 'spec', 'specs',
302
+ '.tox', '.eggs', '.hypothesis', '.pyre', '.pytype',
303
+ // Python
304
+ '__pycache__', '.pytest_cache', '.mypy_cache', '.venv', 'venv', 'env',
305
+ 'env.bak', 'venv.bak', '.Python', 'pip-wheel-metadata', '*.egg-info',
306
+ // Java/Gradle/Maven
307
+ '.gradle', '.mvn', 'gradle', 'mvn', '.settings', '.project', '.classpath',
308
+ // iOS/Android
309
+ 'Pods', 'DerivedData', 'build', '.bundle', 'xcuserdata', '.xcodeproj', '.xcworkspace',
310
+ // Ruby
311
+ 'vendor', '.bundle', '.ruby-version', 'pkg',
312
+ // Rust
313
+ 'target', 'Cargo.lock',
314
+ // Go
315
+ 'vendor', 'Godeps',
316
+ // PHP
317
+ 'vendor', 'composer',
318
+ // Infrastructure
319
+ '.terraform', '.terragrunt-cache', '.pulumi', '.serverless', '.firebase',
320
+ '.aws', '.azure', '.gcloud', '.vercel', '.netlify', '.now',
321
+ // Docker
322
+ '.docker', 'docker', '.dockerignore',
323
+ // Temp files
324
+ 'temp', 'tmp', '.tmp', '.temp', 'tmpfs', 'scratch', '.scratch',
325
+ // Documentation
326
+ 'docs', 'doc', 'documentation', 'wiki', 'guides', 'examples', 'demo', 'demos',
327
+ 'CHANGELOG', 'HISTORY', 'NEWS', 'LICENSE', 'LICENCE', 'COPYING', 'AUTHORS',
328
+ // IDE/Editor
329
+ '.vs', '.vscode', '.idea', '.eclipse', '.settings', '.classpath', '.project',
330
+ // Logs
331
+ 'logs', 'log', '*.log',
332
+ // Data/Storage
333
+ 'storage', 'data', 'database', 'db', 'fixtures', 'seeds',
334
+ 'uploads', 'files', 'media', 'resources', 'assets', 'images', 'img',
335
+ // LLM/AI
336
+ '.llamaindex', '.chroma', '.vectorstore', '.embeddings',
337
+ '.langchain', '.autogen', '.semantic-kernel', '.openai-cache',
338
+ '.anthropic-cache', 'embeddings', 'vector-db', 'faiss-index',
339
+ 'chromadb', 'pinecone-cache', 'weaviate-data',
340
+ // Package managers
341
+ '.yarn', '.pnpm', '.npm', '.bun',
342
+ // Compiled outputs
343
+ 'typings', 'types', '@types', 'type-definitions',
344
+ // Misc
345
+ 'public', 'static', 'site', '_site',
346
+ 'cmake_build_debug', 'cmake_build_release', 'CMakeFiles', 'CMakeCache.txt',
347
+ 'out-tsc', 'dist-server', 'server', 'client', 'browser', 'esm', 'cjs', 'umd', 'lib', 'es'
348
+ ]);
349
+
350
+ export function isCodeFile(filePath) {
351
+ const normalizedPath = filePath.replace(/\\/g, '/');
352
+ const pathParts = normalizedPath.split('/');
353
+ const fileName = pathParts[pathParts.length - 1];
354
+
355
+ // Get file extension
356
+ const lastDotIndex = fileName.lastIndexOf('.');
357
+ if (lastDotIndex === -1 || lastDotIndex === 0) {
358
+ return false; // No extension or hidden file without extension
359
+ }
360
+
361
+ const ext = fileName.slice(lastDotIndex).toLowerCase();
362
+ return CODE_EXTENSIONS.has(ext);
363
+ }
364
+
365
+ export function shouldIgnoreDirectory(dirPath) {
366
+ const normalizedPath = dirPath.replace(/\\/g, '/');
367
+ const pathParts = normalizedPath.split('/');
368
+
369
+ for (const part of pathParts) {
370
+ if (IGNORED_DIRECTORIES.has(part)) {
371
+ return true;
372
+ }
373
+ }
374
+
375
+ return false;
376
+ }
377
+
110
378
  export function shouldIgnore(filePath, ignorePatterns) {
111
379
  const normalizedPath = filePath.replace(/\\/g, '/');
112
380
  const pathParts = normalizedPath.split('/');
381
+ const fileName = pathParts[pathParts.length - 1];
382
+
383
+ // Check if any directory in path should be ignored
384
+ for (const part of pathParts.slice(0, -1)) {
385
+ if (IGNORED_DIRECTORIES.has(part)) {
386
+ return true;
387
+ }
388
+ }
389
+
390
+ // Check if it's a code file using whitelist
391
+ if (!isCodeFile(filePath)) {
392
+ return true;
393
+ }
113
394
 
395
+ // Check against additional ignore patterns
114
396
  for (const pattern of ignorePatterns) {
397
+ // Handle path patterns (contain /)
115
398
  if (pattern.includes('/')) {
116
399
  if (normalizedPath.includes(pattern)) {
117
400
  return true;
118
401
  }
119
- } else {
402
+ }
403
+ // Handle exact file name patterns
404
+ else if (fileName === pattern) {
405
+ return true;
406
+ }
407
+ // Handle directory name patterns (match any path part)
408
+ else {
120
409
  for (const part of pathParts) {
121
- if (part === pattern) {
410
+ if (part === pattern || part.startsWith(pattern + '/')) {
122
411
  return true;
123
412
  }
124
413
  }
package/src/scanner.js CHANGED
@@ -23,7 +23,6 @@ const SUPPORTED_EXTENSIONS = new Set([
23
23
  '.groovy',
24
24
  '.gradle',
25
25
  '.xml', '.xsd',
26
- '.json', '.jsonc',
27
26
  '.yaml', '.yml',
28
27
  '.toml',
29
28
  '.html', '.htm',
@@ -95,7 +94,7 @@ function walkDirectory(dirPath, ignorePatterns, relativePath = '') {
95
94
  return files;
96
95
  }
97
96
 
98
- function chunkContent(content, chunkSize = 1000, overlapSize = 200) {
97
+ function chunkContent(content, chunkSize = 1000, overlapSize = 100) {
99
98
  const lines = content.split('\n');
100
99
  const chunks = [];
101
100
 
package/src/store.js CHANGED
@@ -4,6 +4,7 @@ import { mkdirSync, existsSync } from 'fs';
4
4
 
5
5
  let dbConnection = null;
6
6
  let tableRef = null;
7
+ let isFirstBatch = true;
7
8
 
8
9
  export async function initStore(dbPath) {
9
10
  // Ensure directory exists
@@ -19,6 +20,7 @@ export async function initStore(dbPath) {
19
20
  uri: dbDir,
20
21
  mode: 'overwrite'
21
22
  });
23
+ isFirstBatch = true;
22
24
  console.error('Vector store initialized');
23
25
  return true;
24
26
  } catch (e) {
@@ -68,14 +70,24 @@ export async function upsertChunks(chunks) {
68
70
  try {
69
71
  let table = null;
70
72
 
71
- // Try to open existing table
72
- try {
73
- table = await dbConnection.openTable(tableName);
74
- // Overwrite existing table with new data
75
- await table.overwrite(data);
76
- } catch (e) {
77
- // Table doesn't exist, create new one
78
- table = await dbConnection.createTable(tableName, data);
73
+ if (isFirstBatch) {
74
+ // First batch: try to open existing table, or create new one
75
+ try {
76
+ table = await dbConnection.openTable(tableName);
77
+ await table.overwrite(data);
78
+ } catch (e) {
79
+ table = await dbConnection.createTable(tableName, data);
80
+ }
81
+ isFirstBatch = false;
82
+ } else {
83
+ // Subsequent batches: add to existing table
84
+ try {
85
+ table = await dbConnection.openTable(tableName);
86
+ await table.add(data);
87
+ } catch (e) {
88
+ console.error('Failed to add to table:', e.message);
89
+ throw e;
90
+ }
79
91
  }
80
92
 
81
93
  tableRef = table;