codebasesearch 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,129 @@
1
+ import { readFileSync, existsSync } from 'fs';
2
+ import { join, dirname } from 'path';
3
+ import { fileURLToPath } from 'url';
4
+
5
// ESM modules have no __dirname; reconstruct it from import.meta.url.
const __dirname = dirname(fileURLToPath(import.meta.url));
6
+
7
/**
 * Load the bundled default ignore patterns.
 *
 * Prefers the `.thornsignore` file shipped one directory above this module;
 * when that file is missing or unreadable, falls back to the hardcoded set.
 *
 * @returns {Set<string>} default ignore patterns
 */
function loadDefaultIgnores() {
  const bundledIgnoreFile = join(__dirname, '..', '.thornsignore');

  if (existsSync(bundledIgnoreFile)) {
    try {
      return parseIgnoreFile(readFileSync(bundledIgnoreFile, 'utf8'));
    } catch (e) {
      // Fall through to the hardcoded fallback on read failure.
    }
  }

  return getHardcodedIgnores();
}
20
+
21
/**
 * Built-in fallback ignore set, used when no `.thornsignore` file ships
 * with the package. Entries are grouped by ecosystem; bare names match
 * any path segment, `*`-patterns are kept verbatim.
 *
 * @returns {Set<string>} hardcoded ignore patterns
 */
function getHardcodedIgnores() {
  const defaults = [
    // Version control
    '.git', '.svn', '.hg',
    // Dependency and build output
    'node_modules', 'dist', 'build', 'out', 'target', 'vendor',
    'bin', 'obj', 'dist-server', 'out-tsc', 'typings',
    // Python
    '__pycache__', '.pytest_cache', '.mypy_cache', '.tox', '.eggs',
    '*.egg-info', '.venv', 'venv', 'env',
    // JS framework / tool caches
    '.next', '.nuxt', '.cache', '.parcel-cache', '.vite', '.turbo',
    '.tsc', '.eslintcache', '.stylelintcache',
    // Coverage
    'coverage', '.nyc_output',
    // Cloud / infra
    '.firebase', '.terraform', '.aws', '.azure', '.gcloud',
    // Editors / IDEs
    '.vscode', '.idea', '.vs', '.tern-port',
    // JVM / mobile
    '.gradle', '.mvn', 'Pods', 'DerivedData', '.bundle',
    // Package managers
    '.yarn', '.pnp', 'pnpm-lock.yaml', '.pnpm-store',
    // Temp files and OS droppings
    'temp', 'tmp', '.tmp', '.DS_Store', 'Thumbs.db',
    '.swp', '.swo', '*.swp', '*.swo',
    // Environment files
    '.env', '.env.local', '.env.*.local'
  ];
  return new Set(defaults);
}
38
+
39
/**
 * Parse gitignore-style text into a set of simplified patterns.
 *
 * Supported subset: comments (#) and blank lines are skipped, negation
 * patterns (!) are skipped entirely, a trailing slash is stripped from
 * directory patterns, and trailing "/*"-style wildcards are removed.
 * Any other wildcard pattern is kept verbatim.
 *
 * @param {string} content - raw ignore-file text
 * @returns {Set<string>} parsed patterns
 */
function parseIgnoreFile(content) {
  const patterns = new Set();

  for (const rawLine of content.split('\n')) {
    let pattern = rawLine.trim();

    // Comments and blank lines carry no pattern.
    if (pattern === '' || pattern.startsWith('#')) continue;

    // Directory patterns: drop the trailing slash.
    if (pattern.endsWith('/')) {
      pattern = pattern.slice(0, -1);
    }

    // Negation patterns are not supported yet.
    if (pattern.startsWith('!')) continue;

    // Collapse trailing "/*", "/**" … down to the directory name.
    if (pattern.includes('*')) {
      pattern = pattern.replace(/\/\*+$/, '');
    }

    if (pattern !== '') {
      patterns.add(pattern);
    }
  }

  return patterns;
}
70
+
71
/**
 * Collect ignore patterns from well-known ignore files in the project root.
 * Missing files are skipped; unreadable files are ignored silently.
 *
 * @param {string} rootPath - project root directory
 * @returns {Set<string>} union of all patterns found
 */
function loadProjectIgnores(rootPath) {
  const IGNORE_FILE_NAMES = [
    '.gitignore',
    '.dockerignore',
    '.npmignore',
    '.eslintignore',
    '.prettierignore',
    '.thornsignore',
    '.codesearchignore'
  ];

  const merged = new Set();

  for (const name of IGNORE_FILE_NAMES) {
    const candidate = join(rootPath, name);
    if (!existsSync(candidate)) continue;

    try {
      const parsed = parseIgnoreFile(readFileSync(candidate, 'utf8'));
      parsed.forEach((pattern) => merged.add(pattern));
    } catch (e) {
      // Best-effort: an unreadable ignore file contributes nothing.
    }
  }

  return merged;
}
100
+
101
/**
 * Build the effective ignore set for a scan: project-level patterns merged
 * on top of the bundled defaults.
 *
 * @param {string} rootPath - project root directory
 * @returns {Set<string>} merged ignore patterns
 */
export function loadIgnorePatterns(rootPath) {
  return new Set([...loadDefaultIgnores(), ...loadProjectIgnores(rootPath)]);
}
109
+
110
/**
 * Decide whether a path matches any ignore pattern.
 *
 * Patterns containing '/' match as substrings of the normalized
 * (forward-slash) path; bare patterns match any single path segment
 * exactly.
 *
 * @param {string} filePath - relative or absolute path to test
 * @param {Iterable<string>} ignorePatterns - patterns from loadIgnorePatterns()
 * @returns {boolean} true when the path should be skipped
 */
export function shouldIgnore(filePath, ignorePatterns) {
  const normalized = filePath.replace(/\\/g, '/');
  const segments = normalized.split('/');

  for (const pattern of ignorePatterns) {
    const matched = pattern.includes('/')
      ? normalized.includes(pattern)
      : segments.includes(pattern);
    if (matched) {
      return true;
    }
  }

  return false;
}
@@ -0,0 +1,38 @@
1
// Auto-patch transformers dist for sharp removal
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';

const __dirname = path.dirname(fileURLToPath(import.meta.url));
const distPath = path.join(__dirname, '..', 'node_modules', '@huggingface', 'transformers', 'dist', 'transformers.node.mjs');

// Each entry: [pattern, replacement]. The first replacement also plants the
// sentinel comment that marks the file as already patched.
const PATCHES = [
  // Drop the hard import of sharp.
  [/import \* as __WEBPACK_EXTERNAL_MODULE_sharp__ from "sharp";\n/,
    '// SHARP_REMOVED_FOR_WINDOWS_COMPATIBILITY\n'],
  // Stub out the sharp module export.
  [/module\.exports = __WEBPACK_EXTERNAL_MODULE_sharp__;/g,
    'module.exports = {};'],
  // Downgrade the load-time failure to a call-time error.
  [/} else \{\s*throw new Error\('Unable to load image processing library\.'\);\s*\}/,
    '} else {\n loadImageFunction = async () => { throw new Error(\'Image processing unavailable\'); };\n}']
];

if (fs.existsSync(distPath)) {
  const original = fs.readFileSync(distPath, 'utf-8');

  // Only patch if not already patched (sentinel inserted by first patch).
  if (!original.includes('SHARP_REMOVED_FOR_WINDOWS_COMPATIBILITY')) {
    const patched = PATCHES.reduce(
      (text, [pattern, replacement]) => text.replace(pattern, replacement),
      original
    );

    try {
      fs.writeFileSync(distPath, patched);
    } catch (e) {
      // Silently continue if unable to patch (read-only filesystem)
    }
  }
}
package/src/scanner.js ADDED
@@ -0,0 +1,172 @@
1
+ import { readdirSync, statSync, readFileSync } from 'fs';
2
+ import { join, relative } from 'path';
3
+ import { shouldIgnore } from './ignore-parser.js';
4
+
5
/**
 * File extensions (lowercase, dot included) the scanner will index,
 * grouped by ecosystem. Checked against getFileExtension(), which
 * lowercases — so '.R' is effectively redundant with '.r' but kept for
 * fidelity.
 */
const SUPPORTED_EXTENSIONS = new Set([
  // JavaScript / TypeScript
  '.js', '.ts', '.tsx', '.jsx', '.mjs', '.cjs',
  // Python
  '.py', '.pyw', '.pyi',
  // Go / Rust
  '.go', '.rs',
  // JVM languages and builds
  '.java', '.kt', '.scala', '.groovy', '.gradle',
  // C / C++ / Objective-C
  '.c', '.cpp', '.cc', '.cxx', '.h', '.hpp', '.hxx', '.m', '.mm',
  // Ruby / PHP / C# / Swift
  '.rb', '.erb', '.php', '.cs', '.csx', '.swift',
  // Shell / SQL / scripting
  '.sh', '.bash', '.zsh', '.sql', '.r', '.R', '.lua', '.pl', '.pm',
  // Markup and config
  '.xml', '.xsd', '.json', '.jsonc', '.yaml', '.yml', '.toml',
  // Web
  '.html', '.htm', '.css', '.scss', '.sass', '.less', '.vue', '.svelte',
  // Docs
  '.md', '.markdown'
]);
34
+
35
/**
 * Return the lowercase file extension (dot included) of a name or path,
 * or '' when there is none.
 *
 * Fix: a dot inside a directory component (e.g. "src.v2/Makefile") was
 * previously mistaken for an extension; only dots after the last path
 * separator now count. A leading dot in the basename (".gitignore") is
 * still treated as an extension, matching the original behavior.
 *
 * @param {string} filePath - file name or path
 * @returns {string} extension such as '.js', or ''
 */
function getFileExtension(filePath) {
  const lastDot = filePath.lastIndexOf('.');
  if (lastDot === -1) return '';

  // Ignore dots that belong to a directory component.
  const lastSep = Math.max(filePath.lastIndexOf('/'), filePath.lastIndexOf('\\'));
  if (lastDot < lastSep) return '';

  return filePath.substring(lastDot).toLowerCase();
}
40
+
41
// Extensions that always denote binary content. Hoisted to module scope so
// the Set is built once instead of on every isBinaryFile() call (the
// original rebuilt it per invocation inside the hot directory walk).
const BINARY_EXTENSIONS = new Set([
  '.zip', '.tar', '.gz', '.rar', '.7z', '.iso',
  '.exe', '.dll', '.so', '.dylib', '.bin',
  '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.ico',
  '.mp3', '.mp4', '.mov', '.avi', '.flv', '.m4a',
  '.pdf', '.doc', '.docx', '.xls', '.xlsx',
  '.woff', '.woff2', '.ttf', '.otf', '.eot'
]);

/**
 * True when the file's extension marks it as a known binary format.
 *
 * @param {string} filePath - file name or path
 * @returns {boolean} true for binary extensions
 */
function isBinaryFile(filePath) {
  return BINARY_EXTENSIONS.has(getFileExtension(filePath));
}
53
+
54
/**
 * Recursively collect indexable files under dirPath.
 *
 * Skips ignored entries, unsupported or binary extensions, and files over
 * 5 MB; unreadable directories and files are ignored silently.
 *
 * Fix: `entry.isSymbolicLink` was referenced without calling it. A function
 * object is always truthy, so `stat` was always null — the size limit never
 * applied and every mtime was Date.now(). The method is now invoked.
 *
 * @param {string} dirPath - absolute directory to walk
 * @param {Set<string>} ignorePatterns - patterns for shouldIgnore()
 * @param {string} [relativePath] - path of dirPath relative to the scan root
 * @returns {Array<{fullPath: string, relativePath: string, mtime: number}>}
 */
function walkDirectory(dirPath, ignorePatterns, relativePath = '') {
  const files = [];
  const MAX_FILE_SIZE = 5 * 1024 * 1024; // 5 MB

  try {
    const entries = readdirSync(dirPath, { withFileTypes: true });

    for (const entry of entries) {
      const fullPath = join(dirPath, entry.name);
      const relPath = relativePath ? join(relativePath, entry.name) : entry.name;
      // Normalize to forward slashes for consistent ignore pattern matching
      const normalizedRelPath = relPath.replace(/\\/g, '/');

      if (shouldIgnore(normalizedRelPath, ignorePatterns)) {
        continue;
      }

      if (entry.isDirectory()) {
        files.push(...walkDirectory(fullPath, ignorePatterns, relPath));
      } else if (entry.isFile()) {
        const ext = getFileExtension(entry.name);
        if (!SUPPORTED_EXTENSIONS.has(ext) || isBinaryFile(entry.name)) {
          continue;
        }

        try {
          // BUG FIX: call isSymbolicLink() — previously the unbound method
          // reference was always truthy, so statSync never ran.
          const stat = entry.isSymbolicLink() ? null : statSync(fullPath);
          if (!stat || stat.size <= MAX_FILE_SIZE) {
            files.push({
              fullPath,
              relativePath: normalizedRelPath,
              mtime: stat ? stat.mtime.getTime() : Date.now()
            });
          }
        } catch (e) {
          // Unreadable file: skip it.
        }
      }
    }
  } catch (e) {
    // Ignore read errors for individual directories
  }

  return files;
}
97
+
98
/**
 * Split file content into line-based chunks with overlap between
 * consecutive chunks, so matches near a boundary are not lost.
 *
 * Fix: when overlapSize >= chunkSize the loop step was <= 0, spinning
 * forever on any input longer than chunkSize lines; that case now throws.
 *
 * @param {string} content - full file text
 * @param {number} [chunkSize=1000] - max lines per chunk
 * @param {number} [overlapSize=200] - lines shared with the previous chunk
 * @returns {Array<{content: string, line_start: number, line_end: number}>}
 *   chunks with 1-based inclusive line ranges; whitespace-only chunks
 *   are dropped
 * @throws {RangeError} when overlapSize >= chunkSize
 */
function chunkContent(content, chunkSize = 1000, overlapSize = 200) {
  // Guard: a non-positive step would never advance the window.
  if (overlapSize >= chunkSize) {
    throw new RangeError('overlapSize must be smaller than chunkSize');
  }

  const lines = content.split('\n');
  const chunks = [];
  const step = chunkSize - overlapSize;

  for (let i = 0; i < lines.length; i += step) {
    const endIdx = Math.min(i + chunkSize, lines.length);
    const chunk = lines.slice(i, endIdx).join('\n');

    // Skip chunks that contain only whitespace.
    if (chunk.trim().length > 0) {
      chunks.push({
        content: chunk,
        line_start: i + 1,
        line_end: endIdx
      });
    }

    // Stop once the window reaches the end of the file.
    if (endIdx === lines.length) {
      break;
    }
  }

  return chunks;
}
122
+
123
/**
 * Scan a repository and return indexable text chunks.
 *
 * Files of up to 1000 lines become a single chunk covering the whole
 * file; longer files are split by chunkContent(). Unreadable files are
 * skipped silently.
 *
 * Fix: content.split('\n') was computed twice for small files (once for
 * the length check, once for line_end); it is now computed once.
 *
 * @param {string} rootPath - repository root directory
 * @param {Set<string>} ignorePatterns - patterns for shouldIgnore()
 * @returns {Array<object>} chunk records with file_path, chunk_index,
 *   content, line_start, line_end, and mtime
 */
export function scanRepository(rootPath, ignorePatterns) {
  const files = walkDirectory(rootPath, ignorePatterns);
  const chunks = [];

  for (const file of files) {
    try {
      const content = readFileSync(file.fullPath, 'utf8');
      const lineCount = content.split('\n').length;
      const mtime = file.mtime;

      if (lineCount <= 1000) {
        // Small file: one chunk spanning every line.
        chunks.push({
          file_path: file.relativePath,
          chunk_index: 0,
          content,
          line_start: 1,
          line_end: lineCount,
          mtime
        });
      } else {
        // Large file: overlapping line-based chunks.
        chunkContent(content).forEach((chunk, idx) => {
          chunks.push({
            file_path: file.relativePath,
            chunk_index: idx,
            content: chunk.content,
            line_start: chunk.line_start,
            line_end: chunk.line_end,
            mtime
          });
        });
      }
    } catch (e) {
      // Ignore read errors for individual files
    }
  }

  return chunks;
}
163
+
164
/**
 * Build a map of file_path -> mtime taken from each file's first chunk.
 *
 * Fix: the original used a truthiness check (`!stats[path]`), so a file
 * whose first chunk carried mtime 0 (the Unix epoch) would be overwritten
 * by later chunks. An explicit own-key check keeps first-chunk wins.
 *
 * @param {Array<{file_path: string, mtime: number}>} chunks
 * @returns {Object<string, number>} mtime per file path
 */
export function getFileStats(chunks) {
  const stats = {};
  for (const chunk of chunks) {
    if (!Object.hasOwn(stats, chunk.file_path)) {
      stats[chunk.file_path] = chunk.mtime;
    }
  }
  return stats;
}
package/src/search.js ADDED
@@ -0,0 +1,45 @@
1
+ import { generateSingleEmbedding } from './embeddings.js';
2
+ import { searchSimilar } from './store.js';
3
+
4
/**
 * Run a semantic search: embed the query and rank stored chunks by
 * vector similarity.
 *
 * @param {string} query - natural-language or code query
 * @param {number} [limit=10] - maximum number of results
 * @returns {Promise<Array<object>>} ranked results from the vector store
 * @throws {Error} when the query is empty or whitespace-only
 */
export async function executeSearch(query, limit = 10) {
  const isBlank = !query || query.trim().length === 0;
  if (isBlank) {
    throw new Error('Query cannot be empty');
  }

  console.error(`Searching for: "${query}"`);

  // Embed the query, then look up the nearest chunks.
  const queryEmbedding = await generateSingleEmbedding(query);
  return searchSimilar(queryEmbedding, limit);
}
19
+
20
/**
 * Render search results as a human-readable text report.
 *
 * Each entry shows its rank, file location, similarity score as a
 * percentage, and the first three lines of the matched code (each line
 * capped at 80 characters).
 *
 * @param {Array<object>} results - entries from searchSimilar()
 * @returns {string} formatted report, or a "no results" message
 */
export function formatResults(results) {
  if (results.length === 0) {
    return 'No results found.';
  }

  const plural = results.length !== 1 ? 's' : '';
  const output = [`\nFound ${results.length} result${plural}:\n`];

  results.forEach((result, index) => {
    const rank = index + 1;
    const score = (result.score * 100).toFixed(1);
    output.push(`${rank}. ${result.file_path}:${result.line_start}-${result.line_end} (score: ${score}%)`);

    // Preview: first three lines of the chunk, truncated to 80 chars each.
    for (const snippetLine of result.content.split('\n').slice(0, 3)) {
      output.push(` > ${snippetLine.slice(0, 80)}`);
    }

    output.push('');
  });

  return output.join('\n');
}
package/src/store.js ADDED
@@ -0,0 +1,166 @@
1
+ import { connect } from 'vectordb';
2
+ import { join } from 'path';
3
+ import { mkdirSync, existsSync } from 'fs';
4
+
5
// Module-level singleton handles: dbConnection is set by initStore(),
// tableRef by getTable()/upsertChunks(); both are reset by closeStore().
let dbConnection = null;
let tableRef = null;
7
+
8
/**
 * Create (if needed) the LanceDB directory under dbPath and open a
 * connection to it. Must be called before any other store operation.
 *
 * @param {string} dbPath - base directory for the index
 * @returns {Promise<boolean>} true on success
 * @throws rethrows any connection failure after logging it
 */
export async function initStore(dbPath) {
  const dbDir = join(dbPath, 'lancedb');

  // Ensure directory exists
  if (!existsSync(dbDir)) {
    mkdirSync(dbDir, { recursive: true });
  }

  try {
    // LanceDB is embedded and file-based — no network involved.
    // The absolute path keeps this working on Windows as well.
    dbConnection = await connect({ uri: dbDir, mode: 'overwrite' });
    console.error('Vector store initialized');
    return true;
  } catch (e) {
    console.error('Failed to initialize vector store:', e.message);
    throw e;
  }
}
29
+
30
/**
 * Open the 'code_chunks' table if it exists, caching the handle in
 * tableRef. Yields null when the table has not been created yet.
 *
 * @returns {Promise<object|null>} the table handle, or null
 * @throws {Error} when initStore() has not been called
 */
export async function getTable() {
  if (!dbConnection) {
    throw new Error('Store not initialized. Call initStore first.');
  }

  try {
    tableRef = await dbConnection.openTable('code_chunks');
  } catch (e) {
    // Table doesn't exist yet; it will be created on first insert.
    tableRef = null;
  }

  return tableRef;
}
47
+
48
/**
 * Write chunk records (with embedding vectors) to the 'code_chunks'
 * table, replacing any previous contents. No-op for an empty list.
 *
 * @param {Array<object>} chunks - records carrying file_path, chunk_index,
 *   content, line_start, line_end, vector, and mtime
 * @throws {Error} when the store is uninitialized; rethrows write failures
 */
export async function upsertChunks(chunks) {
  if (!dbConnection) {
    throw new Error('Store not initialized');
  }
  if (chunks.length === 0) {
    return;
  }

  const tableName = 'code_chunks';

  // Coerce every field to the primitive type LanceDB expects.
  const rows = chunks.map(({ file_path, chunk_index, content, line_start, line_end, vector, mtime }) => ({
    file_path: String(file_path),
    chunk_index: Number(chunk_index),
    content: String(content),
    line_start: Number(line_start),
    line_end: Number(line_end),
    vector,
    mtime: Number(mtime)
  }));

  try {
    let table = null;

    try {
      // Replace the contents of an existing table...
      table = await dbConnection.openTable(tableName);
      await table.overwrite(rows);
    } catch (e) {
      // ...or create it on first use.
      table = await dbConnection.createTable(tableName, rows);
    }

    tableRef = table;
    console.error(`Indexed ${chunks.length} chunks`);
  } catch (e) {
    console.error('Failed to upsert chunks:', e.message);
    throw e;
  }
}
88
+
89
/**
 * Query the vector index for the chunks nearest to queryEmbedding.
 *
 * Logs and returns [] instead of throwing when the store or index is
 * unavailable, or when the search itself fails.
 *
 * @param {number[]|Float32Array} queryEmbedding - query vector
 * @param {number} [limit=10] - maximum number of hits
 * @returns {Promise<Array<object>>} hits with file/line fields, the raw
 *   distance, and a score computed as 1 / (1 + distance)
 */
export async function searchSimilar(queryEmbedding, limit = 10) {
  // Lazily resolve the table handle when no prior call has cached it.
  if (!tableRef) {
    if (!dbConnection) {
      console.error('No database connection');
      return [];
    }
    try {
      await getTable();
    } catch (e) {
      console.error('No index available');
      return [];
    }
  }

  if (!tableRef) {
    console.error('No index available');
    return [];
  }

  try {
    // LanceDB expects a plain array, not a typed array or tensor.
    const queryVector = Array.isArray(queryEmbedding)
      ? queryEmbedding
      : Array.from(queryEmbedding);

    const hits = await tableRef
      .search(queryVector)
      .limit(limit)
      .execute();

    return hits.map((hit) => {
      const distance = hit._distance !== undefined ? hit._distance : (hit.distance || 0);
      const score = distance !== null && distance !== undefined ? 1 / (1 + distance) : 0;
      return {
        file_path: hit.file_path,
        chunk_index: hit.chunk_index,
        content: hit.content,
        line_start: hit.line_start,
        line_end: hit.line_end,
        distance: distance,
        score: score
      };
    });
  } catch (e) {
    console.error('Search failed:', e.message);
    return [];
  }
}
135
+
136
/**
 * Number of rows in the open table; 0 when no table is open or the
 * count fails.
 *
 * @returns {Promise<number>} row count
 */
export async function getRowCount() {
  if (!tableRef) return 0;

  try {
    return await tableRef.countRows();
  } catch (e) {
    // Treat count failures as an empty index.
    return 0;
  }
}
147
+
148
/**
 * Map of already-indexed file paths to mtimes.
 *
 * Currently always resolves to {} — every scan performs a full reindex,
 * which guarantees freshness at the cost of repeated work. A future
 * optimization could persist per-file mtimes in a metadata file.
 *
 * @returns {Promise<Object<string, number>>} always an empty object
 */
export async function getIndexedFiles() {
  if (!tableRef) {
    return {};
  }
  return {};
}
158
+
159
/**
 * Reset the module-level store handles.
 *
 * Embedded LanceDB needs no explicit close; clearing the references just
 * keeps module state tidy between runs.
 */
export async function closeStore() {
  if (dbConnection) {
    dbConnection = null;
    tableRef = null;
  }
}