codebasesearch 0.1.10 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "codebasesearch",
3
- "version": "0.1.10",
3
+ "version": "0.1.11",
4
4
  "description": "Ultra-simple code search tool with Jina embeddings, LanceDB, and MCP protocol support",
5
5
  "type": "module",
6
6
  "bin": {
@@ -20,16 +20,16 @@ async function initializeIndex(repositoryPath) {
20
20
  const chunks = scanRepository(absolutePath, ignorePatterns);
21
21
 
22
22
  if (chunks.length === 0) {
23
- return { error: 'No code chunks found', chunks: [], index: null };
23
+ return { error: 'No code chunks found', chunks: [], indexData: null };
24
24
  }
25
25
 
26
- const index = buildTextIndex(chunks);
27
- const indexData = { chunks, index };
28
- indexCache.set(cacheKey, indexData);
26
+ const indexData = buildTextIndex(chunks);
27
+ const result = { chunks, indexData };
28
+ indexCache.set(cacheKey, result);
29
29
 
30
- return indexData;
30
+ return result;
31
31
  } catch (error) {
32
- return { error: error.message, chunks: [], index: null };
32
+ return { error: error.message, chunks: [], indexData: null };
33
33
  }
34
34
  }
35
35
 
@@ -47,7 +47,7 @@ async function performSearch(repositoryPath, query) {
47
47
  return { error: indexData.error, results: [] };
48
48
  }
49
49
 
50
- const results = searchText(query, indexData.chunks, indexData.index);
50
+ const results = searchText(query, indexData.chunks, indexData.indexData);
51
51
 
52
52
  return {
53
53
  query,
@@ -1,48 +1,129 @@
1
1
  export function buildTextIndex(chunks) {
2
2
  const index = new Map();
3
+ const chunkMetadata = [];
3
4
 
4
5
  chunks.forEach((chunk, idx) => {
5
6
  const tokens = tokenize(chunk.content);
7
+ const symbols = extractSymbols(chunk.content);
8
+ const frequency = new Map();
9
+
6
10
  tokens.forEach(token => {
11
+ frequency.set(token, (frequency.get(token) || 0) + 1);
7
12
  if (!index.has(token)) {
8
13
  index.set(token, []);
9
14
  }
10
15
  index.get(token).push(idx);
11
16
  });
17
+
18
+ chunkMetadata[idx] = {
19
+ tokens,
20
+ symbols,
21
+ frequency,
22
+ isCode: isCodeFile(chunk.file_path),
23
+ };
12
24
  });
13
25
 
14
- return index;
26
+ return { index, chunkMetadata };
15
27
  }
16
28
 
17
/**
 * Ranks chunks against a free-text query.
 *
 * Scoring per chunk:
 *   - each query token present in the chunk adds `frequency * boost`, where
 *     the boost is 1.5 for tokens longer than four characters and 1 otherwise;
 *   - each query symbol that is also one of the chunk's symbols adds 5;
 *   - a chunk whose content contains the raw query string adds 10;
 *   - the total is multiplied by 1.2 when the chunk is flagged as code.
 *
 * @param {string} query - Text to search for.
 * @param {Array<object>} chunks - The chunks the index was built from, in the
 *   same order.
 * @param {{index: Map<string, number[]>, chunkMetadata: Array<object>}} indexData
 *   - Structures returned by `buildTextIndex`.
 * @returns {Array<object>} Matching chunks (copied) with two extra fields:
 *   `score` (raw score / 100, capped at 1) and `_rawScore`; sorted by
 *   `_rawScore`, highest first. Chunks scoring 0 are omitted.
 */
export function searchText(query, chunks, indexData) {
  // A blank query has no tokens, but `content.includes('')` is always true,
  // which would otherwise return every chunk with the exact-match bonus.
  if (typeof query !== 'string' || query.trim() === '') {
    return [];
  }

  const { index, chunkMetadata } = indexData;
  const queryTokens = tokenize(query);
  const querySymbols = extractSymbols(query);

  // Turn each relevant posting list into a Set once, so the per-chunk
  // membership test below is O(1) instead of a linear scan of the list.
  const tokenLookups = queryTokens
    .filter((token) => index.has(token))
    .map((token) => ({
      token,
      chunkIds: new Set(index.get(token)),
      boost: token.length > 4 ? 1.5 : 1,
    }));

  const results = [];

  chunks.forEach((chunk, idx) => {
    const meta = chunkMetadata[idx];
    let score = 0;

    for (const { token, chunkIds, boost } of tokenLookups) {
      if (chunkIds.has(idx)) {
        score += boost * (meta.frequency.get(token) || 1);
      }
    }

    for (const symbol of querySymbols) {
      if (meta.symbols.includes(symbol)) {
        score += 5;
      }
    }

    if (chunk.content.includes(query)) {
      score += 10;
    }

    if (meta.isCode) {
      score *= 1.2;
    }

    if (score > 0) {
      results.push({
        ...chunk,
        score: Math.min(score / 100, 1),
        _rawScore: score,
      });
    }
  });

  // Sort on the uncapped value so results that hit the cap of 1 still order
  // correctly among themselves.
  return results.sort((a, b) => b._rawScore - a._rawScore);
}
43
79
 
44
80
  function tokenize(text) {
45
- return text
46
- .toLowerCase()
47
- .match(/\b\w+\b/g) || [];
81
+ const tokens = new Set();
82
+
83
+ text.toLowerCase().split(/\s+/).forEach(word => {
84
+ if (word.length === 0) return;
85
+
86
+ tokens.add(word.replace(/[^\w]/g, ''));
87
+
88
+ const camelCaseTokens = word.match(/[a-z]+|[A-Z][a-z]*|[0-9]+/g) || [];
89
+ camelCaseTokens.forEach(t => {
90
+ if (t.length > 1) tokens.add(t.toLowerCase());
91
+ });
92
+
93
+ const snakeCaseTokens = word.split(/[-_]/).filter(t => t.length > 0);
94
+ snakeCaseTokens.forEach(t => {
95
+ if (t.length > 1) tokens.add(t.toLowerCase());
96
+ });
97
+ });
98
+
99
+ return Array.from(tokens).filter(t => t.length > 1);
100
+ }
101
+
102
/**
 * Extracts the names of functions and classes declared in a piece of text.
 *
 * Recognised forms:
 *   - `function name` and `async function name` (exported or not);
 *   - `const name = (` and `const name = async (`;
 *   - `class Name` (exported or not).
 *
 * @param {string} text - Source text to scan; any non-string yields `[]`.
 * @returns {string[]} Unique lower-cased names, in first-seen order
 *   (function-style declarations first, then classes).
 */
function extractSymbols(text) {
  if (typeof text !== 'string') {
    return [];
  }

  const symbols = new Set();

  // Read the name from the capture groups directly: group 1 is set for
  // `function name`, group 2 for `const name = (`.
  const functionPattern =
    /(?:async\s+)?function\s+(\w+)|const\s+(\w+)\s*=\s*(?:async\s*)?\(/g;
  for (const match of text.matchAll(functionPattern)) {
    const name = match[1] ?? match[2];
    if (name) {
      symbols.add(name.toLowerCase());
    }
  }

  // `export function` / `export class` need no separate pass: the text after
  // `export` is already matched by the function and class patterns.
  for (const match of text.matchAll(/class\s+(\w+)/g)) {
    symbols.add(match[1].toLowerCase());
  }

  return Array.from(symbols);
}
125
+
126
/**
 * Reports whether a path ends with one of the recognised source-code
 * extensions. The comparison ignores letter case.
 *
 * @param {string} filePath - Path or file name to check.
 * @returns {boolean} `true` when the path ends with a listed extension;
 *   `false` otherwise, including when `filePath` is not a string.
 */
function isCodeFile(filePath) {
  // A chunk without a path is treated as "not code" instead of throwing.
  if (typeof filePath !== 'string') {
    return false;
  }

  const codeExtensions = ['.js', '.ts', '.jsx', '.tsx', '.py', '.java', '.go', '.rs', '.rb'];
  const normalizedPath = filePath.toLowerCase();
  return codeExtensions.some((ext) => normalizedPath.endsWith(ext));
}