@softerist/heuristic-mcp 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,314 @@
1
+ // Comprehensive ignore patterns based on industry best practices
2
+ // Researched from gitignore templates and development community standards
3
+
4
/**
 * Glob patterns for paths that should be ignored, grouped by category.
 *
 * `common` holds patterns that are not tied to one ecosystem (version
 * control, OS files, editors, secrets, temp files, ...). Every other key
 * holds the patterns for one project type.
 *
 * Each list is free of duplicate entries. The same pattern may still appear
 * in more than one list (for example '**' + '/build/**' in several languages).
 */
export const IGNORE_PATTERNS = {
  // JavaScript/Node.js
  javascript: [
    '**/node_modules/**',
    '**/.next/**',
    '**/dist/**',
    '**/build/**',
    '**/.nuxt/**',
    '**/.output/**',
    '**/.vercel/**',
    '**/.netlify/**',
    '**/out/**',
    '**/coverage/**',
    '**/.nyc_output/**',
    '**/npm-debug.log*',
    '**/yarn-debug.log*',
    '**/yarn-error.log*',
    '**/.pnpm-store/**',
    '**/.turbo/**'
  ],

  // Python
  python: [
    '**/__pycache__/**',
    '**/*.pyc',
    '**/*.pyd',
    '**/.Python',
    '**/build/**',
    '**/develop-eggs/**',
    '**/dist/**',
    '**/downloads/**',
    '**/eggs/**',
    '**/.eggs/**',
    '**/lib/**',
    '**/lib64/**',
    '**/parts/**',
    '**/sdist/**',
    '**/var/**',
    '**/*.egg-info/**',
    '**/.installed.cfg',
    '**/*.egg',
    '**/.venv/**',
    '**/venv/**',
    '**/env/**',
    '**/ENV/**',
    '**/.pytest_cache/**',
    '**/htmlcov/**',
    '**/.tox/**',
    '**/.coverage',
    '**/.hypothesis/**',
    '**/.mypy_cache/**',
    '**/.ruff_cache/**'
  ],

  // Java/Maven
  java: [
    '**/target/**',
    '**/.gradle/**',
    '**/build/**',
    '**/.idea/**',
    '**/*.iml',
    '**/out/**',
    '**/gen/**',
    '**/classes/**',
    '**/.classpath',
    '**/.project',
    '**/.settings/**',
    '**/.m2/**',
    '**/*.class',
    '**/*.jar',
    '**/*.war',
    '**/*.ear'
  ],

  // Android
  android: [
    '**/.gradle/**',
    '**/build/**',
    '**/.idea/**',
    '**/*.iml',
    '**/local.properties',
    '**/captures/**',
    '**/.externalNativeBuild/**',
    '**/.cxx/**',
    '**/*.apk',
    '**/*.aar',
    '**/*.ap_',
    '**/*.dex',
    '**/google-services.json',
    '**/gradle-app.setting',
    '**/.navigation/**'
  ],

  // iOS/Swift
  ios: [
    '**/Pods/**',
    '**/DerivedData/**',
    '**/xcuserdata/**',
    '**/*.xcarchive',
    '**/build/**',
    '**/.build/**',
    '**/Packages/**',
    '**/.swiftpm/**',
    '**/Carthage/Build/**',
    '**/fastlane/report.xml',
    '**/fastlane/Preview.html',
    '**/fastlane/screenshots/**',
    '**/fastlane/test_output/**',
    '**/*.moved-aside',
    '**/*.xcuserstate',
    '**/*.hmap',
    '**/*.ipa'
  ],

  // Go
  go: [
    '**/vendor/**',
    '**/bin/**',
    '**/pkg/**',
    '**/*.exe',
    '**/*.test',
    '**/*.prof'
  ],

  // PHP
  php: [
    '**/vendor/**',
    '**/composer.phar',
    '**/composer.lock',
    '**/.phpunit.result.cache'
  ],

  // Rust
  rust: [
    '**/target/**',
    '**/Cargo.lock',
    '**/*.rs.bk'
  ],

  // Ruby
  ruby: [
    '**/vendor/bundle/**',
    '**/.bundle/**',
    '**/Gemfile.lock',
    '**/.byebug_history'
  ],

  // .NET/C#
  dotnet: [
    '**/bin/**',
    '**/obj/**',
    '**/packages/**',
    '**/*.user',
    '**/*.suo',
    '**/.vs/**',
    '**/node_modules/**'
  ],

  // Common (IDE, OS, Build tools)
  common: [
    // Version control
    '**/.git/**',
    '**/.svn/**',
    '**/.hg/**',
    '**/.bzr/**',

    // OS files
    '**/.DS_Store',
    '**/Thumbs.db',
    '**/desktop.ini',
    '**/$RECYCLE.BIN/**',

    // Backup and editor swap/autosave files (covers Vim and Emacs artifacts)
    '**/*.bak',
    '**/*.backup',
    '**/*~',
    '**/*.swp',
    '**/*.swo',
    '**/*.swn',
    '**/#*#',
    '**/.#*',

    // Lock files.
    // NOTE: '*.lock' matches every file ending in ".lock", which includes
    // package-manager lockfiles such as yarn.lock, Cargo.lock, Gemfile.lock
    // and composer.lock, not only editor/runtime locks.
    '**/*.lock',
    '**/.~lock*',

    // Logs
    '**/*.log',
    '**/logs/**',
    '**/*.log.*',

    // IDEs and Editors
    '**/.vscode/**',
    '**/.idea/**',
    // Sublime Text names its files "<name>.sublime-project" and
    // "<name>.sublime-workspace"; the bare dot-prefixed forms are kept for
    // backward compatibility.
    '**/.sublime-project',
    '**/.sublime-workspace',
    '**/*.sublime-project',
    '**/*.sublime-workspace',
    '**/nbproject/**',
    '**/.settings/**',
    '**/.metadata/**',
    '**/.classpath',
    '**/.project',
    '**/.c9/**',
    '**/*.launch',
    '**/*.tmproj',
    '**/*.tmproject',
    '**/tmtags',

    // Vim (swap and backup globs are already listed under "Backup" above)
    '**/.*.sw?',
    '**/Session.vim',

    // Environment files (secrets)
    '**/.env',
    '**/.env.local',
    '**/.env.*.local',
    '**/.env.production',
    '**/.env.development',
    '**/.env.test',
    '**/secrets.json',
    '**/secrets.yaml',
    '**/secrets.yml',
    '**/*.key',
    '**/*.pem',
    '**/*.crt',
    '**/*.cer',
    '**/*.p12',
    '**/*.pfx',

    // Temporary files
    '**/tmp/**',
    '**/temp/**',
    '**/*.tmp',
    '**/*.temp',
    '**/.cache/**',

    // Session & runtime
    '**/.sass-cache/**',
    '**/connect.lock',
    '**/*.pid',
    '**/*.seed',
    '**/*.pid.lock',

    // Coverage & test output
    '**/coverage/**',
    '**/.nyc_output/**',
    '**/test-results/**',
    '**/*.cover',
    '**/*.coverage',
    '**/htmlcov/**',

    // Documentation builds
    '**/docs/_build/**',
    '**/site/**',

    // Misc
    '**/*.orig',
    '**/core',
    '**/*.core'
  ]
};
272
+
273
/**
 * Lookup from marker file name to project type.
 *
 * Keys are either exact file names or single-wildcard patterns such as
 * '*.csproj'. The table is assembled from per-type groups; key order follows
 * the group order below.
 */
export const FILE_TYPE_MAP = Object.fromEntries(
  [
    // JavaScript/Node
    ['javascript', ['package.json', 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml']],
    // Python
    ['python', ['requirements.txt', 'Pipfile', 'pyproject.toml', 'setup.py']],
    // Android
    ['android', ['build.gradle', 'build.gradle.kts', 'settings.gradle']],
    // Java
    ['java', ['pom.xml']],
    // iOS
    ['ios', ['Podfile', 'Package.swift']],
    // Go
    ['go', ['go.mod']],
    // PHP
    ['php', ['composer.json']],
    // Rust
    ['rust', ['Cargo.toml']],
    // Ruby
    ['ruby', ['Gemfile']],
    // .NET
    ['dotnet', ['*.csproj', '*.sln']]
  ].flatMap(([projectType, markers]) => markers.map((marker) => [marker, projectType]))
);
@@ -0,0 +1,75 @@
1
+ import fs from "fs/promises";
2
+ import path from "path";
3
+ import { FILE_TYPE_MAP, IGNORE_PATTERNS } from "./ignore-patterns.js";
4
+
5
/**
 * Convert a marker pattern containing `*` wildcards into an anchored RegExp.
 * Every literal segment is escaped so characters such as `.` match only
 * themselves, and every `*` (not just the first) becomes `.*`.
 *
 * @param {string} pattern - Marker pattern, e.g. "*.csproj"
 * @returns {RegExp} Expression that matches a whole file name
 */
function wildcardToRegExp(pattern) {
  const escapedSegments = pattern
    .split('*')
    .map((segment) => segment.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
  return new RegExp('^' + escapedSegments.join('.*') + '$');
}

/**
 * Detects which project types are present in a directory by looking for the
 * marker files listed in FILE_TYPE_MAP, and derives the ignore patterns that
 * apply to the detected types.
 */
export class ProjectDetector {
  /**
   * @param {string} searchDirectory - Directory whose direct entries are
   *   checked for marker files
   */
  constructor(searchDirectory) {
    this.searchDirectory = searchDirectory;
    this.detectedTypes = new Set();
  }

  /**
   * Check every marker in FILE_TYPE_MAP against the search directory.
   * Detected types accumulate in `this.detectedTypes`.
   *
   * @returns {Promise<string[]>} Detected project types, in detection order
   */
  async detectProjectTypes() {
    const markerFiles = Object.keys(FILE_TYPE_MAP);

    for (const marker of markerFiles) {
      // Handle wildcard patterns like *.csproj
      if (marker.includes('*')) {
        await this.detectWithWildcard(marker);
      } else {
        await this.detectExactFile(marker);
      }
    }

    return Array.from(this.detectedTypes);
  }

  /**
   * Record the project type for `markerFile` if that file is accessible in
   * the search directory. An inaccessible file is treated as "not present".
   *
   * @param {string} markerFile - Exact marker file name (a FILE_TYPE_MAP key)
   * @returns {Promise<void>}
   */
  async detectExactFile(markerFile) {
    const markerPath = path.join(this.searchDirectory, markerFile);
    try {
      await fs.access(markerPath);
      const projectType = FILE_TYPE_MAP[markerFile];
      this.detectedTypes.add(projectType);
      console.error(`[Detector] Detected ${projectType} project (${markerFile})`);
    } catch {
      // File doesn't exist, continue
    }
  }

  /**
   * Record the project type for a wildcard marker if any entry of the search
   * directory matches it. Only the first matching entry is reported.
   *
   * @param {string} pattern - Wildcard marker (a FILE_TYPE_MAP key), e.g. "*.sln"
   * @returns {Promise<void>}
   */
  async detectWithWildcard(pattern) {
    try {
      const files = await fs.readdir(this.searchDirectory);
      // Escaped conversion: "*.csproj" must not match names like "notacsproj".
      const regex = wildcardToRegExp(pattern);
      const match = files.find((file) => regex.test(file));

      if (match !== undefined) {
        const projectType = FILE_TYPE_MAP[pattern];
        this.detectedTypes.add(projectType);
        console.error(`[Detector] Detected ${projectType} project (${match})`);
      }
    } catch {
      // Directory read failed, continue
    }
  }

  /**
   * Build the ignore list for the detected project types: the `common`
   * patterns followed by the patterns of each detected type, de-duplicated
   * with first occurrences kept.
   *
   * @returns {string[]} Unique glob patterns
   */
  getSmartIgnorePatterns() {
    const patterns = [...IGNORE_PATTERNS.common];

    for (const type of this.detectedTypes) {
      if (IGNORE_PATTERNS[type]) {
        patterns.push(...IGNORE_PATTERNS[type]);
      }
    }

    // Remove duplicates
    return [...new Set(patterns)];
  }

  /**
   * @returns {{ detectedTypes: string[], patternCount: number }} Detected
   *   types and the number of unique ignore patterns they produce
   */
  getSummary() {
    return {
      detectedTypes: Array.from(this.detectedTypes),
      patternCount: this.getSmartIgnorePatterns().length
    };
  }
}
@@ -0,0 +1,142 @@
1
+ /**
2
+ * Token estimation and limits for embedding models
3
+ *
4
+ * This module provides token counting utilities and model-specific limits
5
+ * to ensure text chunks don't exceed the model's maximum sequence length.
6
+ */
7
+
8
/**
 * Maximum sequence length, in tokens, for each supported embedding model.
 * The "default" entry is the value used when a model is not listed.
 */
export const MODEL_TOKEN_LIMITS = Object.fromEntries([
  // Sentence Transformers / MiniLM family
  ["Xenova/all-MiniLM-L6-v2", 256],
  ["Xenova/all-MiniLM-L12-v2", 256],
  ["Xenova/paraphrase-MiniLM-L6-v2", 128],
  ["Xenova/paraphrase-MiniLM-L3-v2", 128],

  // MPNet models
  ["Xenova/all-mpnet-base-v2", 384],
  ["Xenova/paraphrase-mpnet-base-v2", 384],

  // Multilingual models
  ["Xenova/paraphrase-multilingual-MiniLM-L12-v2", 128],
  ["Xenova/paraphrase-multilingual-mpnet-base-v2", 256],

  // Code-specific models
  ["Xenova/codebert-base", 512],
  ["Xenova/graphcodebert-base", 512],

  // E5 models
  ["Xenova/e5-small-v2", 512],
  ["Xenova/e5-base-v2", 512],
  ["Xenova/e5-large-v2", 512],

  // BGE models
  ["Xenova/bge-small-en-v1.5", 512],
  ["Xenova/bge-base-en-v1.5", 512],
  ["Xenova/bge-large-en-v1.5", 512],

  // Default fallback
  ["default", 256]
]);
44
+
45
/**
 * Get the maximum token limit for a given model.
 *
 * Lookup order: exact key, then a case-insensitive match, then the "default"
 * entry. Only the table's own keys are considered, so names that collide
 * with inherited object members ("toString", "constructor", "__proto__")
 * resolve to the default instead of leaking a non-numeric value.
 *
 * @param {string} modelName - The model name (e.g., "Xenova/all-MiniLM-L6-v2")
 * @returns {number} Maximum tokens supported by the model; the default limit
 *   when the name is missing, empty, not a string, or unknown
 */
export function getModelTokenLimit(modelName) {
  const fallback = MODEL_TOKEN_LIMITS["default"];

  // Anything that is not a non-empty string cannot name a model.
  if (typeof modelName !== "string" || modelName.length === 0) {
    return fallback;
  }

  // Direct match first (fastest); own keys only, never the prototype chain.
  if (Object.prototype.hasOwnProperty.call(MODEL_TOKEN_LIMITS, modelName)) {
    return MODEL_TOKEN_LIMITS[modelName];
  }

  // Case-insensitive search
  const normalizedName = modelName.toLowerCase();
  for (const [key, value] of Object.entries(MODEL_TOKEN_LIMITS)) {
    if (key.toLowerCase() === normalizedName) {
      return value;
    }
  }

  return fallback;
}
69
+
70
/**
 * Derive chunk sizing from a model's token limit.
 *
 * - maxTokens: the limit reported by getModelTokenLimit
 * - targetTokens: 85% of maxTokens, rounded down (safety buffer)
 * - overlapTokens: 18% of targetTokens, rounded down (context continuity)
 *
 * @param {string} modelName - The model name
 * @returns {{ maxTokens: number, targetTokens: number, overlapTokens: number }}
 */
export function getChunkingParams(modelName) {
  const limit = getModelTokenLimit(modelName);
  const target = Math.floor(limit * 0.85);
  const overlap = Math.floor(target * 0.18);

  return { maxTokens: limit, targetTokens: target, overlapTokens: overlap };
}
91
+
92
/**
 * Heuristic token estimate for a piece of text, with no tokenizer involved.
 *
 * The estimate is the sum of:
 *  - 2 for the [CLS] and [SEP] special tokens,
 *  - a per-word cost for each whitespace-separated word
 *    (1 for up to 4 chars, 2 for up to 10 chars, otherwise one per ~4 chars),
 *  - half the number of punctuation/operator characters, rounded down,
 *    since many of them merge with adjacent tokens.
 *
 * @param {string} text - The text to estimate tokens for
 * @returns {number} Estimated token count (0 for empty or missing text)
 */
export function estimateTokens(text) {
  const isEmpty = !text || text.length === 0;
  if (isEmpty) {
    return 0;
  }

  // Per-word cost; empty fragments produced by the split contribute nothing.
  const wordTokens = text.split(/\s+/).reduce((sum, word) => {
    const size = word.length;
    if (size === 0) return sum;
    if (size <= 4) return sum + 1;
    if (size <= 10) return sum + 2;
    // Long words get split into ~4-char subwords
    return sum + Math.ceil(size / 4);
  }, 0);

  // Special characters/punctuation that often become separate tokens
  const punctuation = text.match(/[{}()\[\];:,.<>!=+\-*\/%&|^~@#$"'`\\]/g);
  const punctuationTokens = Math.floor((punctuation ? punctuation.length : 0) * 0.5);

  // 2 accounts for [CLS] and [SEP]
  return 2 + wordTokens + punctuationTokens;
}
131
+
132
/**
 * Tell whether the estimated token count of `text` is strictly greater than
 * the token limit of `modelName`.
 *
 * @param {string} text - The text to check
 * @param {string} modelName - The model name
 * @returns {boolean} True if the text exceeds the limit
 */
export function exceedsTokenLimit(text, modelName) {
  const maxAllowed = getModelTokenLimit(modelName);
  return estimateTokens(text) > maxAllowed;
}