@softerist/heuristic-mcp 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ARCHITECTURE.md +287 -0
- package/CONTRIBUTING.md +308 -0
- package/LICENSE +21 -0
- package/README.md +249 -0
- package/config.json +66 -0
- package/example.png +0 -0
- package/features/clear-cache.js +75 -0
- package/features/find-similar-code.js +127 -0
- package/features/hybrid-search.js +173 -0
- package/features/index-codebase.js +811 -0
- package/how-its-works.png +0 -0
- package/index.js +208 -0
- package/lib/cache.js +163 -0
- package/lib/config.js +257 -0
- package/lib/embedding-worker.js +67 -0
- package/lib/ignore-patterns.js +314 -0
- package/lib/project-detector.js +75 -0
- package/lib/tokenizer.js +142 -0
- package/lib/utils.js +301 -0
- package/package.json +65 -0
- package/scripts/clear-cache.js +31 -0
- package/test/clear-cache.test.js +288 -0
- package/test/embedding-model.test.js +230 -0
- package/test/helpers.js +128 -0
- package/test/hybrid-search.test.js +243 -0
- package/test/index-codebase.test.js +246 -0
- package/test/integration.test.js +223 -0
- package/test/tokenizer.test.js +225 -0
- package/vitest.config.js +29 -0
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
// Comprehensive ignore patterns based on industry best practices
|
|
2
|
+
// Researched from gitignore templates and development community standards
|
|
3
|
+
|
|
4
|
+
/**
 * Glob ignore patterns grouped by project type.
 *
 * Every key except `common` names a project type (the values used in
 * FILE_TYPE_MAP); `common` holds patterns that are not tied to one ecosystem.
 * Each list is free of internal duplicates. The same pattern may still appear
 * under several project types, so consumers that merge lists should dedupe.
 */
export const IGNORE_PATTERNS = {
  // JavaScript/Node.js
  javascript: [
    '**/node_modules/**',
    '**/.next/**',
    '**/dist/**',
    '**/build/**',
    '**/.nuxt/**',
    '**/.output/**',
    '**/.vercel/**',
    '**/.netlify/**',
    '**/out/**',
    '**/coverage/**',
    '**/.nyc_output/**',
    '**/npm-debug.log*',
    '**/yarn-debug.log*',
    '**/yarn-error.log*',
    '**/.pnpm-store/**',
    '**/.turbo/**'
  ],

  // Python
  // NOTE(review): '**/lib/**' and '**/var/**' come from the stock Python
  // gitignore template and also match hand-written source directories with
  // those names - confirm this is intended for mixed-language repositories.
  python: [
    '**/__pycache__/**',
    '**/*.pyc',
    '**/*.pyd',
    '**/.Python',
    '**/build/**',
    '**/develop-eggs/**',
    '**/dist/**',
    '**/downloads/**',
    '**/eggs/**',
    '**/.eggs/**',
    '**/lib/**',
    '**/lib64/**',
    '**/parts/**',
    '**/sdist/**',
    '**/var/**',
    '**/*.egg-info/**',
    '**/.installed.cfg',
    '**/*.egg',
    '**/.venv/**',
    '**/venv/**',
    '**/env/**',
    '**/ENV/**',
    '**/.pytest_cache/**',
    '**/htmlcov/**',
    '**/.tox/**',
    '**/.coverage',
    '**/.hypothesis/**',
    '**/.mypy_cache/**',
    '**/.ruff_cache/**'
  ],

  // Java/Maven
  java: [
    '**/target/**',
    '**/.gradle/**',
    '**/build/**',
    '**/.idea/**',
    '**/*.iml',
    '**/out/**',
    '**/gen/**',
    '**/classes/**',
    '**/.classpath',
    '**/.project',
    '**/.settings/**',
    '**/.m2/**',
    '**/*.class',
    '**/*.jar',
    '**/*.war',
    '**/*.ear'
  ],

  // Android
  android: [
    '**/.gradle/**',
    '**/build/**',
    '**/.idea/**',
    '**/*.iml',
    '**/local.properties',
    '**/captures/**',
    '**/.externalNativeBuild/**',
    '**/.cxx/**',
    '**/*.apk',
    '**/*.aar',
    '**/*.ap_',
    '**/*.dex',
    '**/google-services.json',
    '**/gradle-app.setting',
    '**/.navigation/**'
  ],

  // iOS/Swift
  ios: [
    '**/Pods/**',
    '**/DerivedData/**',
    '**/xcuserdata/**',
    '**/*.xcarchive',
    '**/build/**',
    '**/.build/**',
    '**/Packages/**',
    '**/.swiftpm/**',
    '**/Carthage/Build/**',
    '**/fastlane/report.xml',
    '**/fastlane/Preview.html',
    '**/fastlane/screenshots/**',
    '**/fastlane/test_output/**',
    '**/*.moved-aside',
    '**/*.xcuserstate',
    '**/*.hmap',
    '**/*.ipa'
  ],

  // Go
  go: [
    '**/vendor/**',
    '**/bin/**',
    '**/pkg/**',
    '**/*.exe',
    '**/*.test',
    '**/*.prof'
  ],

  // PHP
  php: [
    '**/vendor/**',
    '**/composer.phar',
    '**/composer.lock',
    '**/.phpunit.result.cache'
  ],

  // Rust
  rust: [
    '**/target/**',
    '**/Cargo.lock',
    '**/*.rs.bk'
  ],

  // Ruby
  ruby: [
    '**/vendor/bundle/**',
    '**/.bundle/**',
    '**/Gemfile.lock',
    '**/.byebug_history'
  ],

  // .NET/C#
  dotnet: [
    '**/bin/**',
    '**/obj/**',
    '**/packages/**',
    '**/*.user',
    '**/*.suo',
    '**/.vs/**',
    '**/node_modules/**'
  ],

  // Common (IDE, OS, Build tools)
  common: [
    // Version control
    '**/.git/**',
    '**/.svn/**',
    '**/.hg/**',
    '**/.bzr/**',

    // OS files
    '**/.DS_Store',
    '**/Thumbs.db',
    '**/desktop.ini',
    '**/$RECYCLE.BIN/**',

    // Backup and editor swap files (covers the Vim and Emacs leftovers too)
    '**/*.bak',
    '**/*.backup',
    '**/*~',
    '**/*.swp',
    '**/*.swo',
    '**/*.swn',
    '**/#*#',
    '**/.#*',

    // Lock files. '*.lock' matches every basename ending in ".lock", so it
    // also covers package-manager lockfiles such as yarn.lock, Cargo.lock,
    // Gemfile.lock and composer.lock, not only editor/runtime locks.
    '**/*.lock',
    '**/.~lock*',

    // Logs
    '**/*.log',
    '**/logs/**',
    '**/*.log.*',

    // IDEs and Editors
    '**/.vscode/**',
    '**/.idea/**',
    '**/.sublime-project',
    '**/.sublime-workspace',
    '**/nbproject/**',
    '**/.settings/**',
    '**/.metadata/**',
    '**/.classpath',
    '**/.project',
    '**/.c9/**',
    '**/*.launch',
    '**/*.tmproj',
    '**/*.tmproject',
    '**/tmtags',

    // Vim (the *~, *.swp and *.swo patterns are listed under backups above)
    '**/.*.sw?',
    '**/Session.vim',

    // Environment files (secrets)
    '**/.env',
    '**/.env.local',
    '**/.env.*.local',
    '**/.env.production',
    '**/.env.development',
    '**/.env.test',
    '**/secrets.json',
    '**/secrets.yaml',
    '**/secrets.yml',
    '**/*.key',
    '**/*.pem',
    '**/*.crt',
    '**/*.cer',
    '**/*.p12',
    '**/*.pfx',

    // Temporary files
    '**/tmp/**',
    '**/temp/**',
    '**/*.tmp',
    '**/*.temp',
    '**/.cache/**',

    // Session & runtime
    '**/.sass-cache/**',
    '**/connect.lock',
    '**/*.pid',
    '**/*.seed',
    '**/*.pid.lock',

    // Coverage & test output
    '**/coverage/**',
    '**/.nyc_output/**',
    '**/test-results/**',
    '**/*.cover',
    '**/*.coverage',
    '**/htmlcov/**',

    // Documentation builds
    '**/docs/_build/**',
    '**/site/**',

    // Misc
    // NOTE(review): '**/core' targets core-dump files but also matches any
    // path segment literally named "core" - verify that source directories
    // called core/ are not excluded by the matcher in use.
    '**/*.orig',
    '**/core',
    '**/*.core'
  ]
};
|
|
272
|
+
|
|
273
|
+
// Marker files grouped by the project type they indicate. Group order and
// the order of markers inside each group define the key order of
// FILE_TYPE_MAP. Entries containing '*' are wildcard markers.
const MARKERS_BY_PROJECT_TYPE = [
  ['javascript', ['package.json', 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml']],
  ['python', ['requirements.txt', 'Pipfile', 'pyproject.toml', 'setup.py']],
  ['android', ['build.gradle', 'build.gradle.kts', 'settings.gradle']],
  ['java', ['pom.xml']],
  ['ios', ['Podfile', 'Package.swift']],
  ['go', ['go.mod']],
  ['php', ['composer.json']],
  ['rust', ['Cargo.toml']],
  ['ruby', ['Gemfile']],
  ['dotnet', ['*.csproj', '*.sln']]
];

// Lookup table: marker file name (or wildcard) -> project type
export const FILE_TYPE_MAP = Object.fromEntries(
  MARKERS_BY_PROJECT_TYPE.flatMap(([projectType, markers]) =>
    markers.map((marker) => [marker, projectType])
  )
);
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import fs from "fs/promises";
|
|
2
|
+
import path from "path";
|
|
3
|
+
import { FILE_TYPE_MAP, IGNORE_PATTERNS } from "./ignore-patterns.js";
|
|
4
|
+
|
|
5
|
+
/**
 * Convert a marker-file wildcard such as "*.csproj" into an anchored RegExp.
 *
 * Every `*` becomes `.*`; all other characters are matched literally, so the
 * dot in "*.csproj" only matches a real dot.
 *
 * @param {string} pattern - Marker name containing one or more `*` wildcards
 * @returns {RegExp} Expression that must match a whole file name
 */
function wildcardToRegExp(pattern) {
  const body = pattern
    .split('*')
    .map((literal) => literal.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
    .join('.*');
  return new RegExp(`^${body}$`);
}

/**
 * Detects which project types are present in a directory by looking for the
 * marker files listed in FILE_TYPE_MAP, and derives the matching ignore
 * patterns from IGNORE_PATTERNS.
 */
export class ProjectDetector {
  /**
   * @param {string} searchDirectory - Directory whose direct children are
   *   checked for marker files (no recursion is performed)
   */
  constructor(searchDirectory) {
    this.searchDirectory = searchDirectory;
    this.detectedTypes = new Set();
  }

  /**
   * Check every marker in FILE_TYPE_MAP against the search directory.
   * Results accumulate in `this.detectedTypes` across calls.
   *
   * @returns {Promise<string[]>} Detected project types in detection order
   */
  async detectProjectTypes() {
    // Markers are probed one at a time so log output and the insertion order
    // of detectedTypes follow the key order of FILE_TYPE_MAP.
    for (const marker of Object.keys(FILE_TYPE_MAP)) {
      if (marker.includes('*')) {
        // Wildcard markers like *.csproj need a directory listing
        await this.detectWithWildcard(marker);
      } else {
        await this.detectExactFile(marker);
      }
    }

    return Array.from(this.detectedTypes);
  }

  /**
   * Record the project type for `markerFile` if that file is accessible in
   * the search directory.
   *
   * @param {string} markerFile - Exact marker file name (a FILE_TYPE_MAP key)
   * @returns {Promise<void>}
   */
  async detectExactFile(markerFile) {
    const markerPath = path.join(this.searchDirectory, markerFile);
    try {
      await fs.access(markerPath);
    } catch {
      // File doesn't exist (or is not accessible), continue
      return;
    }

    const projectType = FILE_TYPE_MAP[markerFile];
    this.detectedTypes.add(projectType);
    console.error(`[Detector] Detected ${projectType} project (${markerFile})`);
  }

  /**
   * Record the project type for a wildcard marker if any entry of the search
   * directory matches it. Only the first matching entry is reported.
   *
   * @param {string} pattern - Wildcard marker (a FILE_TYPE_MAP key with `*`)
   * @returns {Promise<void>}
   */
  async detectWithWildcard(pattern) {
    let entries;
    try {
      entries = await fs.readdir(this.searchDirectory);
    } catch {
      // Directory read failed, continue (detection is best-effort)
      return;
    }

    const matcher = wildcardToRegExp(pattern);
    const matchedFile = entries.find((entry) => matcher.test(entry));
    if (matchedFile === undefined) return;

    const projectType = FILE_TYPE_MAP[pattern];
    this.detectedTypes.add(projectType);
    console.error(`[Detector] Detected ${projectType} project (${matchedFile})`);
  }

  /**
   * Build the ignore list for the detected project types.
   *
   * @returns {string[]} The `common` patterns followed by the patterns of
   *   each detected type, with duplicates removed (first occurrence wins)
   */
  getSmartIgnorePatterns() {
    const patterns = [...IGNORE_PATTERNS.common];

    for (const type of this.detectedTypes) {
      // Types without a dedicated pattern list contribute nothing
      if (IGNORE_PATTERNS[type]) {
        patterns.push(...IGNORE_PATTERNS[type]);
      }
    }

    // Remove duplicates
    return [...new Set(patterns)];
  }

  /**
   * @returns {{ detectedTypes: string[], patternCount: number }} Detected
   *   types and the number of distinct ignore patterns they produce
   */
  getSummary() {
    return {
      detectedTypes: Array.from(this.detectedTypes),
      patternCount: this.getSmartIgnorePatterns().length
    };
  }
}
|
package/lib/tokenizer.js
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Token estimation and limits for embedding models
|
|
3
|
+
*
|
|
4
|
+
* This module provides token counting utilities and model-specific limits
|
|
5
|
+
* to ensure text chunks don't exceed the model's maximum sequence length.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
 * Maximum sequence length, in tokens, for each supported embedding model.
 * Keys are model identifiers; the "default" key is the fallback used for
 * models that are not listed.
 */
export const MODEL_TOKEN_LIMITS = Object.fromEntries([
  // Sentence Transformers / MiniLM family
  ["Xenova/all-MiniLM-L6-v2", 256],
  ["Xenova/all-MiniLM-L12-v2", 256],
  ["Xenova/paraphrase-MiniLM-L6-v2", 128],
  ["Xenova/paraphrase-MiniLM-L3-v2", 128],

  // MPNet models
  ["Xenova/all-mpnet-base-v2", 384],
  ["Xenova/paraphrase-mpnet-base-v2", 384],

  // Multilingual models
  ["Xenova/paraphrase-multilingual-MiniLM-L12-v2", 128],
  ["Xenova/paraphrase-multilingual-mpnet-base-v2", 256],

  // Code-specific models
  ["Xenova/codebert-base", 512],
  ["Xenova/graphcodebert-base", 512],

  // E5 models
  ["Xenova/e5-small-v2", 512],
  ["Xenova/e5-base-v2", 512],
  ["Xenova/e5-large-v2", 512],

  // BGE models
  ["Xenova/bge-small-en-v1.5", 512],
  ["Xenova/bge-base-en-v1.5", 512],
  ["Xenova/bge-large-en-v1.5", 512],

  // Default fallback
  ["default", 256]
]);
|
|
44
|
+
|
|
45
|
+
/**
 * Get the maximum token limit for a given model.
 *
 * Lookup order: exact key match, then a case-insensitive match against the
 * keys of MODEL_TOKEN_LIMITS, then the "default" entry. Only the table's own
 * keys are considered, so names such as "toString" or "constructor" fall back
 * to the default instead of resolving to inherited Object members.
 *
 * @param {string} modelName - The model name (e.g., "Xenova/all-MiniLM-L6-v2")
 * @returns {number} Maximum tokens supported by the model; the default limit
 *   when modelName is missing, empty, not a string, or unknown
 */
export function getModelTokenLimit(modelName) {
  const fallbackLimit = MODEL_TOKEN_LIMITS["default"];

  // Anything that is not a non-empty string cannot name a model
  if (typeof modelName !== "string" || modelName.length === 0) {
    return fallbackLimit;
  }

  // Direct match first (fastest); own keys only, never the prototype chain
  if (Object.prototype.hasOwnProperty.call(MODEL_TOKEN_LIMITS, modelName)) {
    return MODEL_TOKEN_LIMITS[modelName];
  }

  // Case-insensitive search
  const normalizedName = modelName.toLowerCase();
  const matchingKey = Object.keys(MODEL_TOKEN_LIMITS).find(
    (key) => key.toLowerCase() === normalizedName
  );

  return matchingKey === undefined ? fallbackLimit : MODEL_TOKEN_LIMITS[matchingKey];
}
|
|
69
|
+
|
|
70
|
+
/**
 * Derive chunk sizing for a model from its token limit.
 *
 * @param {string} modelName - The model name
 * @returns {{ maxTokens: number, targetTokens: number, overlapTokens: number }}
 *   maxTokens is the model limit, targetTokens is 85% of it (rounded down),
 *   and overlapTokens is 18% of targetTokens (rounded down)
 */
export function getChunkingParams(modelName) {
  // Stay below the hard limit to leave a safety buffer
  const TARGET_FRACTION = 0.85;
  // Share of the target carried over between chunks for context continuity
  const OVERLAP_FRACTION = 0.18;

  const limit = getModelTokenLimit(modelName);
  const target = Math.floor(limit * TARGET_FRACTION);
  const overlap = Math.floor(target * OVERLAP_FRACTION);

  return { maxTokens: limit, targetTokens: target, overlapTokens: overlap };
}
|
|
91
|
+
|
|
92
|
+
/**
 * Cheap token-count heuristic for text (tuned for code).
 *
 * The estimate is the sum of:
 *   - 2 for the [CLS] and [SEP] special tokens,
 *   - per whitespace-separated word: 1 (up to 4 chars), 2 (5-10 chars),
 *     or ceil(length / 4) for longer words (subword splitting),
 *   - half of the punctuation/operator characters, rounded down.
 *
 * This is an approximation intended to be faster than running the real
 * tokenizer; use the real tokenizer when exact counts matter.
 *
 * @param {string} text - The text to estimate tokens for
 * @returns {number} Estimated token count (0 for empty or missing text)
 */
export function estimateTokens(text) {
  if (!text) return 0;
  if (text.length === 0) return 0;

  // Whitespace-separated pieces; empty pieces (leading/trailing gaps) are skipped below
  const pieces = text.split(/\s+/);

  // Punctuation and operators that often become separate tokens
  const punctuation = text.match(/[{}()\[\];:,.<>!=+\-*\/%&|^~@#$"'`\\]/g);
  const punctuationCount = punctuation === null ? 0 : punctuation.length;

  const wordTokens = pieces.reduce((total, piece) => {
    const size = piece.length;
    if (size === 0) return total;
    if (size <= 4) return total + 1;
    if (size <= 10) return total + 2;
    // Long words get split into ~4-char subwords
    return total + Math.ceil(size / 4);
  }, 0);

  // 2 = [CLS] and [SEP]; many special chars merge with neighbours, so count ~50%
  return 2 + wordTokens + Math.floor(punctuationCount * 0.5);
}
|
|
131
|
+
|
|
132
|
+
/**
 * Tell whether the estimated token count of a text is above a model's limit.
 *
 * @param {string} text - The text to check
 * @param {string} modelName - The model name
 * @returns {boolean} True only when the estimate is strictly greater than the
 *   limit; an estimate equal to the limit does not exceed it
 */
export function exceedsTokenLimit(text, modelName) {
  const maxAllowed = getModelTokenLimit(modelName);
  const estimated = estimateTokens(text);
  return estimated > maxAllowed;
}
|