claude-brain 0.30.2 → 0.30.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +241 -191
- package/VERSION +1 -1
- package/assets/CLAUDE-unified.md +11 -11
- package/assets/CLAUDE.md +29 -29
- package/package.json +7 -3
- package/packs/backend/node.json +173 -173
- package/packs/core/javascript.json +176 -176
- package/packs/core/typescript.json +222 -222
- package/packs/frontend/react.json +254 -254
- package/packs/meta/testing.json +172 -172
- package/scripts/postinstall.mjs +531 -531
- package/src/automation/decision-detector.ts +452 -452
- package/src/automation/phase12-manager.ts +456 -456
- package/src/automation/proactive-recall.ts +373 -373
- package/src/automation/project-detector.ts +310 -310
- package/src/automation/repo-scanner.ts +210 -205
- package/src/cli/auto-setup.ts +75 -75
- package/src/cli/auto-start.ts +266 -266
- package/src/cli/bin.ts +264 -264
- package/src/cli/commands/autostart.ts +90 -90
- package/src/cli/commands/chroma.ts +578 -577
- package/src/cli/commands/export-training.ts +70 -70
- package/src/cli/commands/export.ts +130 -130
- package/src/cli/commands/git-hook.ts +183 -183
- package/src/cli/commands/hooks.ts +217 -217
- package/src/cli/commands/init.ts +123 -123
- package/src/cli/commands/install-mcp.ts +122 -111
- package/src/cli/commands/models.ts +979 -979
- package/src/cli/commands/pack.ts +200 -200
- package/src/cli/commands/refresh.ts +344 -339
- package/src/cli/commands/reindex.ts +120 -120
- package/src/cli/commands/serve.ts +466 -463
- package/src/cli/commands/start.ts +44 -44
- package/src/cli/commands/status.ts +220 -203
- package/src/cli/commands/uninstall-mcp.ts +45 -41
- package/src/cli/commands/update.ts +130 -124
- package/src/cli/migrate-chroma.ts +106 -106
- package/src/cli/ui/animations.ts +80 -80
- package/src/cli/ui/components.ts +82 -82
- package/src/cli/ui/index.ts +4 -4
- package/src/cli/ui/logo.ts +36 -36
- package/src/cli/ui/theme.ts +55 -55
- package/src/code-intelligence/indexer.ts +352 -352
- package/src/code-intelligence/linker.ts +178 -178
- package/src/code-intelligence/parser.ts +484 -484
- package/src/code-intelligence/query.ts +291 -291
- package/src/code-intelligence/schema.ts +83 -83
- package/src/code-intelligence/types.ts +95 -95
- package/src/config/defaults.ts +52 -52
- package/src/config/home.ts +56 -56
- package/src/config/index.ts +5 -5
- package/src/config/loader.ts +192 -192
- package/src/config/schema.ts +446 -415
- package/src/config/validator.ts +182 -182
- package/src/context/assembler.ts +407 -400
- package/src/context/index.ts +79 -79
- package/src/context/progress-tracker.ts +174 -174
- package/src/context/standards-manager.ts +287 -287
- package/src/context/validator.ts +58 -58
- package/src/diagnostics/index.ts +122 -121
- package/src/health/index.ts +233 -232
- package/src/hooks/brain-hook.ts +134 -131
- package/src/hooks/capture.ts +168 -168
- package/src/hooks/claude-code-mastery.md +112 -112
- package/src/hooks/context-hook.ts +260 -245
- package/src/hooks/deduplicator.ts +72 -72
- package/src/hooks/git-capture.ts +109 -109
- package/src/hooks/git-hook-installer.ts +211 -207
- package/src/hooks/index.ts +20 -20
- package/src/hooks/installer.ts +306 -288
- package/src/hooks/interceptor-hook.ts +204 -201
- package/src/hooks/passive-classifier.ts +397 -397
- package/src/hooks/queue.ts +160 -129
- package/src/hooks/session-tracker.ts +312 -312
- package/src/hooks/types.ts +52 -52
- package/src/index.ts +7 -7
- package/src/intelligence/cross-project/generalizer.ts +283 -283
- package/src/intelligence/cross-project/index.ts +7 -7
- package/src/intelligence/hf-downloader.ts +222 -222
- package/src/intelligence/hf-manifest.json +78 -78
- package/src/intelligence/index.ts +24 -24
- package/src/intelligence/inference-router.ts +762 -762
- package/src/intelligence/model-manager.ts +263 -245
- package/src/intelligence/optimization/index.ts +10 -10
- package/src/intelligence/optimization/precompute.ts +202 -202
- package/src/intelligence/optimization/semantic-cache.ts +213 -207
- package/src/intelligence/prediction/index.ts +7 -7
- package/src/intelligence/prediction/recommender.ts +276 -268
- package/src/intelligence/reasoning/chain-retrieval.ts +243 -247
- package/src/intelligence/reasoning/index.ts +7 -7
- package/src/intelligence/temporal/evolution.ts +193 -197
- package/src/intelligence/temporal/index.ts +16 -16
- package/src/intelligence/temporal/query-processor.ts +190 -190
- package/src/intelligence/temporal/timeline.ts +272 -259
- package/src/intelligence/temporal/trends.ts +263 -263
- package/src/intelligence/tokenizer.ts +118 -118
- package/src/knowledge/entity-extractor.ts +447 -443
- package/src/knowledge/graph/builder.ts +185 -185
- package/src/knowledge/graph/linker.ts +201 -201
- package/src/knowledge/graph/memory-graph.ts +359 -359
- package/src/knowledge/graph/schema.ts +99 -99
- package/src/knowledge/graph/search.ts +166 -166
- package/src/knowledge/relationship-extractor.ts +108 -108
- package/src/memory/chroma/client.ts +211 -192
- package/src/memory/chroma/collection-manager.ts +92 -92
- package/src/memory/chroma/config.ts +57 -57
- package/src/memory/chroma/embeddings.ts +177 -175
- package/src/memory/chroma/index.ts +82 -82
- package/src/memory/chroma/migration.ts +270 -270
- package/src/memory/chroma/schemas.ts +69 -69
- package/src/memory/chroma/search.ts +319 -315
- package/src/memory/chroma/store.ts +755 -747
- package/src/memory/compression.ts +121 -121
- package/src/memory/consolidation/archiver.ts +162 -165
- package/src/memory/consolidation/merger.ts +182 -186
- package/src/memory/consolidation/scorer.ts +136 -136
- package/src/memory/database.ts +9 -0
- package/src/memory/dual-write.ts +145 -0
- package/src/memory/embeddings.ts +226 -226
- package/src/memory/episodic/detector.ts +108 -108
- package/src/memory/episodic/manager.ts +347 -351
- package/src/memory/episodic/summarizer.ts +179 -179
- package/src/memory/episodic/types.ts +52 -52
- package/src/memory/fts5-search.ts +692 -633
- package/src/memory/index.ts +943 -1060
- package/src/memory/migrations/add-fts5.ts +118 -108
- package/src/memory/patterns.ts +438 -438
- package/src/memory/pruning.ts +60 -60
- package/src/memory/schema.ts +88 -88
- package/src/memory/store.ts +911 -787
- package/src/orchestrator/handlers/decision-handler.ts +204 -204
- package/src/packs/index.ts +9 -9
- package/src/packs/loader.ts +134 -134
- package/src/packs/manager.ts +204 -204
- package/src/packs/ranker.ts +78 -78
- package/src/packs/types.ts +81 -81
- package/src/phase12/index.ts +5 -5
- package/src/retrieval/bm25/index.ts +300 -297
- package/src/retrieval/bm25/tokenizer.ts +184 -184
- package/src/retrieval/feedback/adaptive.ts +221 -221
- package/src/retrieval/feedback/index.ts +16 -16
- package/src/retrieval/feedback/metrics.ts +221 -221
- package/src/retrieval/feedback/store.ts +283 -283
- package/src/retrieval/fusion/index.ts +194 -194
- package/src/retrieval/fusion/rrf.ts +165 -165
- package/src/retrieval/index.ts +12 -12
- package/src/retrieval/pipeline.ts +375 -375
- package/src/retrieval/query/expander.ts +203 -203
- package/src/retrieval/query/index.ts +27 -27
- package/src/retrieval/query/intent-classifier.ts +252 -252
- package/src/retrieval/query/temporal-parser.ts +295 -295
- package/src/retrieval/reranker/index.ts +189 -188
- package/src/retrieval/reranker/model.ts +99 -95
- package/src/retrieval/service.ts +125 -125
- package/src/retrieval/types.ts +162 -162
- package/src/routing/entity-extractor.ts +454 -454
- package/src/routing/handlers/exploration-handler.ts +369 -0
- package/src/routing/handlers/index.ts +19 -0
- package/src/routing/handlers/memory-handler.ts +273 -0
- package/src/routing/handlers/mutation-handler.ts +241 -0
- package/src/routing/handlers/recall-handler.ts +642 -0
- package/src/routing/handlers/shared.ts +515 -0
- package/src/routing/handlers/types.ts +48 -0
- package/src/routing/intent-classifier.ts +552 -552
- package/src/routing/response-filter.ts +399 -391
- package/src/routing/router.ts +245 -2193
- package/src/routing/search-engine.ts +521 -514
- package/src/routing/types.ts +104 -94
- package/src/scripts/health-check.ts +118 -118
- package/src/scripts/setup.ts +122 -122
- package/src/server/auto-updater.ts +283 -276
- package/src/server/handlers/call-tool.ts +159 -159
- package/src/server/handlers/list-tools.ts +35 -35
- package/src/server/handlers/tools/auto-remember.ts +165 -165
- package/src/server/handlers/tools/brain.ts +86 -86
- package/src/server/handlers/tools/create-project.ts +135 -135
- package/src/server/handlers/tools/get-code-standards.ts +123 -123
- package/src/server/handlers/tools/get-corrections.ts +152 -152
- package/src/server/handlers/tools/get-patterns.ts +156 -156
- package/src/server/handlers/tools/get-project-context.ts +75 -75
- package/src/server/handlers/tools/index.ts +30 -30
- package/src/server/handlers/tools/init-project.ts +756 -756
- package/src/server/handlers/tools/list-projects.ts +126 -126
- package/src/server/handlers/tools/recall-similar.ts +87 -87
- package/src/server/handlers/tools/recognize-pattern.ts +132 -132
- package/src/server/handlers/tools/record-correction.ts +131 -131
- package/src/server/handlers/tools/remember-decision.ts +168 -168
- package/src/server/handlers/tools/schemas.ts +179 -179
- package/src/server/handlers/tools/search-code.ts +122 -122
- package/src/server/handlers/tools/smart-context.ts +146 -146
- package/src/server/handlers/tools/update-progress.ts +131 -131
- package/src/server/http-api.ts +215 -1229
- package/src/server/mcp-proxy.ts +85 -84
- package/src/server/mcp-server.ts +285 -284
- package/src/server/middleware/auth.ts +39 -0
- package/src/server/middleware/error-handler.ts +37 -0
- package/src/server/middleware/rate-limit.ts +53 -0
- package/src/server/middleware/validate.ts +42 -0
- package/src/server/pid-manager.ts +137 -136
- package/src/server/providers/resources.ts +581 -581
- package/src/server/routes/code.ts +228 -0
- package/src/server/routes/context.ts +26 -0
- package/src/server/routes/health.ts +19 -0
- package/src/server/routes/helpers.ts +100 -0
- package/src/server/routes/hooks.ts +197 -0
- package/src/server/routes/mcp.ts +47 -0
- package/src/server/routes/memory.ts +397 -0
- package/src/server/routes/models.ts +96 -0
- package/src/server/routes/projects.ts +89 -0
- package/src/server/routes/types.ts +21 -0
- package/src/server/schemas/api-schemas.ts +202 -0
- package/src/server/services.ts +720 -720
- package/src/server/utils/memory-indicator.ts +84 -84
- package/src/server/utils/response-formatter.ts +129 -129
- package/src/server/web-viewer.ts +1145 -1115
- package/src/setup/index.ts +38 -38
- package/src/tools/registry.ts +115 -115
- package/src/tools/schemas.ts +666 -666
- package/src/tools/types.ts +412 -412
- package/src/training/data-store.ts +320 -298
- package/src/training/retrain-pipeline.ts +399 -394
- package/src/utils/error-handler.ts +136 -136
- package/src/utils/index.ts +58 -58
- package/src/utils/kill-port.ts +55 -53
- package/src/utils/phase12-helper.ts +56 -56
- package/src/utils/safe-path.ts +43 -0
- package/src/utils/timing.ts +47 -47
- package/src/utils/transaction.ts +63 -63
- package/src/vault/index.ts +4 -3
- package/src/vault/paths.ts +106 -106
- package/src/vault/query.ts +4 -1
- package/src/vault/reader.ts +44 -1
- package/src/vault/watcher.ts +24 -1
- package/src/vault/writer.ts +487 -413
- package/skills/persistent-memory/SKILL.md +0 -148
- package/skills/persistent-memory/references/tool-reference.md +0 -90
|
@@ -1,184 +1,184 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Text Tokenizer for BM25
|
|
3
|
-
* Handles text preprocessing for sparse search
|
|
4
|
-
*/
|
|
5
|
-
|
|
6
|
-
// English function words that carry no retrieval signal. tokenize() drops
// any token found here when `removeStopwords` is enabled. The entries 's',
// 't' and 'don' are the pieces that contractions such as "don't" break into,
// because the tokenizer's word pattern does not match apostrophes.
const STOPWORDS = new Set<string>(
  [
    'a an the and or but in on at to for',
    'of with by from as is was are were been',
    'be have has had do does did will would could',
    'should may might must shall can need dare ought',
    'used it its this that these those i me my',
    'myself we our ours ourselves you your yours',
    'yourself yourselves he him his himself she her',
    'hers herself they them their theirs themselves',
    'what which who whom when where why how all',
    'each every both few more most other some such',
    'no nor not only own same so than too very',
    's t just don now then here there also'
  ]
    .join(' ')
    .split(' ')
)
|
|
21
|
-
|
|
22
|
-
// Technical acronyms and tool names that tokenize() returns untouched
// instead of passing them through the stemmer.
const KEEP_TERMS = new Set<string>(
  [
    'api sql css html json xml http https url uri',
    'jwt oauth rest graphql grpc tcp udp ip dns',
    'aws gcp azure docker kubernetes k8s npm yarn pnpm',
    'git github gitlab ci cd devops mlops db orm',
    'ui ux cli gui ide sdk mcp llm ai ml'
  ]
    .join(' ')
    .split(' ')
)
|
|
30
|
-
|
|
31
|
-
/**
 * Switches controlling how tokenize() turns raw text into tokens.
 * Every field is optional; omitted fields take the module defaults.
 */
export interface TokenizerOptions {
  /** Tokens with fewer characters than this are discarded (default 2). */
  minLength?: number

  /** Tokens with more characters than this are discarded (default 50). */
  maxLength?: number

  /** Discard tokens that appear in the stopword list (default true). */
  removeStopwords?: boolean

  /** Lowercase the text before it is split into tokens (default true). */
  lowercase?: boolean

  /** Strip common English suffixes from each token (default true). */
  stemming?: boolean

  /** Break identifiers at lower-to-upper case boundaries, e.g. `fooBar` (default true). */
  splitCamelCase?: boolean
}
|
|
45
|
-
|
|
46
|
-
// Baseline settings that tokenize() merges underneath caller-supplied options.
// Typed as fully populated and read-only: every field is guaranteed present,
// so consumers do not need non-null assertions, and the shared object cannot
// be reassigned field-by-field without a compile error.
const DEFAULT_OPTIONS: Readonly<Required<TokenizerOptions>> = {
  minLength: 2,
  maxLength: 50,
  removeStopwords: true,
  lowercase: true,
  stemming: true,
  splitCamelCase: true
}
|
|
54
|
-
|
|
55
|
-
/**
 * Tokenize text for BM25 indexing and search.
 *
 * Pipeline: optional camelCase splitting, optional lowercasing, splitting
 * into word tokens, optional stemming (skipped for KEEP_TERMS), then
 * filtering by length, stopword list and "digits only".
 *
 * @param text - Raw input. Empty or non-string input yields `[]`.
 * @param options - Overrides for the defaults. A field that is omitted or
 *   explicitly `undefined` falls back to its default value.
 * @returns Tokens in input order; duplicates are preserved.
 */
export function tokenize(text: string, options: TokenizerOptions = {}): string[] {
  if (!text || typeof text !== 'string') {
    return []
  }

  // Resolve every option after the merge: with a plain spread an explicit
  // `undefined` replaces the default, which silently disabled the length
  // bounds and the boolean switches. The last operand of each chain only
  // satisfies the type checker; the defaults are always populated.
  const merged = { ...DEFAULT_OPTIONS, ...options }
  const minLength = merged.minLength ?? DEFAULT_OPTIONS.minLength ?? 0
  const maxLength =
    merged.maxLength ?? DEFAULT_OPTIONS.maxLength ?? Number.POSITIVE_INFINITY
  const removeStopwords =
    merged.removeStopwords ?? DEFAULT_OPTIONS.removeStopwords ?? false
  const lowercase = merged.lowercase ?? DEFAULT_OPTIONS.lowercase ?? false
  const stemming = merged.stemming ?? DEFAULT_OPTIONS.stemming ?? false
  const splitCamelCase =
    merged.splitCamelCase ?? DEFAULT_OPTIONS.splitCamelCase ?? false

  let processedText = text
  if (splitCamelCase) {
    // Insert a space at each lower-to-upper boundary: "fooBar" -> "foo Bar"
    processedText = processedText.replace(/([a-z])([A-Z])/g, '$1 $2')
  }
  if (lowercase) {
    processedText = processedText.toLowerCase()
  }

  // Words and numbers, with hyphen-joined compounds kept as a single token.
  // `\w` already covers `_`, so snake_case identifiers also stay whole.
  const rawTokens = processedText.match(/[\w]+(?:[-_][\w]+)*/g) ?? []
  const digitsOnly = /^\d+$/

  const tokens: string[] = []
  for (const raw of rawTokens) {
    const surface = raw.toLowerCase()

    // Check stopwords on the unstemmed form first. Stemming turns "this"
    // into "thi" and "ourselves" into "ourselv", which a post-stem lookup
    // alone would let through.
    if (removeStopwords && STOPWORDS.has(surface)) continue

    // Programming terms are kept intact; everything else may be stemmed.
    const token = stemming && !KEEP_TERMS.has(surface) ? basicStem(raw) : raw

    if (token.length < minLength || token.length > maxLength) continue

    // A non-stopword can still stem into one (e.g. "needs" -> "need").
    if (removeStopwords && STOPWORDS.has(token.toLowerCase())) continue

    // Drop pure numbers but keep alphanumeric tokens such as "k8s".
    if (digitsOnly.test(token)) continue

    tokens.push(token)
  }

  return tokens
}
|
|
117
|
-
|
|
118
|
-
// Ordered suffix rewrite rules for basicStem(). The first rule that matches
// and leaves a stem of at least 3 characters wins. Kept at module scope so
// the table is built once rather than on every call.
const SUFFIX_RULES: ReadonlyArray<readonly [RegExp, string]> = [
  [/ies$/, 'y'],
  [/ied$/, 'y'],
  // "-es" is only a plural marker after a sibilant ("classes", "boxes",
  // "matches"). Elsewhere the "e" belongs to the stem ("files" -> "file").
  [/(ss|sh|ch|x)es$/, '$1'],
  // Plural "s", but never the second "s" of a double-s ending. Without this
  // guard "class" became "clas" and the "ness"/"less" rules below could
  // never be reached.
  [/([^s])s$/, '$1'],
  [/ing$/, ''],
  [/ed$/, ''],
  [/tion$/, 't'],
  [/ness$/, ''],
  [/ment$/, ''],
  [/able$/, ''],
  [/ible$/, ''],
  [/ful$/, ''],
  [/less$/, ''],
  [/ly$/, '']
]

/**
 * Basic Porter-like stemming.
 * Simplified for performance - handles common English suffixes.
 *
 * Words shorter than 4 characters are returned unchanged. Otherwise the
 * first applicable suffix rule is applied once, provided the result keeps
 * at least 3 characters; if no rule qualifies the word is returned as is.
 * Matching is case-sensitive, so suffixes are only recognized in lowercase.
 *
 * @param word - A single token.
 * @returns The stemmed token, or `word` itself when nothing applies.
 */
function basicStem(word: string): string {
  if (word.length < 4) return word

  for (const [pattern, replacement] of SUFFIX_RULES) {
    if (!pattern.test(word)) continue

    const candidate = word.replace(pattern, replacement)
    // A too-short result rejects this rule only; later rules may still apply.
    if (candidate.length >= 3) return candidate
  }

  return word
}
|
|
157
|
-
|
|
158
|
-
/**
 * Get n-grams from tokens.
 *
 * Slides a window of `n` consecutive tokens over the list and joins each
 * window with a single space.
 *
 * @param tokens - Source tokens, in order.
 * @param n - Window size (default 2). Must be a positive integer.
 * @returns One string per window, or `[]` when `n` is not a positive
 *   integer or there are fewer than `n` tokens.
 */
export function getNGrams(tokens: string[], n: number = 2): string[] {
  // A window size that is not a positive integer has no meaningful n-grams.
  // Without this guard n = 0 produced a list of empty strings and a
  // negative n produced arbitrary slices.
  if (!Number.isInteger(n) || n < 1 || tokens.length < n) return []

  return Array.from({ length: tokens.length - n + 1 }, (_, start) =>
    tokens.slice(start, start + n).join(' ')
  )
}
|
|
171
|
-
|
|
172
|
-
/**
 * Search-oriented tokenizer: yields the distinct unigrams of `text`
 * followed by the distinct bigrams built from adjacent unigrams.
 *
 * @param text - Raw input, forwarded to tokenize().
 * @param options - Tokenizer overrides, forwarded to tokenize().
 * @returns De-duplicated tokens in first-seen order, unigrams before bigrams.
 */
export function tokenizeForSearch(
  text: string,
  options: TokenizerOptions = {}
): string[] {
  const words = tokenize(text, options)

  // A Set keeps first-insertion order, so seeding it with the unigrams and
  // then adding the bigrams gives "unigrams first" with duplicates removed.
  const distinct = new Set<string>(words)
  for (const pair of getNGrams(words, 2)) {
    distinct.add(pair)
  }

  return Array.from(distinct)
}
|
|
1
|
+
/**
|
|
2
|
+
* Text Tokenizer for BM25
|
|
3
|
+
* Handles text preprocessing for sparse search
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
// English function words that carry no retrieval signal. tokenize() drops
// any token found here when `removeStopwords` is enabled. The entries 's',
// 't' and 'don' are the pieces that contractions such as "don't" break into,
// because the tokenizer's word pattern does not match apostrophes.
const STOPWORDS = new Set<string>(
  [
    'a an the and or but in on at to for',
    'of with by from as is was are were been',
    'be have has had do does did will would could',
    'should may might must shall can need dare ought',
    'used it its this that these those i me my',
    'myself we our ours ourselves you your yours',
    'yourself yourselves he him his himself she her',
    'hers herself they them their theirs themselves',
    'what which who whom when where why how all',
    'each every both few more most other some such',
    'no nor not only own same so than too very',
    's t just don now then here there also'
  ]
    .join(' ')
    .split(' ')
)
|
|
21
|
+
|
|
22
|
+
// Technical acronyms and tool names that tokenize() returns untouched
// instead of passing them through the stemmer.
const KEEP_TERMS = new Set<string>(
  [
    'api sql css html json xml http https url uri',
    'jwt oauth rest graphql grpc tcp udp ip dns',
    'aws gcp azure docker kubernetes k8s npm yarn pnpm',
    'git github gitlab ci cd devops mlops db orm',
    'ui ux cli gui ide sdk mcp llm ai ml'
  ]
    .join(' ')
    .split(' ')
)
|
|
30
|
+
|
|
31
|
+
/**
 * Switches controlling how tokenize() turns raw text into tokens.
 * Every field is optional; omitted fields take the module defaults.
 */
export interface TokenizerOptions {
  /** Tokens with fewer characters than this are discarded (default 2). */
  minLength?: number

  /** Tokens with more characters than this are discarded (default 50). */
  maxLength?: number

  /** Discard tokens that appear in the stopword list (default true). */
  removeStopwords?: boolean

  /** Lowercase the text before it is split into tokens (default true). */
  lowercase?: boolean

  /** Strip common English suffixes from each token (default true). */
  stemming?: boolean

  /** Break identifiers at lower-to-upper case boundaries, e.g. `fooBar` (default true). */
  splitCamelCase?: boolean
}
|
|
45
|
+
|
|
46
|
+
// Baseline settings that tokenize() merges underneath caller-supplied options.
// Typed as fully populated and read-only: every field is guaranteed present,
// so consumers do not need non-null assertions, and the shared object cannot
// be reassigned field-by-field without a compile error.
const DEFAULT_OPTIONS: Readonly<Required<TokenizerOptions>> = {
  minLength: 2,
  maxLength: 50,
  removeStopwords: true,
  lowercase: true,
  stemming: true,
  splitCamelCase: true
}
|
|
54
|
+
|
|
55
|
+
/**
 * Tokenize text for BM25 indexing and search.
 *
 * Pipeline: optional camelCase splitting, optional lowercasing, splitting
 * into word tokens, optional stemming (skipped for KEEP_TERMS), then
 * filtering by length, stopword list and "digits only".
 *
 * @param text - Raw input. Empty or non-string input yields `[]`.
 * @param options - Overrides for the defaults. A field that is omitted or
 *   explicitly `undefined` falls back to its default value.
 * @returns Tokens in input order; duplicates are preserved.
 */
export function tokenize(text: string, options: TokenizerOptions = {}): string[] {
  if (!text || typeof text !== 'string') {
    return []
  }

  // Resolve every option after the merge: with a plain spread an explicit
  // `undefined` replaces the default, which silently disabled the length
  // bounds and the boolean switches. The last operand of each chain only
  // satisfies the type checker; the defaults are always populated.
  const merged = { ...DEFAULT_OPTIONS, ...options }
  const minLength = merged.minLength ?? DEFAULT_OPTIONS.minLength ?? 0
  const maxLength =
    merged.maxLength ?? DEFAULT_OPTIONS.maxLength ?? Number.POSITIVE_INFINITY
  const removeStopwords =
    merged.removeStopwords ?? DEFAULT_OPTIONS.removeStopwords ?? false
  const lowercase = merged.lowercase ?? DEFAULT_OPTIONS.lowercase ?? false
  const stemming = merged.stemming ?? DEFAULT_OPTIONS.stemming ?? false
  const splitCamelCase =
    merged.splitCamelCase ?? DEFAULT_OPTIONS.splitCamelCase ?? false

  let processedText = text
  if (splitCamelCase) {
    // Insert a space at each lower-to-upper boundary: "fooBar" -> "foo Bar"
    processedText = processedText.replace(/([a-z])([A-Z])/g, '$1 $2')
  }
  if (lowercase) {
    processedText = processedText.toLowerCase()
  }

  // Words and numbers, with hyphen-joined compounds kept as a single token.
  // `\w` already covers `_`, so snake_case identifiers also stay whole.
  const rawTokens = processedText.match(/[\w]+(?:[-_][\w]+)*/g) ?? []
  const digitsOnly = /^\d+$/

  const tokens: string[] = []
  for (const raw of rawTokens) {
    const surface = raw.toLowerCase()

    // Check stopwords on the unstemmed form first. Stemming turns "this"
    // into "thi" and "ourselves" into "ourselv", which a post-stem lookup
    // alone would let through.
    if (removeStopwords && STOPWORDS.has(surface)) continue

    // Programming terms are kept intact; everything else may be stemmed.
    const token = stemming && !KEEP_TERMS.has(surface) ? basicStem(raw) : raw

    if (token.length < minLength || token.length > maxLength) continue

    // A non-stopword can still stem into one (e.g. "needs" -> "need").
    if (removeStopwords && STOPWORDS.has(token.toLowerCase())) continue

    // Drop pure numbers but keep alphanumeric tokens such as "k8s".
    if (digitsOnly.test(token)) continue

    tokens.push(token)
  }

  return tokens
}
|
|
117
|
+
|
|
118
|
+
// Ordered suffix rewrite rules for basicStem(). The first rule that matches
// and leaves a stem of at least 3 characters wins. Kept at module scope so
// the table is built once rather than on every call.
const SUFFIX_RULES: ReadonlyArray<readonly [RegExp, string]> = [
  [/ies$/, 'y'],
  [/ied$/, 'y'],
  // "-es" is only a plural marker after a sibilant ("classes", "boxes",
  // "matches"). Elsewhere the "e" belongs to the stem ("files" -> "file").
  [/(ss|sh|ch|x)es$/, '$1'],
  // Plural "s", but never the second "s" of a double-s ending. Without this
  // guard "class" became "clas" and the "ness"/"less" rules below could
  // never be reached.
  [/([^s])s$/, '$1'],
  [/ing$/, ''],
  [/ed$/, ''],
  [/tion$/, 't'],
  [/ness$/, ''],
  [/ment$/, ''],
  [/able$/, ''],
  [/ible$/, ''],
  [/ful$/, ''],
  [/less$/, ''],
  [/ly$/, '']
]

/**
 * Basic Porter-like stemming.
 * Simplified for performance - handles common English suffixes.
 *
 * Words shorter than 4 characters are returned unchanged. Otherwise the
 * first applicable suffix rule is applied once, provided the result keeps
 * at least 3 characters; if no rule qualifies the word is returned as is.
 * Matching is case-sensitive, so suffixes are only recognized in lowercase.
 *
 * @param word - A single token.
 * @returns The stemmed token, or `word` itself when nothing applies.
 */
function basicStem(word: string): string {
  if (word.length < 4) return word

  for (const [pattern, replacement] of SUFFIX_RULES) {
    if (!pattern.test(word)) continue

    const candidate = word.replace(pattern, replacement)
    // A too-short result rejects this rule only; later rules may still apply.
    if (candidate.length >= 3) return candidate
  }

  return word
}
|
|
157
|
+
|
|
158
|
+
/**
 * Get n-grams from tokens.
 *
 * Slides a window of `n` consecutive tokens over the list and joins each
 * window with a single space.
 *
 * @param tokens - Source tokens, in order.
 * @param n - Window size (default 2). Must be a positive integer.
 * @returns One string per window, or `[]` when `n` is not a positive
 *   integer or there are fewer than `n` tokens.
 */
export function getNGrams(tokens: string[], n: number = 2): string[] {
  // A window size that is not a positive integer has no meaningful n-grams.
  // Without this guard n = 0 produced a list of empty strings and a
  // negative n produced arbitrary slices.
  if (!Number.isInteger(n) || n < 1 || tokens.length < n) return []

  return Array.from({ length: tokens.length - n + 1 }, (_, start) =>
    tokens.slice(start, start + n).join(' ')
  )
}
|
|
171
|
+
|
|
172
|
+
/**
 * Search-oriented tokenizer: yields the distinct unigrams of `text`
 * followed by the distinct bigrams built from adjacent unigrams.
 *
 * @param text - Raw input, forwarded to tokenize().
 * @param options - Tokenizer overrides, forwarded to tokenize().
 * @returns De-duplicated tokens in first-seen order, unigrams before bigrams.
 */
export function tokenizeForSearch(
  text: string,
  options: TokenizerOptions = {}
): string[] {
  const words = tokenize(text, options)

  // A Set keeps first-insertion order, so seeding it with the unigrams and
  // then adding the bigrams gives "unigrams first" with duplicates removed.
  const distinct = new Set<string>(words)
  for (const pair of getNGrams(words, 2)) {
    distinct.add(pair)
  }

  return Array.from(distinct)
}
|