claude-brain 0.15.2 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (250):
  1. package/README.md +191 -191
  2. package/VERSION +1 -1
  3. package/assets/CLAUDE-unified.md +11 -11
  4. package/assets/CLAUDE.md +29 -11
  5. package/bunfig.toml +8 -8
  6. package/package.json +82 -82
  7. package/packs/backend/node.json +173 -173
  8. package/packs/core/javascript.json +176 -176
  9. package/packs/core/typescript.json +222 -222
  10. package/packs/frontend/react.json +254 -254
  11. package/packs/meta/testing.json +172 -172
  12. package/scripts/postinstall.mjs +341 -341
  13. package/src/automation/auto-context.ts +240 -240
  14. package/src/automation/decision-detector.ts +452 -452
  15. package/src/automation/index.ts +11 -11
  16. package/src/automation/phase12-manager.ts +456 -456
  17. package/src/automation/proactive-recall.ts +373 -373
  18. package/src/automation/project-detector.ts +310 -310
  19. package/src/automation/repo-scanner.ts +205 -205
  20. package/src/cli/auto-setup.ts +82 -82
  21. package/src/cli/bin.ts +209 -202
  22. package/src/cli/commands/chroma.ts +573 -573
  23. package/src/cli/commands/git-hook.ts +189 -189
  24. package/src/cli/commands/hooks.ts +213 -213
  25. package/src/cli/commands/init.ts +122 -122
  26. package/src/cli/commands/install-mcp.ts +92 -92
  27. package/src/cli/commands/pack.ts +197 -197
  28. package/src/cli/commands/refresh.ts +323 -0
  29. package/src/cli/commands/serve.ts +167 -173
  30. package/src/cli/commands/start.ts +42 -42
  31. package/src/cli/commands/uninstall-mcp.ts +41 -41
  32. package/src/cli/commands/update.ts +124 -121
  33. package/src/cli/diagnose.ts +4 -4
  34. package/src/cli/health-check.ts +4 -4
  35. package/src/cli/migrate-chroma.ts +106 -106
  36. package/src/cli/setup.ts +4 -4
  37. package/src/cli/ui/animations.ts +80 -80
  38. package/src/cli/ui/components.ts +82 -82
  39. package/src/cli/ui/index.ts +4 -4
  40. package/src/cli/ui/logo.ts +36 -36
  41. package/src/cli/ui/theme.ts +55 -55
  42. package/src/config/defaults.ts +50 -50
  43. package/src/config/home.ts +55 -55
  44. package/src/config/index.ts +7 -7
  45. package/src/config/loader.ts +166 -166
  46. package/src/config/migration.ts +76 -76
  47. package/src/config/schema.ts +360 -360
  48. package/src/config/validator.ts +184 -184
  49. package/src/config/watcher.ts +86 -86
  50. package/src/context/assembler.ts +398 -398
  51. package/src/context/cache-manager.ts +101 -101
  52. package/src/context/formatter.ts +84 -84
  53. package/src/context/hierarchy.ts +85 -85
  54. package/src/context/index.ts +83 -83
  55. package/src/context/progress-tracker.ts +174 -174
  56. package/src/context/standards-manager.ts +287 -287
  57. package/src/context/types.ts +252 -252
  58. package/src/context/validator.ts +58 -58
  59. package/src/diagnostics/index.ts +123 -123
  60. package/src/health/index.ts +229 -229
  61. package/src/hooks/brain-hook.ts +128 -112
  62. package/src/hooks/capture.ts +168 -205
  63. package/src/hooks/context-hook.ts +137 -0
  64. package/src/hooks/deduplicator.ts +72 -72
  65. package/src/hooks/git-capture.ts +109 -109
  66. package/src/hooks/git-hook-installer.ts +207 -207
  67. package/src/hooks/index.ts +20 -20
  68. package/src/hooks/installer.ts +244 -194
  69. package/src/hooks/passive-classifier.ts +404 -723
  70. package/src/hooks/queue.ts +129 -129
  71. package/src/hooks/session-tracker.ts +312 -275
  72. package/src/hooks/types.ts +52 -47
  73. package/src/index.ts +7 -7
  74. package/src/intelligence/cross-project/affinity.ts +162 -162
  75. package/src/intelligence/cross-project/generalizer.ts +283 -283
  76. package/src/intelligence/cross-project/index.ts +13 -13
  77. package/src/intelligence/cross-project/transfer.ts +201 -201
  78. package/src/intelligence/index.ts +24 -24
  79. package/src/intelligence/optimization/index.ts +10 -10
  80. package/src/intelligence/optimization/precompute.ts +202 -202
  81. package/src/intelligence/optimization/semantic-cache.ts +207 -207
  82. package/src/intelligence/prediction/context-anticipator.ts +198 -198
  83. package/src/intelligence/prediction/decision-predictor.ts +184 -184
  84. package/src/intelligence/prediction/index.ts +13 -13
  85. package/src/intelligence/prediction/recommender.ts +268 -268
  86. package/src/intelligence/reasoning/chain-retrieval.ts +247 -247
  87. package/src/intelligence/reasoning/counterfactual.ts +248 -248
  88. package/src/intelligence/reasoning/index.ts +13 -13
  89. package/src/intelligence/reasoning/synthesizer.ts +169 -169
  90. package/src/intelligence/temporal/evolution.ts +197 -197
  91. package/src/intelligence/temporal/index.ts +16 -16
  92. package/src/intelligence/temporal/query-processor.ts +190 -190
  93. package/src/intelligence/temporal/timeline.ts +259 -259
  94. package/src/intelligence/temporal/trends.ts +263 -263
  95. package/src/knowledge/entity-extractor.ts +416 -416
  96. package/src/knowledge/graph/builder.ts +185 -185
  97. package/src/knowledge/graph/linker.ts +201 -201
  98. package/src/knowledge/graph/memory-graph.ts +359 -359
  99. package/src/knowledge/graph/schema.ts +99 -99
  100. package/src/knowledge/graph/search.ts +168 -168
  101. package/src/knowledge/relationship-extractor.ts +108 -108
  102. package/src/memory/chroma/client.ts +174 -174
  103. package/src/memory/chroma/collection-manager.ts +94 -94
  104. package/src/memory/chroma/config.ts +57 -57
  105. package/src/memory/chroma/embeddings.ts +155 -155
  106. package/src/memory/chroma/index.ts +82 -82
  107. package/src/memory/chroma/migration.ts +270 -270
  108. package/src/memory/chroma/schemas.ts +69 -69
  109. package/src/memory/chroma/search.ts +315 -315
  110. package/src/memory/chroma/store.ts +741 -741
  111. package/src/memory/consolidation/archiver.ts +164 -164
  112. package/src/memory/consolidation/merger.ts +186 -186
  113. package/src/memory/consolidation/scorer.ts +138 -138
  114. package/src/memory/context-builder.ts +236 -236
  115. package/src/memory/database.ts +169 -169
  116. package/src/memory/embedding-utils.ts +156 -156
  117. package/src/memory/embeddings.ts +226 -226
  118. package/src/memory/episodic/detector.ts +108 -108
  119. package/src/memory/episodic/manager.ts +351 -351
  120. package/src/memory/episodic/summarizer.ts +179 -179
  121. package/src/memory/episodic/types.ts +52 -52
  122. package/src/memory/index.ts +582 -582
  123. package/src/memory/knowledge-extractor.ts +455 -455
  124. package/src/memory/learning.ts +378 -378
  125. package/src/memory/patterns.ts +396 -396
  126. package/src/memory/schema.ts +88 -88
  127. package/src/memory/search.ts +309 -309
  128. package/src/memory/store.ts +787 -787
  129. package/src/memory/types.ts +121 -121
  130. package/src/orchestrator/coordinator.ts +272 -272
  131. package/src/orchestrator/decision-logger.ts +228 -228
  132. package/src/orchestrator/event-emitter.ts +198 -198
  133. package/src/orchestrator/event-queue.ts +184 -184
  134. package/src/orchestrator/handlers/base-handler.ts +70 -70
  135. package/src/orchestrator/handlers/context-handler.ts +73 -73
  136. package/src/orchestrator/handlers/decision-handler.ts +204 -204
  137. package/src/orchestrator/handlers/index.ts +10 -10
  138. package/src/orchestrator/handlers/status-handler.ts +131 -131
  139. package/src/orchestrator/handlers/task-handler.ts +171 -171
  140. package/src/orchestrator/index.ts +275 -275
  141. package/src/orchestrator/task-parser.ts +284 -284
  142. package/src/orchestrator/types.ts +98 -98
  143. package/src/packs/index.ts +9 -9
  144. package/src/packs/loader.ts +134 -134
  145. package/src/packs/manager.ts +204 -204
  146. package/src/packs/ranker.ts +78 -78
  147. package/src/packs/types.ts +81 -81
  148. package/src/phase12/index.ts +5 -5
  149. package/src/retrieval/bm25/index.ts +300 -300
  150. package/src/retrieval/bm25/tokenizer.ts +184 -184
  151. package/src/retrieval/feedback/adaptive.ts +223 -223
  152. package/src/retrieval/feedback/index.ts +16 -16
  153. package/src/retrieval/feedback/metrics.ts +223 -223
  154. package/src/retrieval/feedback/store.ts +283 -283
  155. package/src/retrieval/fusion/index.ts +194 -194
  156. package/src/retrieval/fusion/rrf.ts +163 -163
  157. package/src/retrieval/index.ts +12 -12
  158. package/src/retrieval/pipeline.ts +375 -375
  159. package/src/retrieval/query/expander.ts +198 -198
  160. package/src/retrieval/query/index.ts +27 -27
  161. package/src/retrieval/query/intent-classifier.ts +236 -236
  162. package/src/retrieval/query/temporal-parser.ts +295 -295
  163. package/src/retrieval/reranker/index.ts +188 -188
  164. package/src/retrieval/reranker/model.ts +95 -95
  165. package/src/retrieval/service.ts +125 -125
  166. package/src/retrieval/types.ts +162 -162
  167. package/src/routing/entity-extractor.ts +428 -428
  168. package/src/routing/intent-classifier.ts +450 -436
  169. package/src/routing/response-filter.ts +261 -258
  170. package/src/routing/router.ts +1441 -1322
  171. package/src/routing/search-engine.ts +515 -475
  172. package/src/routing/types.ts +94 -94
  173. package/src/scripts/health-check.ts +118 -118
  174. package/src/scripts/setup.ts +122 -122
  175. package/src/server/handlers/call-tool.ts +156 -156
  176. package/src/server/handlers/index.ts +9 -9
  177. package/src/server/handlers/list-tools.ts +35 -35
  178. package/src/server/handlers/tools/analyze-decision-evolution.ts +151 -151
  179. package/src/server/handlers/tools/auto-remember.ts +200 -200
  180. package/src/server/handlers/tools/brain.ts +85 -85
  181. package/src/server/handlers/tools/create-project.ts +135 -135
  182. package/src/server/handlers/tools/detect-trends.ts +144 -144
  183. package/src/server/handlers/tools/find-cross-project-patterns.ts +168 -168
  184. package/src/server/handlers/tools/get-activity-log.ts +194 -194
  185. package/src/server/handlers/tools/get-code-standards.ts +124 -124
  186. package/src/server/handlers/tools/get-corrections.ts +154 -154
  187. package/src/server/handlers/tools/get-decision-timeline.ts +172 -172
  188. package/src/server/handlers/tools/get-episode.ts +103 -103
  189. package/src/server/handlers/tools/get-patterns.ts +158 -158
  190. package/src/server/handlers/tools/get-phase12-status.ts +63 -63
  191. package/src/server/handlers/tools/get-project-context.ts +75 -75
  192. package/src/server/handlers/tools/get-recommendations.ts +145 -145
  193. package/src/server/handlers/tools/index.ts +31 -31
  194. package/src/server/handlers/tools/init-project.ts +757 -757
  195. package/src/server/handlers/tools/list-episodes.ts +90 -90
  196. package/src/server/handlers/tools/list-projects.ts +125 -125
  197. package/src/server/handlers/tools/rate-memory.ts +101 -101
  198. package/src/server/handlers/tools/recall-similar.ts +87 -87
  199. package/src/server/handlers/tools/recognize-pattern.ts +126 -126
  200. package/src/server/handlers/tools/record-correction.ts +125 -125
  201. package/src/server/handlers/tools/remember-decision.ts +153 -153
  202. package/src/server/handlers/tools/schemas.ts +253 -253
  203. package/src/server/handlers/tools/search-knowledge-graph.ts +102 -102
  204. package/src/server/handlers/tools/smart-context.ts +146 -146
  205. package/src/server/handlers/tools/update-progress.ts +131 -131
  206. package/src/server/handlers/tools/what-if-analysis.ts +135 -135
  207. package/src/server/http-api.ts +761 -693
  208. package/src/server/index.ts +40 -40
  209. package/src/server/mcp-server.ts +283 -283
  210. package/src/server/providers/index.ts +7 -7
  211. package/src/server/providers/prompts.ts +327 -327
  212. package/src/server/providers/resources.ts +622 -622
  213. package/src/server/services.ts +468 -468
  214. package/src/server/types.ts +39 -39
  215. package/src/server/utils/error-handler.ts +155 -155
  216. package/src/server/utils/index.ts +13 -13
  217. package/src/server/utils/memory-indicator.ts +83 -83
  218. package/src/server/utils/request-context.ts +122 -122
  219. package/src/server/utils/response-formatter.ts +129 -129
  220. package/src/server/utils/validators.ts +210 -210
  221. package/src/setup/index.ts +48 -48
  222. package/src/setup/wizard.ts +461 -461
  223. package/src/tools/index.ts +24 -24
  224. package/src/tools/registry.ts +115 -115
  225. package/src/tools/schemas.test.ts +30 -30
  226. package/src/tools/schemas.ts +617 -617
  227. package/src/tools/types.ts +412 -412
  228. package/src/utils/circuit-breaker.ts +130 -130
  229. package/src/utils/cleanup.ts +34 -34
  230. package/src/utils/error-handler.ts +132 -132
  231. package/src/utils/error-messages.ts +60 -60
  232. package/src/utils/fallback.ts +45 -45
  233. package/src/utils/index.ts +54 -54
  234. package/src/utils/logger-utils.ts +80 -80
  235. package/src/utils/logger.ts +88 -88
  236. package/src/utils/phase12-helper.ts +56 -56
  237. package/src/utils/retry.ts +94 -94
  238. package/src/utils/timing.ts +47 -47
  239. package/src/utils/transaction.ts +63 -63
  240. package/src/vault/frontmatter.ts +264 -264
  241. package/src/vault/index.ts +318 -318
  242. package/src/vault/paths.ts +106 -106
  243. package/src/vault/query.ts +422 -422
  244. package/src/vault/reader.ts +264 -264
  245. package/src/vault/templates.ts +186 -186
  246. package/src/vault/types.ts +73 -73
  247. package/src/vault/watcher.ts +277 -277
  248. package/src/vault/writer.ts +413 -413
  249. package/tsconfig.json +30 -30
  250. package/src/cli/auto-update.ts +0 -157
@@ -1,184 +1,184 @@
1
- /**
2
- * Text Tokenizer for BM25
3
- * Handles text preprocessing for sparse search
4
- */
5
-
6
// English function words (articles, pronouns, auxiliaries, ...) that the tokenizer
// drops when stopword removal is enabled. Written as space-separated rows and
// expanded into a Set.
const STOPWORDS = new Set<string>(
  [
    'a an the and or but in on at to for',
    'of with by from as is was are were been',
    'be have has had do does did will would could',
    'should may might must shall can need dare ought',
    'used it its this that these those i me my',
    'myself we our ours ourselves you your yours',
    'yourself yourselves he him his himself she her',
    'hers herself they them their theirs themselves',
    'what which who whom when where why how all',
    'each every both few more most other some such',
    'no nor not only own same so than too very',
    's t just don now then here there also'
  ]
    .join(' ')
    .split(' ')
)

// Programming acronyms and tool names that the tokenizer passes through
// untouched instead of handing them to the stemmer.
const KEEP_TERMS = new Set<string>(
  [
    'api sql css html json xml http https url uri',
    'jwt oauth rest graphql grpc tcp udp ip dns',
    'aws gcp azure docker kubernetes k8s npm yarn pnpm',
    'git github gitlab ci cd devops mlops db orm',
    'ui ux cli gui ide sdk mcp llm ai ml'
  ]
    .join(' ')
    .split(' ')
)
30
-
31
/**
 * Options accepted by `tokenize` and `tokenizeForSearch`.
 * Every field is optional; omitted fields fall back to `DEFAULT_OPTIONS`.
 */
export interface TokenizerOptions {
  /** Insert a break at each lowercase-to-uppercase boundary (e.g. `camelCase`) before tokenizing */
  splitCamelCase?: boolean
  /** Lowercase the text before it is split into tokens */
  lowercase?: boolean
  /** Run the basic suffix stemmer on each token */
  stemming?: boolean
  /** Drop tokens that appear in the stopword list */
  removeStopwords?: boolean
  /** Tokens shorter than this many characters are dropped */
  minLength?: number
  /** Tokens longer than this many characters are dropped */
  maxLength?: number
}
45
-
46
/**
 * Baseline tokenizer settings, merged underneath caller-supplied options.
 *
 * Declared as `Required<TokenizerOptions>` (rather than the all-optional
 * `TokenizerOptions`) so the type checker knows every field is present and
 * code reading the defaults needs no non-null assertions.
 */
const DEFAULT_OPTIONS: Required<TokenizerOptions> = {
  minLength: 2,
  maxLength: 50,
  removeStopwords: true,
  lowercase: true,
  stemming: true,
  splitCamelCase: true
}
54
-
55
/**
 * Tokenize text for BM25 indexing and search.
 *
 * Pipeline: optional camelCase splitting, optional lowercasing, regex token
 * extraction, stopword removal, optional stemming (terms in `KEEP_TERMS` are
 * never stemmed), then length / stopword / pure-number filtering.
 *
 * @param text - Raw text. Empty or non-string input yields `[]`.
 * @param options - Per-call overrides. A field that is omitted or explicitly
 *   `undefined` falls back to `DEFAULT_OPTIONS`.
 * @returns Tokens in their original order; duplicates are preserved.
 */
export function tokenize(text: string, options: TokenizerOptions = {}): string[] {
  if (!text || typeof text !== 'string') {
    return []
  }

  // Tolerate `null` from untyped (plain JS) callers.
  const given: TokenizerOptions = options ?? {}

  // Resolve every option with `??` instead of an object spread: a spread lets an
  // explicit `undefined` (e.g. a forwarded optional config value) silently
  // replace the default. The trailing literal only satisfies the type checker;
  // DEFAULT_OPTIONS supplies every field at runtime.
  const minLength = given.minLength ?? DEFAULT_OPTIONS.minLength ?? 0
  const maxLength = given.maxLength ?? DEFAULT_OPTIONS.maxLength ?? Infinity
  const removeStopwords = given.removeStopwords ?? DEFAULT_OPTIONS.removeStopwords ?? false
  const lowercase = given.lowercase ?? DEFAULT_OPTIONS.lowercase ?? false
  const stemming = given.stemming ?? DEFAULT_OPTIONS.stemming ?? false
  const splitCamelCase = given.splitCamelCase ?? DEFAULT_OPTIONS.splitCamelCase ?? false

  // Split on camelCase boundaries (lowercase letter followed by uppercase) if enabled
  let processedText = splitCamelCase ? text.replace(/([a-z])([A-Z])/g, '$1 $2') : text

  if (lowercase) {
    processedText = processedText.toLowerCase()
  }

  // Match words, numbers, and hyphenated/underscored compounds
  const rawTokens = processedText.match(/[\w]+(?:[-_][\w]+)*/g) ?? []

  const isStopword = (token: string): boolean =>
    removeStopwords && STOPWORDS.has(token.toLowerCase())

  const tokens: string[] = []
  for (const raw of rawTokens) {
    // Check the stopword list BEFORE stemming. Stemming first turns "this" into
    // "thi" and "does" into "doe", which are no longer in the list and would
    // leak into the index.
    if (isStopword(raw)) continue

    // Programming terms are kept intact; everything else is stemmed if enabled
    const token = stemming && !KEEP_TERMS.has(raw.toLowerCase()) ? basicStem(raw) : raw

    // Length filter (applied to the final, possibly stemmed, form)
    if (token.length < minLength || token.length > maxLength) continue

    // A stem can itself be a stopword (e.g. "hows" -> "how")
    if (isStopword(token)) continue

    // Filter pure numbers (alphanumeric tokens such as "k8s" are kept)
    if (/^\d+$/.test(token)) continue

    tokens.push(token)
  }

  return tokens
}
117
-
118
/**
 * Suffix rewrite rules for `basicStem`, tried top to bottom.
 *
 * Order matters: the first rule whose result is long enough wins. `ness` and
 * `less` must therefore come before the bare `es` / `s` rules; otherwise every
 * word ending in them is caught by `s` first ("stateless" -> "stateles") and
 * the longer rules can never fire.
 */
const SUFFIX_RULES: ReadonlyArray<readonly [string, string]> = [
  ['ies', 'y'],
  ['ied', 'y'],
  ['ness', ''],
  ['less', ''],
  ['es', ''],
  ['s', ''],
  ['ing', ''],
  ['ed', ''],
  ['tion', 't'],
  ['ment', ''],
  ['able', ''],
  ['ible', ''],
  ['ful', ''],
  ['ly', '']
]

/**
 * Basic Porter-like stemming.
 * Simplified for performance - handles common English suffixes only.
 *
 * @param word - A single token; matching is case-sensitive against lowercase suffixes.
 * @returns The word with the first applicable suffix rewritten, or the word
 *   unchanged when it is shorter than 4 characters or no rule applies.
 */
function basicStem(word: string): string {
  if (word.length < 4) return word

  for (const [suffix, replacement] of SUFFIX_RULES) {
    if (!word.endsWith(suffix)) continue

    const candidate = word.slice(0, word.length - suffix.length) + replacement
    // Only accept a stem of at least 3 chars; otherwise keep trying later rules
    if (candidate.length >= 3) {
      return candidate
    }
  }

  return word
}
157
-
158
/**
 * Build word n-grams from a token list.
 *
 * @param tokens - Tokens in document order.
 * @param n - Window size; defaults to 2 (bigrams).
 * @returns Each run of `n` consecutive tokens joined by a single space, in
 *   order. Returns `[]` when there are fewer than `n` tokens, or when `n` is
 *   not a positive integer (a zero, negative or fractional window would
 *   otherwise produce empty or malformed entries).
 */
export function getNGrams(tokens: string[], n: number = 2): string[] {
  if (!Number.isInteger(n) || n < 1 || tokens.length < n) return []

  const windowCount = tokens.length - n + 1
  return Array.from({ length: windowCount }, (_, start) =>
    tokens.slice(start, start + n).join(' ')
  )
}
171
-
172
/**
 * Search-oriented tokenizer: the unigrams from `tokenize` followed by the
 * bigrams built from those unigrams, with duplicates removed.
 *
 * @param text - Raw text to tokenize.
 * @param options - Forwarded unchanged to `tokenize`.
 * @returns Unique terms; unigrams come first, then bigrams, each in order of
 *   first appearance.
 */
export function tokenizeForSearch(
  text: string,
  options: TokenizerOptions = {}
): string[] {
  const words = tokenize(text, options)

  // A Set keeps first-insertion order, so seeding it with the unigrams and
  // then adding bigrams preserves the "unigrams first" ordering.
  const terms = new Set<string>(words)
  for (const pair of getNGrams(words, 2)) {
    terms.add(pair)
  }

  return Array.from(terms)
}
1
+ /**
2
+ * Text Tokenizer for BM25
3
+ * Handles text preprocessing for sparse search
4
+ */
5
+
6
// English function words (articles, pronouns, auxiliaries, ...) that the tokenizer
// drops when stopword removal is enabled. Written as space-separated rows and
// expanded into a Set.
const STOPWORDS = new Set<string>(
  [
    'a an the and or but in on at to for',
    'of with by from as is was are were been',
    'be have has had do does did will would could',
    'should may might must shall can need dare ought',
    'used it its this that these those i me my',
    'myself we our ours ourselves you your yours',
    'yourself yourselves he him his himself she her',
    'hers herself they them their theirs themselves',
    'what which who whom when where why how all',
    'each every both few more most other some such',
    'no nor not only own same so than too very',
    's t just don now then here there also'
  ]
    .join(' ')
    .split(' ')
)

// Programming acronyms and tool names that the tokenizer passes through
// untouched instead of handing them to the stemmer.
const KEEP_TERMS = new Set<string>(
  [
    'api sql css html json xml http https url uri',
    'jwt oauth rest graphql grpc tcp udp ip dns',
    'aws gcp azure docker kubernetes k8s npm yarn pnpm',
    'git github gitlab ci cd devops mlops db orm',
    'ui ux cli gui ide sdk mcp llm ai ml'
  ]
    .join(' ')
    .split(' ')
)
30
+
31
/**
 * Options accepted by `tokenize` and `tokenizeForSearch`.
 * Every field is optional; omitted fields fall back to `DEFAULT_OPTIONS`.
 */
export interface TokenizerOptions {
  /** Insert a break at each lowercase-to-uppercase boundary (e.g. `camelCase`) before tokenizing */
  splitCamelCase?: boolean
  /** Lowercase the text before it is split into tokens */
  lowercase?: boolean
  /** Run the basic suffix stemmer on each token */
  stemming?: boolean
  /** Drop tokens that appear in the stopword list */
  removeStopwords?: boolean
  /** Tokens shorter than this many characters are dropped */
  minLength?: number
  /** Tokens longer than this many characters are dropped */
  maxLength?: number
}
45
+
46
/**
 * Baseline tokenizer settings, merged underneath caller-supplied options.
 *
 * Declared as `Required<TokenizerOptions>` (rather than the all-optional
 * `TokenizerOptions`) so the type checker knows every field is present and
 * code reading the defaults needs no non-null assertions.
 */
const DEFAULT_OPTIONS: Required<TokenizerOptions> = {
  minLength: 2,
  maxLength: 50,
  removeStopwords: true,
  lowercase: true,
  stemming: true,
  splitCamelCase: true
}
54
+
55
/**
 * Tokenize text for BM25 indexing and search.
 *
 * Pipeline: optional camelCase splitting, optional lowercasing, regex token
 * extraction, stopword removal, optional stemming (terms in `KEEP_TERMS` are
 * never stemmed), then length / stopword / pure-number filtering.
 *
 * @param text - Raw text. Empty or non-string input yields `[]`.
 * @param options - Per-call overrides. A field that is omitted or explicitly
 *   `undefined` falls back to `DEFAULT_OPTIONS`.
 * @returns Tokens in their original order; duplicates are preserved.
 */
export function tokenize(text: string, options: TokenizerOptions = {}): string[] {
  if (!text || typeof text !== 'string') {
    return []
  }

  // Tolerate `null` from untyped (plain JS) callers.
  const given: TokenizerOptions = options ?? {}

  // Resolve every option with `??` instead of an object spread: a spread lets an
  // explicit `undefined` (e.g. a forwarded optional config value) silently
  // replace the default. The trailing literal only satisfies the type checker;
  // DEFAULT_OPTIONS supplies every field at runtime.
  const minLength = given.minLength ?? DEFAULT_OPTIONS.minLength ?? 0
  const maxLength = given.maxLength ?? DEFAULT_OPTIONS.maxLength ?? Infinity
  const removeStopwords = given.removeStopwords ?? DEFAULT_OPTIONS.removeStopwords ?? false
  const lowercase = given.lowercase ?? DEFAULT_OPTIONS.lowercase ?? false
  const stemming = given.stemming ?? DEFAULT_OPTIONS.stemming ?? false
  const splitCamelCase = given.splitCamelCase ?? DEFAULT_OPTIONS.splitCamelCase ?? false

  // Split on camelCase boundaries (lowercase letter followed by uppercase) if enabled
  let processedText = splitCamelCase ? text.replace(/([a-z])([A-Z])/g, '$1 $2') : text

  if (lowercase) {
    processedText = processedText.toLowerCase()
  }

  // Match words, numbers, and hyphenated/underscored compounds
  const rawTokens = processedText.match(/[\w]+(?:[-_][\w]+)*/g) ?? []

  const isStopword = (token: string): boolean =>
    removeStopwords && STOPWORDS.has(token.toLowerCase())

  const tokens: string[] = []
  for (const raw of rawTokens) {
    // Check the stopword list BEFORE stemming. Stemming first turns "this" into
    // "thi" and "does" into "doe", which are no longer in the list and would
    // leak into the index.
    if (isStopword(raw)) continue

    // Programming terms are kept intact; everything else is stemmed if enabled
    const token = stemming && !KEEP_TERMS.has(raw.toLowerCase()) ? basicStem(raw) : raw

    // Length filter (applied to the final, possibly stemmed, form)
    if (token.length < minLength || token.length > maxLength) continue

    // A stem can itself be a stopword (e.g. "hows" -> "how")
    if (isStopword(token)) continue

    // Filter pure numbers (alphanumeric tokens such as "k8s" are kept)
    if (/^\d+$/.test(token)) continue

    tokens.push(token)
  }

  return tokens
}
117
+
118
/**
 * Suffix rewrite rules for `basicStem`, tried top to bottom.
 *
 * Order matters: the first rule whose result is long enough wins. `ness` and
 * `less` must therefore come before the bare `es` / `s` rules; otherwise every
 * word ending in them is caught by `s` first ("stateless" -> "stateles") and
 * the longer rules can never fire.
 */
const SUFFIX_RULES: ReadonlyArray<readonly [string, string]> = [
  ['ies', 'y'],
  ['ied', 'y'],
  ['ness', ''],
  ['less', ''],
  ['es', ''],
  ['s', ''],
  ['ing', ''],
  ['ed', ''],
  ['tion', 't'],
  ['ment', ''],
  ['able', ''],
  ['ible', ''],
  ['ful', ''],
  ['ly', '']
]

/**
 * Basic Porter-like stemming.
 * Simplified for performance - handles common English suffixes only.
 *
 * @param word - A single token; matching is case-sensitive against lowercase suffixes.
 * @returns The word with the first applicable suffix rewritten, or the word
 *   unchanged when it is shorter than 4 characters or no rule applies.
 */
function basicStem(word: string): string {
  if (word.length < 4) return word

  for (const [suffix, replacement] of SUFFIX_RULES) {
    if (!word.endsWith(suffix)) continue

    const candidate = word.slice(0, word.length - suffix.length) + replacement
    // Only accept a stem of at least 3 chars; otherwise keep trying later rules
    if (candidate.length >= 3) {
      return candidate
    }
  }

  return word
}
157
+
158
/**
 * Build word n-grams from a token list.
 *
 * @param tokens - Tokens in document order.
 * @param n - Window size; defaults to 2 (bigrams).
 * @returns Each run of `n` consecutive tokens joined by a single space, in
 *   order. Returns `[]` when there are fewer than `n` tokens, or when `n` is
 *   not a positive integer (a zero, negative or fractional window would
 *   otherwise produce empty or malformed entries).
 */
export function getNGrams(tokens: string[], n: number = 2): string[] {
  if (!Number.isInteger(n) || n < 1 || tokens.length < n) return []

  const windowCount = tokens.length - n + 1
  return Array.from({ length: windowCount }, (_, start) =>
    tokens.slice(start, start + n).join(' ')
  )
}
171
+
172
/**
 * Search-oriented tokenizer: the unigrams from `tokenize` followed by the
 * bigrams built from those unigrams, with duplicates removed.
 *
 * @param text - Raw text to tokenize.
 * @param options - Forwarded unchanged to `tokenize`.
 * @returns Unique terms; unigrams come first, then bigrams, each in order of
 *   first appearance.
 */
export function tokenizeForSearch(
  text: string,
  options: TokenizerOptions = {}
): string[] {
  const words = tokenize(text, options)

  // A Set keeps first-insertion order, so seeding it with the unigrams and
  // then adding bigrams preserves the "unigrams first" ordering.
  const terms = new Set<string>(words)
  for (const pair of getNGrams(words, 2)) {
    terms.add(pair)
  }

  return Array.from(terms)
}