claude-brain 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +157 -0
- package/VERSION +1 -0
- package/assets/CLAUDE.md +307 -0
- package/bunfig.toml +8 -0
- package/package.json +74 -0
- package/src/automation/auto-context.ts +240 -0
- package/src/automation/decision-detector.ts +452 -0
- package/src/automation/index.ts +11 -0
- package/src/automation/proactive-recall.ts +373 -0
- package/src/automation/project-detector.ts +297 -0
- package/src/cli/auto-setup.ts +74 -0
- package/src/cli/bin.ts +110 -0
- package/src/cli/commands/install-mcp.ts +50 -0
- package/src/cli/commands/serve.ts +129 -0
- package/src/cli/diagnose.ts +4 -0
- package/src/cli/health-check.ts +4 -0
- package/src/cli/migrate-chroma.ts +106 -0
- package/src/cli/setup.ts +4 -0
- package/src/config/defaults.ts +47 -0
- package/src/config/home.ts +55 -0
- package/src/config/index.ts +7 -0
- package/src/config/loader.ts +166 -0
- package/src/config/migration.ts +76 -0
- package/src/config/schema.ts +257 -0
- package/src/config/validator.ts +184 -0
- package/src/config/watcher.ts +86 -0
- package/src/context/assembler.ts +398 -0
- package/src/context/cache-manager.ts +101 -0
- package/src/context/formatter.ts +84 -0
- package/src/context/hierarchy.ts +85 -0
- package/src/context/index.ts +83 -0
- package/src/context/progress-tracker.ts +174 -0
- package/src/context/standards-manager.ts +267 -0
- package/src/context/types.ts +252 -0
- package/src/context/validator.ts +58 -0
- package/src/cross-project/affinity.ts +162 -0
- package/src/cross-project/generalizer.ts +283 -0
- package/src/cross-project/index.ts +13 -0
- package/src/cross-project/transfer.ts +201 -0
- package/src/diagnostics/index.ts +123 -0
- package/src/health/index.ts +229 -0
- package/src/index.ts +7 -0
- package/src/knowledge/entity-extractor.ts +416 -0
- package/src/knowledge/graph/builder.ts +159 -0
- package/src/knowledge/graph/linker.ts +201 -0
- package/src/knowledge/graph/memory-graph.ts +359 -0
- package/src/knowledge/graph/schema.ts +99 -0
- package/src/knowledge/graph/search.ts +168 -0
- package/src/knowledge/relationship-extractor.ts +108 -0
- package/src/memory/chroma/client.ts +169 -0
- package/src/memory/chroma/collection-manager.ts +94 -0
- package/src/memory/chroma/config.ts +46 -0
- package/src/memory/chroma/embeddings.ts +153 -0
- package/src/memory/chroma/index.ts +82 -0
- package/src/memory/chroma/migration.ts +270 -0
- package/src/memory/chroma/schemas.ts +69 -0
- package/src/memory/chroma/search.ts +315 -0
- package/src/memory/chroma/store.ts +694 -0
- package/src/memory/consolidation/archiver.ts +164 -0
- package/src/memory/consolidation/merger.ts +186 -0
- package/src/memory/consolidation/scorer.ts +138 -0
- package/src/memory/context-builder.ts +236 -0
- package/src/memory/database.ts +169 -0
- package/src/memory/embedding-utils.ts +156 -0
- package/src/memory/embeddings.ts +226 -0
- package/src/memory/episodic/detector.ts +108 -0
- package/src/memory/episodic/manager.ts +334 -0
- package/src/memory/episodic/summarizer.ts +179 -0
- package/src/memory/episodic/types.ts +52 -0
- package/src/memory/index.ts +395 -0
- package/src/memory/knowledge-extractor.ts +455 -0
- package/src/memory/learning.ts +378 -0
- package/src/memory/patterns.ts +396 -0
- package/src/memory/schema.ts +56 -0
- package/src/memory/search.ts +309 -0
- package/src/memory/store.ts +344 -0
- package/src/memory/types.ts +121 -0
- package/src/optimization/index.ts +10 -0
- package/src/optimization/precompute.ts +202 -0
- package/src/optimization/semantic-cache.ts +207 -0
- package/src/orchestrator/coordinator.ts +272 -0
- package/src/orchestrator/decision-logger.ts +228 -0
- package/src/orchestrator/event-emitter.ts +198 -0
- package/src/orchestrator/event-queue.ts +184 -0
- package/src/orchestrator/handlers/base-handler.ts +70 -0
- package/src/orchestrator/handlers/context-handler.ts +73 -0
- package/src/orchestrator/handlers/decision-handler.ts +204 -0
- package/src/orchestrator/handlers/index.ts +10 -0
- package/src/orchestrator/handlers/status-handler.ts +131 -0
- package/src/orchestrator/handlers/task-handler.ts +171 -0
- package/src/orchestrator/index.ts +275 -0
- package/src/orchestrator/task-parser.ts +284 -0
- package/src/orchestrator/types.ts +98 -0
- package/src/phase12/index.ts +456 -0
- package/src/prediction/context-anticipator.ts +198 -0
- package/src/prediction/decision-predictor.ts +184 -0
- package/src/prediction/index.ts +13 -0
- package/src/prediction/recommender.ts +268 -0
- package/src/reasoning/chain-retrieval.ts +247 -0
- package/src/reasoning/counterfactual.ts +248 -0
- package/src/reasoning/index.ts +13 -0
- package/src/reasoning/synthesizer.ts +169 -0
- package/src/retrieval/bm25/index.ts +300 -0
- package/src/retrieval/bm25/tokenizer.ts +184 -0
- package/src/retrieval/feedback/adaptive.ts +223 -0
- package/src/retrieval/feedback/index.ts +16 -0
- package/src/retrieval/feedback/metrics.ts +223 -0
- package/src/retrieval/feedback/store.ts +283 -0
- package/src/retrieval/fusion/index.ts +194 -0
- package/src/retrieval/fusion/rrf.ts +163 -0
- package/src/retrieval/index.ts +12 -0
- package/src/retrieval/pipeline.ts +375 -0
- package/src/retrieval/query/expander.ts +198 -0
- package/src/retrieval/query/index.ts +27 -0
- package/src/retrieval/query/intent-classifier.ts +236 -0
- package/src/retrieval/query/temporal-parser.ts +295 -0
- package/src/retrieval/reranker/index.ts +188 -0
- package/src/retrieval/reranker/model.ts +95 -0
- package/src/retrieval/service.ts +125 -0
- package/src/retrieval/types.ts +162 -0
- package/src/scripts/health-check.ts +118 -0
- package/src/scripts/setup.ts +122 -0
- package/src/server/handlers/call-tool.ts +194 -0
- package/src/server/handlers/index.ts +9 -0
- package/src/server/handlers/list-tools.ts +18 -0
- package/src/server/handlers/tools/analyze-decision-evolution.ts +71 -0
- package/src/server/handlers/tools/auto-remember.ts +200 -0
- package/src/server/handlers/tools/create-project.ts +135 -0
- package/src/server/handlers/tools/detect-trends.ts +80 -0
- package/src/server/handlers/tools/find-cross-project-patterns.ts +73 -0
- package/src/server/handlers/tools/get-activity-log.ts +194 -0
- package/src/server/handlers/tools/get-code-standards.ts +124 -0
- package/src/server/handlers/tools/get-corrections.ts +154 -0
- package/src/server/handlers/tools/get-decision-timeline.ts +86 -0
- package/src/server/handlers/tools/get-episode.ts +93 -0
- package/src/server/handlers/tools/get-patterns.ts +158 -0
- package/src/server/handlers/tools/get-phase12-status.ts +63 -0
- package/src/server/handlers/tools/get-project-context.ts +75 -0
- package/src/server/handlers/tools/get-recommendations.ts +65 -0
- package/src/server/handlers/tools/index.ts +33 -0
- package/src/server/handlers/tools/init-project.ts +710 -0
- package/src/server/handlers/tools/list-episodes.ts +80 -0
- package/src/server/handlers/tools/list-projects.ts +125 -0
- package/src/server/handlers/tools/rate-memory.ts +95 -0
- package/src/server/handlers/tools/recall-similar.ts +87 -0
- package/src/server/handlers/tools/recognize-pattern.ts +126 -0
- package/src/server/handlers/tools/record-correction.ts +125 -0
- package/src/server/handlers/tools/remember-decision.ts +153 -0
- package/src/server/handlers/tools/schemas.ts +241 -0
- package/src/server/handlers/tools/search-knowledge-graph.ts +89 -0
- package/src/server/handlers/tools/smart-context.ts +124 -0
- package/src/server/handlers/tools/update-progress.ts +114 -0
- package/src/server/handlers/tools/what-if-analysis.ts +73 -0
- package/src/server/http-api.ts +474 -0
- package/src/server/index.ts +40 -0
- package/src/server/mcp-server.ts +283 -0
- package/src/server/providers/index.ts +7 -0
- package/src/server/providers/prompts.ts +327 -0
- package/src/server/providers/resources.ts +427 -0
- package/src/server/services.ts +388 -0
- package/src/server/types.ts +39 -0
- package/src/server/utils/error-handler.ts +155 -0
- package/src/server/utils/index.ts +13 -0
- package/src/server/utils/memory-indicator.ts +83 -0
- package/src/server/utils/request-context.ts +122 -0
- package/src/server/utils/response-formatter.ts +124 -0
- package/src/server/utils/validators.ts +210 -0
- package/src/setup/index.ts +22 -0
- package/src/setup/wizard.ts +321 -0
- package/src/temporal/evolution.ts +197 -0
- package/src/temporal/index.ts +16 -0
- package/src/temporal/query-processor.ts +190 -0
- package/src/temporal/timeline.ts +259 -0
- package/src/temporal/trends.ts +263 -0
- package/src/tools/index.ts +24 -0
- package/src/tools/registry.ts +106 -0
- package/src/tools/schemas.test.ts +30 -0
- package/src/tools/schemas.ts +907 -0
- package/src/tools/types.ts +412 -0
- package/src/utils/circuit-breaker.ts +130 -0
- package/src/utils/cleanup.ts +34 -0
- package/src/utils/error-handler.ts +132 -0
- package/src/utils/error-messages.ts +60 -0
- package/src/utils/fallback.ts +45 -0
- package/src/utils/index.ts +54 -0
- package/src/utils/logger-utils.ts +80 -0
- package/src/utils/logger.ts +88 -0
- package/src/utils/phase12-helper.ts +56 -0
- package/src/utils/retry.ts +94 -0
- package/src/utils/transaction.ts +63 -0
- package/src/vault/frontmatter.ts +264 -0
- package/src/vault/index.ts +318 -0
- package/src/vault/paths.ts +106 -0
- package/src/vault/query.ts +422 -0
- package/src/vault/reader.ts +264 -0
- package/src/vault/templates.ts +186 -0
- package/src/vault/types.ts +73 -0
- package/src/vault/watcher.ts +277 -0
- package/src/vault/writer.ts +393 -0
- package/tsconfig.json +30 -0
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* BM25 Sparse Search Engine
|
|
3
|
+
* Uses MiniSearch for fast keyword-based retrieval
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import type { Logger } from 'pino'
|
|
7
|
+
import MiniSearch from 'minisearch'
|
|
8
|
+
import { tokenize, tokenizeForSearch, type TokenizerOptions } from './tokenizer'
|
|
9
|
+
|
|
10
|
+
/**
 * A document that can be added to the BM25 index.
 */
export interface BM25Document {
  /** Unique identifier of the document. */
  id: string
  /** Text body of the document. */
  content: string
  /** Free-form metadata carried alongside the document. */
  metadata: Record<string, unknown>
  /** Name of the collection the document belongs to. */
  collection: string
}
|
|
16
|
+
|
|
17
|
+
/**
 * A single hit returned from a BM25 search.
 */
export interface BM25SearchResult {
  /** Identifier of the matched document. */
  id: string
  /** Stored text of the matched document. */
  content: string
  /** Stored metadata of the matched document. */
  metadata: Record<string, unknown>
  /** Collection the matched document belongs to. */
  collection: string
  /** Relevance score of the hit. */
  score: number
}
|
|
24
|
+
|
|
25
|
+
/**
 * Configuration for the BM25 search engine.
 */
export interface BM25Config {
  /** Names of the document fields that are indexed for search. */
  fields: string[]
  /** Names of the document fields stored with the index and returned on hits. */
  storeFields: string[]
  /** BM25 k1 parameter (term frequency saturation). */
  k1?: number
  /** BM25 b parameter (document length normalization). */
  b?: number
  /** Boost for exact matches. */
  boostExact?: number
}
|
|
37
|
+
|
|
38
|
+
/**
 * Baseline BM25 settings; caller-supplied config is merged over these.
 */
const DEFAULT_CONFIG: BM25Config = {
  // Only the text body is searchable.
  fields: ['content'],
  // Everything needed to rebuild a search result is stored with the index.
  storeFields: ['content', 'metadata', 'collection'],
  k1: 1.2,
  b: 0.75,
  boostExact: 2.0
}
|
|
45
|
+
|
|
46
|
+
/**
 * Keyword (sparse) search engine backed by an in-memory MiniSearch index.
 *
 * Documents are tokenized with the project tokenizer (stopword removal,
 * stemming, camelCase splitting) both at index time and at query time.
 */
export class BM25Engine {
  private logger: Logger
  private config: BM25Config
  private index: MiniSearch<BM25Document>
  private documentCount: number = 0
  private tokenizerOptions: TokenizerOptions

  /**
   * @param logger Parent logger; a child logger tagged `bm25-engine` is derived from it.
   * @param config Partial configuration merged over the defaults.
   */
  constructor(logger: Logger, config: Partial<BM25Config> = {}) {
    this.logger = logger.child({ component: 'bm25-engine' })
    this.config = { ...DEFAULT_CONFIG, ...config }
    this.tokenizerOptions = {
      minLength: 2,
      removeStopwords: true,
      stemming: true,
      splitCamelCase: true
    }

    // Must come last: the index options close over tokenizerOptions.
    this.index = this.createIndex()
  }

  /**
   * Options shared by every MiniSearch instance this engine creates or loads.
   * Keeping them in one place guarantees that a freshly built index and an
   * imported one tokenize documents and queries the same way.
   */
  private getIndexOptions() {
    return {
      fields: this.config.fields,
      storeFields: this.config.storeFields,
      idField: 'id',
      tokenize: (text: string) => tokenize(text, this.tokenizerOptions),
      processTerm: (term: string) => term.toLowerCase()
    }
  }

  /**
   * Create a new, empty MiniSearch index.
   */
  private createIndex(): MiniSearch<BM25Document> {
    return new MiniSearch<BM25Document>(this.getIndexOptions())
  }

  /**
   * Re-read the document count from the index so the cached counter cannot
   * drift from what the index actually holds (e.g. after a partial failure).
   */
  private syncDocumentCount(): void {
    this.documentCount = this.index.documentCount
  }

  /**
   * Build the index from scratch, discarding whatever was indexed before.
   *
   * @param documents Documents to index; they are added in batches of 1000.
   * @throws Rethrows any error raised while adding a batch.
   */
  async buildIndex(documents: BM25Document[]): Promise<void> {
    this.logger.info({ count: documents.length }, 'Building BM25 index')

    // Create fresh index
    this.index = this.createIndex()
    this.documentCount = 0

    // Add documents in batches
    const batchSize = 1000
    for (let i = 0; i < documents.length; i += batchSize) {
      const batch = documents.slice(i, i + batchSize)
      await this.addDocuments(batch)
    }

    this.logger.info({
      indexed: this.documentCount,
      terms: this.index.termCount
    }, 'BM25 index built')
  }

  /**
   * Add several documents to the index.
   *
   * @throws Rethrows the underlying error after logging it.
   */
  async addDocuments(documents: BM25Document[]): Promise<void> {
    try {
      this.index.addAll(documents)
    } catch (error) {
      this.logger.error({ error }, 'Failed to add documents to BM25 index')
      throw error
    } finally {
      // addAll may have indexed part of the batch before failing.
      this.syncDocumentCount()
    }
  }

  /**
   * Add a single document to the index.
   *
   * @throws Rethrows the underlying error after logging it.
   */
  addDocument(document: BM25Document): void {
    try {
      this.index.add(document)
    } catch (error) {
      this.logger.error({ error, id: document.id }, 'Failed to add document to BM25 index')
      throw error
    } finally {
      this.syncDocumentCount()
    }
  }

  /**
   * Remove a document from the index. Failures are logged, never thrown,
   * because the document might simply not be indexed.
   */
  removeDocument(document: BM25Document): void {
    try {
      this.index.remove(document)
    } catch (error) {
      this.logger.error({ error, id: document.id }, 'Failed to remove document from BM25 index')
      // Don't throw - document might not exist
    } finally {
      this.syncDocumentCount()
    }
  }

  /**
   * Replace a document in the index by removing and re-adding it.
   *
   * NOTE(review): the new version of the document is passed to the removal
   * step; confirm against the MiniSearch `remove` documentation whether the
   * originally indexed version is required there.
   */
  updateDocument(document: BM25Document): void {
    this.removeDocument(document)
    this.addDocument(document)
  }

  /**
   * Search the index.
   *
   * @param query Free-text query; a blank query yields no results.
   * @param options.limit Maximum number of results (default 20), applied after filtering.
   * @param options.filter Optional predicate applied to each candidate result.
   * @param options.collection When set, only results from this collection are kept.
   * @returns Results with scores scaled relative to the best remaining hit.
   *          Returns an empty list if the search fails; the error is logged.
   */
  search(
    query: string,
    options: {
      limit?: number
      filter?: (result: BM25SearchResult) => boolean
      collection?: string
    } = {}
  ): BM25SearchResult[] {
    const { limit = 20, filter, collection } = options

    if (!query.trim()) {
      return []
    }

    this.logger.debug({
      query: query.slice(0, 50),
      limit
    }, 'BM25 search')

    try {
      // The raw query is handed to MiniSearch, which receives the same
      // tokenize/processTerm options that were supplied for indexing.
      const results = this.index.search(query, {
        prefix: true, // Allow prefix matching
        fuzzy: 0.2, // Allow minor typos
        combineWith: 'OR', // Match any term
        boost: {
          content: this.config.boostExact ?? 1
        }
      })

      // Transform and filter results
      let searchResults: BM25SearchResult[] = results.map(result => ({
        id: result.id,
        content: (result as any).content || '',
        metadata: (result as any).metadata || {},
        collection: (result as any).collection || '',
        score: result.score
      }))

      // Filter by collection if specified
      if (collection) {
        searchResults = searchResults.filter(r => r.collection === collection)
      }

      // Apply custom filter
      if (filter) {
        searchResults = searchResults.filter(filter)
      }

      // Normalize scores to 0-1 range
      searchResults = this.normalizeScores(searchResults)

      // Apply limit
      return searchResults.slice(0, limit)
    } catch (error) {
      this.logger.error({ error, query }, 'BM25 search failed')
      return []
    }
  }

  /**
   * Scale scores by the highest score in the list so the best hit is 1.0.
   * A single result always scores 1.0; if the highest score is 0, every
   * result scores 0.
   */
  private normalizeScores(results: BM25SearchResult[]): BM25SearchResult[] {
    if (results.length === 0) return []
    if (results.length === 1) {
      return [{ ...results[0], score: 1.0 }]
    }

    // A plain loop instead of Math.max(...spread): spreading a very large
    // array into call arguments can exceed the engine's argument limit.
    let maxScore = -Infinity
    for (const r of results) {
      if (r.score > maxScore) maxScore = r.score
    }

    if (maxScore === 0) {
      return results.map(r => ({ ...r, score: 0 }))
    }

    return results.map(r => ({
      ...r,
      score: r.score / maxScore
    }))
  }

  /**
   * Get suggestions for autocomplete.
   *
   * @returns Up to `limit` suggestion strings; empty if the lookup fails.
   */
  suggest(query: string, limit: number = 5): string[] {
    try {
      const results = this.index.autoSuggest(query, { limit })
      return results.map(r => r.suggestion)
    } catch (error) {
      this.logger.error({ error, query }, 'BM25 suggest failed')
      return []
    }
  }

  /**
   * Get index statistics: the number of documents and of distinct terms.
   */
  getStats(): {
    documentCount: number
    termCount: number
  } {
    return {
      documentCount: this.documentCount,
      termCount: this.index.termCount
    }
  }

  /**
   * Drop every document by replacing the index with an empty one.
   */
  clear(): void {
    this.index = this.createIndex()
    this.documentCount = 0
    this.logger.info('BM25 index cleared')
  }

  /**
   * Serialize the index to a JSON string for persistence.
   */
  exportIndex(): string {
    return JSON.stringify(this.index.toJSON())
  }

  /**
   * Restore the index from a string produced by {@link exportIndex}.
   *
   * The index is loaded with the same options used to build it (including
   * the custom tokenizer and term processor); otherwise queries against the
   * restored index would be tokenized differently from the indexed terms.
   *
   * @throws Rethrows parse/load errors after logging them.
   */
  importIndex(data: string): void {
    try {
      this.index = MiniSearch.loadJSON(data, this.getIndexOptions())
      // Count documents after import
      this.syncDocumentCount()
      this.logger.info({ documentCount: this.documentCount }, 'BM25 index imported')
    } catch (error) {
      this.logger.error({ error }, 'Failed to import BM25 index')
      throw error
    }
  }
}
|
|
298
|
+
|
|
299
|
+
export { tokenize, tokenizeForSearch, getNGrams } from './tokenizer'
|
|
300
|
+
export type { TokenizerOptions } from './tokenizer'
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Text Tokenizer for BM25
|
|
3
|
+
* Handles text preprocessing for sparse search
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
// Common English stopwords to filter.
// Written as space-separated rows and split once at module load.
const STOPWORDS = new Set(
  [
    'a an the and or but in on at to for',
    'of with by from as is was are were been',
    'be have has had do does did will would could',
    'should may might must shall can need dare ought',
    'used it its this that these those i me my',
    'myself we our ours ourselves you your yours',
    'yourself yourselves he him his himself she her',
    'hers herself they them their theirs themselves',
    'what which who whom when where why how all',
    'each every both few more most other some such',
    'no nor not only own same so than too very',
    's t just don now then here there also'
  ]
    .join(' ')
    .split(' ')
)
|
|
21
|
+
|
|
22
|
+
// Common programming terms to keep.
// Written as space-separated rows and split once at module load.
const KEEP_TERMS = new Set(
  [
    'api sql css html json xml http https url uri',
    'jwt oauth rest graphql grpc tcp udp ip dns',
    'aws gcp azure docker kubernetes k8s npm yarn pnpm',
    'git github gitlab ci cd devops mlops db orm',
    'ui ux cli gui ide sdk mcp llm ai ml'
  ]
    .join(' ')
    .split(' ')
)
|
|
30
|
+
|
|
31
|
+
/**
 * Switches controlling how text is broken into tokens.
 * Every field is optional.
 */
export interface TokenizerOptions {
  /** Tokens shorter than this are dropped. */
  minLength?: number
  /** Tokens longer than this are dropped. */
  maxLength?: number
  /** Drop common English stopwords. */
  removeStopwords?: boolean
  /** Lowercase the text before tokenizing. */
  lowercase?: boolean
  /** Apply basic suffix stemming. */
  stemming?: boolean
  /** Insert a break at lower-to-upper camelCase boundaries. */
  splitCamelCase?: boolean
}
|
|
45
|
+
|
|
46
|
+
/**
 * Tokenizer defaults; caller-supplied options are merged over these.
 */
const DEFAULT_OPTIONS: TokenizerOptions = {
  // Length window for tokens that are kept.
  minLength: 2,
  maxLength: 50,
  // Normalization steps, all enabled by default.
  removeStopwords: true,
  lowercase: true,
  stemming: true,
  splitCamelCase: true
}
|
|
54
|
+
|
|
55
|
+
/**
 * Tokenize text for BM25 indexing and search.
 *
 * Pipeline: optional camelCase splitting, optional lowercasing, splitting
 * into word / number / hyphen-or-underscore compound tokens, then per token:
 * stopword removal, stemming (known programming terms are exempt), length
 * filtering, and removal of purely numeric tokens.
 *
 * @param text Text to tokenize; empty or non-string input yields `[]`.
 * @param options Overrides merged over the default tokenizer options.
 * @returns Tokens in order of appearance; duplicates are kept.
 */
export function tokenize(text: string, options: TokenizerOptions = {}): string[] {
  const opts = { ...DEFAULT_OPTIONS, ...options }

  if (!text || typeof text !== 'string') {
    return []
  }

  // An explicitly undefined bound disables that side of the length filter.
  const minLength = opts.minLength ?? 0
  const maxLength = opts.maxLength ?? Infinity

  // Split on camelCase if enabled (must happen before lowercasing)
  let processedText = text
  if (opts.splitCamelCase) {
    processedText = processedText.replace(/([a-z])([A-Z])/g, '$1 $2')
  }

  // Lowercase if enabled
  if (opts.lowercase) {
    processedText = processedText.toLowerCase()
  }

  // Match words, numbers, and hyphenated compounds
  const rawTokens = processedText.match(/[\w]+(?:[-_][\w]+)*/g) ?? []

  const tokens: string[] = []
  for (const raw of rawTokens) {
    const lowered = raw.toLowerCase()

    // Check stopwords on the unstemmed word first: stemming would otherwise
    // turn e.g. "this" into "thi" and "does" into "doe", which no longer
    // match the stopword list and would leak into the index.
    if (opts.removeStopwords && STOPWORDS.has(lowered)) {
      continue
    }

    // Keep programming terms intact; stem everything else if enabled
    const token = !KEEP_TERMS.has(lowered) && opts.stemming ? basicStem(raw) : raw

    // Length filter
    if (token.length < minLength || token.length > maxLength) {
      continue
    }

    // A stem can itself be a stopword (e.g. "needs" -> "need")
    if (opts.removeStopwords && STOPWORDS.has(token.toLowerCase())) {
      continue
    }

    // Filter pure numbers (but keep alphanumeric)
    if (/^\d+$/.test(token)) {
      continue
    }

    tokens.push(token)
  }

  return tokens
}
|
|
117
|
+
|
|
118
|
+
/**
 * Suffix rules for basicStem, tried in order; the first rule that matches
 * and leaves at least 3 characters wins. Built once rather than per call.
 */
const STEM_SUFFIX_RULES: ReadonlyArray<readonly [RegExp, string]> = [
  [/ies$/, 'y'],
  [/ied$/, 'y'],
  // "-es" plural only after a sibilant ending (classes, boxes, matches);
  // a blanket "es" rule would turn "files" into "fil" while "file" stays "file".
  [/(ss|sh|ch|x|z)es$/, '$1'],
  // Plain plural "s", but never the second "s" of "ss" (class, process).
  [/([^s])s$/, '$1'],
  [/ing$/, ''],
  [/ed$/, ''],
  [/tion$/, 't'],
  [/ness$/, ''],
  [/ment$/, ''],
  [/able$/, ''],
  [/ible$/, ''],
  [/ful$/, ''],
  [/less$/, ''],
  [/ly$/, '']
]

/**
 * Basic Porter-like stemming.
 * Simplified for performance - handles common English suffixes.
 *
 * Words shorter than 4 characters are returned unchanged, and a rule is
 * skipped when applying it would leave fewer than 3 characters.
 *
 * @param word Word to stem.
 * @returns The stemmed word, or `word` itself when no rule applies.
 */
function basicStem(word: string): string {
  if (word.length < 4) return word

  for (const [pattern, replacement] of STEM_SUFFIX_RULES) {
    if (!pattern.test(word)) continue

    const candidate = word.replace(pattern, replacement)
    // Only apply if result is at least 3 chars
    if (candidate.length >= 3) {
      return candidate
    }
  }

  return word
}
|
|
157
|
+
|
|
158
|
+
/**
 * Get n-grams from tokens.
 *
 * @param tokens Source tokens, in order.
 * @param n Size of each n-gram (default 2).
 * @returns Every run of `n` consecutive tokens joined by a single space.
 *          Empty when `n` is less than 1 or there are fewer than `n` tokens.
 */
export function getNGrams(tokens: string[], n: number = 2): string[] {
  // n < 1 has no meaningful n-grams; without this guard the loop below
  // would emit a list of empty strings.
  if (n < 1 || tokens.length < n) return []

  const ngrams: string[] = []
  for (let i = 0; i <= tokens.length - n; i++) {
    ngrams.push(tokens.slice(i, i + n).join(' '))
  }

  return ngrams
}
|
|
171
|
+
|
|
172
|
+
/**
 * Tokenizer for search that returns single tokens plus adjacent token pairs.
 *
 * @param text Text to tokenize.
 * @param options Tokenizer options forwarded unchanged to `tokenize`.
 * @returns De-duplicated tokens in first-seen order: all unigrams first,
 *          then the bigrams formed from consecutive unigrams.
 */
export function tokenizeForSearch(
  text: string,
  options: TokenizerOptions = {}
): string[] {
  const words = tokenize(text, options)

  // A Set keeps insertion order, so unigrams stay ahead of bigrams.
  const unique = new Set<string>(words)
  for (const pair of getNGrams(words, 2)) {
    unique.add(pair)
  }

  return Array.from(unique)
}
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Adaptive Learning
|
|
3
|
+
* Learns optimal retrieval thresholds from feedback
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import type { Logger } from 'pino'
|
|
7
|
+
import type { MemoryFeedback, AdaptiveThresholds, RetrievalMetrics } from '../types'
|
|
8
|
+
import type { FeedbackStore } from './store'
|
|
9
|
+
import { calculateAllMetrics, calculateAverageRating, calculatePositiveRate } from './metrics'
|
|
10
|
+
|
|
11
|
+
/**
 * Default thresholds before any learning.
 * Note that `lastUpdated` is evaluated once, when this module is loaded.
 */
const DEFAULT_THRESHOLDS: AdaptiveThresholds = {
  denseMinSimilarity: 0.3,
  // Relative weights of dense and sparse retrieval.
  denseWeight: 0.7,
  sparseWeight: 0.3,
  rrfK: 60,
  // Bookkeeping fields.
  feedbackCount: 0,
  lastUpdated: new Date().toISOString()
}
|
|
20
|
+
|
|
21
|
+
/** Threshold bounds to prevent extreme values (inclusive min/max per threshold). */
const BOUNDS = {
  denseMinSimilarity: {
    min: 0.1,
    max: 0.9
  },
  denseWeight: {
    min: 0.3,
    max: 0.9
  },
  sparseWeight: {
    min: 0.1,
    max: 0.7
  },
  rrfK: {
    min: 20,
    max: 100
  }
}
|
|
28
|
+
|
|
29
|
+
/**
 * Tuning knobs for the adaptive learner.
 */
export interface AdaptiveLearnerConfig {
  /** Number of feedback entries required before any adaptation happens. */
  minFeedbackForAdaptation: number
  /** Step size used when nudging a threshold up or down. */
  learningRate: number
  /** Positive feedback rate the learner aims for. */
  targetPositiveRate: number
}
|
|
37
|
+
|
|
38
|
+
/**
 * Learner defaults; caller-supplied config is merged over these.
 */
const DEFAULT_CONFIG: AdaptiveLearnerConfig = {
  minFeedbackForAdaptation: 10,
  learningRate: 0.1,
  targetPositiveRate: 0.7
}
|
|
43
|
+
|
|
44
|
+
/**
 * Adjusts retrieval thresholds (dense/sparse weights, minimum dense
 * similarity, RRF k) from recent user feedback, within fixed bounds.
 *
 * Every accessor hands out a copy of the thresholds, so callers cannot
 * mutate the learner's internal state through a returned object.
 */
export class AdaptiveLearner {
  private logger: Logger
  private feedbackStore: FeedbackStore
  private config: AdaptiveLearnerConfig
  private currentThresholds: AdaptiveThresholds

  /**
   * @param logger Parent logger; a child logger tagged `adaptive-learner` is derived from it.
   * @param feedbackStore Source of recent feedback entries.
   * @param config Partial configuration merged over the defaults.
   */
  constructor(
    logger: Logger,
    feedbackStore: FeedbackStore,
    config: Partial<AdaptiveLearnerConfig> = {}
  ) {
    this.logger = logger.child({ component: 'adaptive-learner' })
    this.feedbackStore = feedbackStore
    this.config = { ...DEFAULT_CONFIG, ...config }
    this.currentThresholds = { ...DEFAULT_THRESHOLDS }
  }

  /**
   * Get a copy of the current adaptive thresholds.
   */
  getThresholds(): AdaptiveThresholds {
    return { ...this.currentThresholds }
  }

  /**
   * Update thresholds based on the most recent feedback (up to 1000 entries).
   *
   * Nothing changes when fewer than `minFeedbackForAdaptation` entries are
   * available.
   *
   * @returns A copy of the thresholds after the (possible) adjustment.
   */
  async updateThresholds(): Promise<AdaptiveThresholds> {
    const feedback = await this.feedbackStore.getRecentFeedback(1000)

    if (feedback.length < this.config.minFeedbackForAdaptation) {
      this.logger.debug({
        feedbackCount: feedback.length,
        required: this.config.minFeedbackForAdaptation
      }, 'Not enough feedback for adaptation')
      return this.getThresholds()
    }

    this.logger.info({ feedbackCount: feedback.length }, 'Adapting thresholds')

    const metrics = calculateAllMetrics(feedback)
    const positiveRate = calculatePositiveRate(feedback)
    const avgRating = calculateAverageRating(feedback)

    // Analyze feedback by provenance
    const denseOnly = feedback.filter(f => this.inferProvenance(f) === 'dense')
    const sparseOnly = feedback.filter(f => this.inferProvenance(f) === 'sparse')
    const both = feedback.filter(f => this.inferProvenance(f) === 'both')

    // 0.5 is the neutral rate used when a bucket has no feedback.
    const densePositiveRate = denseOnly.length > 0 ? calculatePositiveRate(denseOnly) : 0.5
    const sparsePositiveRate = sparseOnly.length > 0 ? calculatePositiveRate(sparseOnly) : 0.5

    // Adjust dense/sparse weights based on performance.
    // NOTE(review): inferProvenance currently always answers 'both', so both
    // buckets stay empty and this branch cannot run until provenance is
    // actually recorded with feedback.
    if (denseOnly.length >= 5 && sparseOnly.length >= 5) {
      if (densePositiveRate > sparsePositiveRate + 0.1) {
        // Dense performs better: shift weight towards it
        this.currentThresholds.denseWeight = this.adjustValue(
          this.currentThresholds.denseWeight,
          this.config.learningRate,
          'increase',
          BOUNDS.denseWeight
        )
        this.currentThresholds.sparseWeight = this.adjustValue(
          this.currentThresholds.sparseWeight,
          this.config.learningRate,
          'decrease',
          BOUNDS.sparseWeight
        )
      } else if (sparsePositiveRate > densePositiveRate + 0.1) {
        // Sparse performs better: shift weight towards it
        this.currentThresholds.sparseWeight = this.adjustValue(
          this.currentThresholds.sparseWeight,
          this.config.learningRate,
          'increase',
          BOUNDS.sparseWeight
        )
        this.currentThresholds.denseWeight = this.adjustValue(
          this.currentThresholds.denseWeight,
          this.config.learningRate,
          'decrease',
          BOUNDS.denseWeight
        )
      }
    }

    // Adjust minimum similarity based on positive rate.
    // NOTE(review): a low positive rate LOWERS the similarity cut-off and a
    // high one RAISES it. Confirm this direction is intended: admitting
    // weaker matches when feedback is already poor may push the threshold
    // towards its bound rather than towards the target rate.
    if (positiveRate < this.config.targetPositiveRate - 0.1) {
      // Too many negative results - lower threshold to get more results
      this.currentThresholds.denseMinSimilarity = this.adjustValue(
        this.currentThresholds.denseMinSimilarity,
        this.config.learningRate,
        'decrease',
        BOUNDS.denseMinSimilarity
      )
    } else if (positiveRate > this.config.targetPositiveRate + 0.1) {
      // Results are good - can raise threshold to be more selective
      this.currentThresholds.denseMinSimilarity = this.adjustValue(
        this.currentThresholds.denseMinSimilarity,
        this.config.learningRate / 2, // More conservative increase
        'increase',
        BOUNDS.denseMinSimilarity
      )
    }

    // Adjust RRF K based on metrics
    // Higher K reduces effect of rank, lower K emphasizes top ranks
    if (metrics.mrr < 0.5 && both.length > 0) {
      // If MRR is low and we have combined results, try lower K
      this.currentThresholds.rrfK = this.adjustValue(
        this.currentThresholds.rrfK,
        5, // Adjust by fixed amount
        'decrease',
        BOUNDS.rrfK
      )
    }

    this.currentThresholds.feedbackCount = feedback.length
    this.currentThresholds.lastUpdated = new Date().toISOString()

    this.logger.info({
      thresholds: this.currentThresholds,
      metrics,
      positiveRate,
      avgRating
    }, 'Thresholds adapted')

    return this.getThresholds()
  }

  /**
   * Move a value up or down by `amount` and clamp it into `bounds`.
   */
  private adjustValue(
    current: number,
    amount: number,
    direction: 'increase' | 'decrease',
    bounds: { min: number; max: number }
  ): number {
    const delta = direction === 'increase' ? amount : -amount
    const newValue = current + delta
    return Math.max(bounds.min, Math.min(bounds.max, newValue))
  }

  /**
   * Infer provenance from feedback (placeholder).
   *
   * This is a stub: the feedback entry is not inspected and every entry is
   * treated as coming from combined search. In production, provenance should
   * be stored with the feedback.
   */
  private inferProvenance(feedback: MemoryFeedback): 'dense' | 'sparse' | 'both' {
    // For now, assume most results are from combined search
    return 'both'
  }

  /**
   * Get adaptation status.
   *
   * `feedbackCount` and `lastUpdated` come from the current thresholds, so
   * they reflect the last successful adaptation; `canAdapt` compares that
   * count against the configured minimum.
   */
  getStatus(): {
    feedbackCount: number
    canAdapt: boolean
    currentThresholds: AdaptiveThresholds
    lastUpdated: string
  } {
    const snapshot = this.getThresholds()
    return {
      feedbackCount: snapshot.feedbackCount,
      canAdapt: snapshot.feedbackCount >= this.config.minFeedbackForAdaptation,
      currentThresholds: snapshot,
      lastUpdated: snapshot.lastUpdated
    }
  }

  /**
   * Reset thresholds to defaults.
   */
  reset(): void {
    this.currentThresholds = { ...DEFAULT_THRESHOLDS }
    this.logger.info('Thresholds reset to defaults')
  }
}
|