@soulcraft/brainy 4.1.4 → 4.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +35 -0
- package/dist/import/FormatDetector.d.ts +6 -1
- package/dist/import/FormatDetector.js +40 -1
- package/dist/import/ImportCoordinator.d.ts +102 -4
- package/dist/import/ImportCoordinator.js +248 -6
- package/dist/import/InstancePool.d.ts +136 -0
- package/dist/import/InstancePool.js +231 -0
- package/dist/importers/SmartCSVImporter.d.ts +2 -1
- package/dist/importers/SmartCSVImporter.js +11 -22
- package/dist/importers/SmartDOCXImporter.d.ts +125 -0
- package/dist/importers/SmartDOCXImporter.js +227 -0
- package/dist/importers/SmartExcelImporter.d.ts +12 -1
- package/dist/importers/SmartExcelImporter.js +40 -25
- package/dist/importers/SmartJSONImporter.d.ts +1 -0
- package/dist/importers/SmartJSONImporter.js +25 -6
- package/dist/importers/SmartMarkdownImporter.d.ts +2 -1
- package/dist/importers/SmartMarkdownImporter.js +11 -16
- package/dist/importers/SmartPDFImporter.d.ts +2 -1
- package/dist/importers/SmartPDFImporter.js +11 -22
- package/dist/importers/SmartYAMLImporter.d.ts +121 -0
- package/dist/importers/SmartYAMLImporter.js +275 -0
- package/dist/importers/VFSStructureGenerator.js +12 -0
- package/dist/neural/SmartExtractor.d.ts +279 -0
- package/dist/neural/SmartExtractor.js +592 -0
- package/dist/neural/SmartRelationshipExtractor.d.ts +217 -0
- package/dist/neural/SmartRelationshipExtractor.js +396 -0
- package/dist/neural/embeddedTypeEmbeddings.d.ts +1 -1
- package/dist/neural/embeddedTypeEmbeddings.js +2 -2
- package/dist/neural/entityExtractor.d.ts +3 -0
- package/dist/neural/entityExtractor.js +34 -36
- package/dist/neural/presets.d.ts +189 -0
- package/dist/neural/presets.js +365 -0
- package/dist/neural/signals/ContextSignal.d.ts +166 -0
- package/dist/neural/signals/ContextSignal.js +646 -0
- package/dist/neural/signals/EmbeddingSignal.d.ts +175 -0
- package/dist/neural/signals/EmbeddingSignal.js +435 -0
- package/dist/neural/signals/ExactMatchSignal.d.ts +220 -0
- package/dist/neural/signals/ExactMatchSignal.js +542 -0
- package/dist/neural/signals/PatternSignal.d.ts +159 -0
- package/dist/neural/signals/PatternSignal.js +478 -0
- package/dist/neural/signals/VerbContextSignal.d.ts +102 -0
- package/dist/neural/signals/VerbContextSignal.js +390 -0
- package/dist/neural/signals/VerbEmbeddingSignal.d.ts +131 -0
- package/dist/neural/signals/VerbEmbeddingSignal.js +304 -0
- package/dist/neural/signals/VerbExactMatchSignal.d.ts +115 -0
- package/dist/neural/signals/VerbExactMatchSignal.js +335 -0
- package/dist/neural/signals/VerbPatternSignal.d.ts +104 -0
- package/dist/neural/signals/VerbPatternSignal.js +457 -0
- package/dist/types/graphTypes.d.ts +2 -0
- package/dist/utils/metadataIndex.d.ts +22 -0
- package/dist/utils/metadataIndex.js +76 -0
- package/package.json +4 -1
|
@@ -0,0 +1,592 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SmartExtractor - Unified entity type extraction using ensemble of neural signals
|
|
3
|
+
*
|
|
4
|
+
* PRODUCTION-READY: Single orchestration class for all entity type classification
|
|
5
|
+
*
|
|
6
|
+
* Design Philosophy:
|
|
7
|
+
* - Simplicity over complexity (KISS principle)
|
|
8
|
+
* - One class instead of multiple strategy layers
|
|
9
|
+
* - Clear execution path for debugging
|
|
10
|
+
* - Comprehensive format intelligence built-in
|
|
11
|
+
*
|
|
12
|
+
* Ensemble Architecture:
|
|
13
|
+
* - ExactMatchSignal (40%) - Explicit patterns and exact keywords
|
|
14
|
+
* - EmbeddingSignal (35%) - Neural similarity with type embeddings
|
|
15
|
+
* - PatternSignal (20%) - Regex patterns and naming conventions
|
|
16
|
+
* - ContextSignal (5%) - Relationship-based inference
|
|
17
|
+
*
|
|
18
|
+
* Format Intelligence:
|
|
19
|
+
* Supports 7 major formats with automatic hint extraction:
|
|
20
|
+
* - Excel (.xlsx): Column headers, sheet names, "Related Terms" detection
|
|
21
|
+
* - CSV (.csv): Header row patterns, naming conventions
|
|
22
|
+
* - PDF (.pdf): Form field names and labels
|
|
23
|
+
* - YAML (.yaml, .yml): Semantic key names
|
|
24
|
+
* - DOCX (.docx): Heading levels and structure
|
|
25
|
+
* - JSON (.json): Field name patterns
|
|
26
|
+
* - Markdown (.md): Heading hierarchy
|
|
27
|
+
*
|
|
28
|
+
* Performance:
|
|
29
|
+
* - Parallel signal execution (~15ms total)
|
|
30
|
+
* - LRU caching for hot entities
|
|
31
|
+
* - Confidence boosting when signals agree
|
|
32
|
+
* - Graceful degradation on errors
|
|
33
|
+
*/
|
|
34
|
+
import { ExactMatchSignal } from './signals/ExactMatchSignal.js';
|
|
35
|
+
import { PatternSignal } from './signals/PatternSignal.js';
|
|
36
|
+
import { EmbeddingSignal } from './signals/EmbeddingSignal.js';
|
|
37
|
+
import { ContextSignal } from './signals/ContextSignal.js';
|
|
38
|
+
/** Confidence floor handed to every individual signal; the caller's threshold is applied afterwards. */
const SIGNAL_MIN_CONFIDENCE = 0.50;
/** Score bonus added for each additional signal that votes for the same type. */
const AGREEMENT_BOOST_PER_SIGNAL = 0.05;
/** Keywords looked for (as substrings) inside a lower-cased Excel column header. */
const HEADER_TYPE_KEYWORDS = [
    'person', 'people', 'user', 'author', 'creator', 'employee', 'member',
    'organization', 'company', 'org', 'business',
    'location', 'place', 'city', 'country', 'address',
    'event', 'meeting', 'conference', 'workshop',
    'concept', 'idea', 'term', 'definition',
    'document', 'file', 'report', 'paper',
    'project', 'initiative', 'program',
    'product', 'service', 'offering',
    'date', 'time', 'timestamp', 'when'
];
/** Maps a result's `source` to the win counter it increments. */
const WIN_COUNTER_BY_SOURCE = new Map([
    ['ensemble', 'ensembleWins'],
    ['exact-match', 'exactMatchWins'],
    ['pattern', 'patternWins'],
    ['embedding', 'embeddingWins'],
    ['context', 'contextWins']
]);
/**
 * Build a zeroed statistics record.
 *
 * `classifications` counts the non-null, non-cached results that have been
 * folded into the two running averages; it is their denominator.
 */
function createEmptyStats() {
    return {
        calls: 0,
        cacheHits: 0,
        exactMatchWins: 0,
        patternWins: 0,
        embeddingWins: 0,
        contextWins: 0,
        ensembleWins: 0,
        agreementBoosts: 0,
        formatHintsUsed: 0,
        averageConfidence: 0,
        averageSignalsUsed: 0,
        classifications: 0
    };
}
/**
 * Split a camelCase / snake_case / kebab-case identifier into words.
 *
 * A space is inserted before every upper-case letter, so an acronym such as
 * "URL" is split letter by letter.
 */
function splitIdentifierWords(identifier) {
    return String(identifier)
        .replace(/([A-Z])/g, ' $1')
        .replace(/[_-]/g, ' ')
        .trim()
        .split(/\s+/);
}
/**
 * Normalise one signal's raw output (possibly null) into a weighted vote.
 * Missing or falsy fields collapse to `null` / `0` / `''`.
 */
function toSignalResult(signal, match, weight) {
    return {
        signal,
        type: match?.type || null,
        confidence: match?.confidence || 0,
        weight,
        evidence: match?.evidence || ''
    };
}
/**
 * SmartExtractor - Unified entity type classification
 *
 * Single entry point for entity type extraction. It runs the four signals
 * (exact match, pattern, embedding, context), enriches the context with
 * format-derived hints, and combines the votes either by weighted ensemble
 * or by picking the best single signal.
 *
 * Features:
 * - Signals are started together and awaited with Promise.all
 * - Format-specific hint extraction
 * - Ensemble voting with an agreement bonus
 * - Statistics for observability
 * - LRU result cache
 * - Errors degrade to a `null` result instead of propagating
 */
export class SmartExtractor {
    /**
     * @param brain Passed unchanged to each signal constructor
     * @param options Optional overrides: `minConfidence` (default 0.60),
     *   `enableFormatHints` (true), `enableEnsemble` (true), `cacheSize` (2000)
     *   and `weights` ({ exactMatch: 0.40, embedding: 0.35, pattern: 0.20, context: 0.05 })
     * @throws {Error} When the weights are not finite or do not sum to 1.0 (±0.01)
     */
    constructor(brain, options) {
        // LRU cache. A Map iterates in insertion order, so its first key is
        // always the least recently used entry.
        this.cache = new Map();
        this.stats = createEmptyStats();
        this.brain = brain;
        this.options = {
            minConfidence: options?.minConfidence ?? 0.60,
            enableFormatHints: options?.enableFormatHints ?? true,
            enableEnsemble: options?.enableEnsemble ?? true,
            cacheSize: options?.cacheSize ?? 2000,
            weights: {
                exactMatch: options?.weights?.exactMatch ?? 0.40,
                embedding: options?.weights?.embedding ?? 0.35,
                pattern: options?.weights?.pattern ?? 0.20,
                context: options?.weights?.context ?? 0.05
            }
        };
        // Validate weights sum to 1.0. The finite check also rejects NaN,
        // which would slip through a plain distance comparison.
        const weightSum = Object.values(this.options.weights).reduce((sum, weight) => sum + weight, 0);
        if (!Number.isFinite(weightSum) || Math.abs(weightSum - 1.0) > 0.01) {
            throw new Error(`Signal weights must sum to 1.0, got ${weightSum}`);
        }
        // Each signal gets a quarter of the configured cache budget and a lower
        // confidence floor; the final threshold is applied when combining.
        const signalCacheSize = Math.floor(this.options.cacheSize / 4);
        this.exactMatchSignal = new ExactMatchSignal(brain, {
            minConfidence: SIGNAL_MIN_CONFIDENCE,
            cacheSize: signalCacheSize
        });
        this.patternSignal = new PatternSignal(brain, {
            minConfidence: SIGNAL_MIN_CONFIDENCE,
            cacheSize: signalCacheSize
        });
        this.embeddingSignal = new EmbeddingSignal(brain, {
            minConfidence: SIGNAL_MIN_CONFIDENCE,
            checkGraph: true,
            checkHistory: true,
            cacheSize: signalCacheSize
        });
        this.contextSignal = new ContextSignal(brain, {
            minConfidence: SIGNAL_MIN_CONFIDENCE,
            cacheSize: signalCacheSize
        });
    }
    /**
     * Extract entity type using the configured signals.
     *
     * Main entry point: checks the cache, derives format hints, runs all
     * signals, combines their votes and caches the outcome (including null).
     *
     * @param candidate Entity text to classify
     * @param context Optional classification context (`definition`, `allTerms`,
     *   `metadata`, `formatContext`)
     * @returns The combined result, or null when nothing passes the threshold,
     *   the candidate is not a string, or an error occurs
     */
    async extract(candidate, context) {
        this.stats.calls++;
        if (typeof candidate !== 'string') {
            // Degrade gracefully instead of rejecting while building the cache key.
            console.warn(`SmartExtractor: expected a string candidate, got ${typeof candidate}`);
            return null;
        }
        const cacheKey = this.getCacheKey(candidate, context);
        const cached = this.getFromCache(cacheKey);
        if (cached !== undefined) {
            this.stats.cacheHits++;
            return cached;
        }
        try {
            const formatContext = context?.formatContext;
            const formatHints = this.options.enableFormatHints && formatContext
                ? this.extractFormatHints(formatContext)
                : [];
            if (formatHints.length > 0) {
                this.stats.formatHintsUsed++;
            }
            // Format hints ride along as extra terms for the signals.
            const enrichedContext = {
                definition: context?.definition,
                allTerms: [...(context?.allTerms || []), ...formatHints],
                metadata: context?.metadata
            };
            const [exactMatch, patternMatch, embeddingMatch, contextMatch] = await Promise.all([
                this.runSignal(this.exactMatchSignal, candidate, enrichedContext),
                this.runSignal(this.patternSignal, candidate, enrichedContext),
                this.runSignal(this.embeddingSignal, candidate, enrichedContext),
                this.runSignal(this.contextSignal, candidate, enrichedContext)
            ]);
            const { weights } = this.options;
            const signalResults = [
                toSignalResult('exact-match', exactMatch, weights.exactMatch),
                toSignalResult('pattern', patternMatch, weights.pattern),
                toSignalResult('embedding', embeddingMatch, weights.embedding),
                toSignalResult('context', contextMatch, weights.context)
            ];
            const result = this.options.enableEnsemble
                ? this.combineEnsemble(signalResults, formatHints, formatContext)
                : this.selectBestSignal(signalResults, formatHints, formatContext);
            // Cache result (including nulls to avoid recomputation)
            this.addToCache(cacheKey, result);
            if (result) {
                this.updateStatistics(result);
            }
            return result;
        }
        catch (error) {
            // Graceful degradation
            console.warn(`SmartExtractor error for "${candidate}":`, error);
            return null;
        }
    }
    /**
     * Run one signal, turning any failure into a null vote.
     *
     * Deliberately best-effort: a signal that rejects or throws synchronously
     * must not prevent the remaining signals from voting.
     */
    async runSignal(signal, candidate, context) {
        try {
            return await signal.classify(candidate, context);
        }
        catch {
            return null;
        }
    }
    /**
     * Extract format-specific hints from context.
     *
     * Dispatches on `formatContext.format`; unknown formats yield no hints.
     *
     * @returns Non-blank hint strings (duplicates are kept)
     */
    extractFormatHints(formatContext) {
        let hints;
        switch (formatContext.format) {
            case 'excel':
                hints = this.extractExcelHints(formatContext);
                break;
            case 'csv':
                hints = this.extractCsvHints(formatContext);
                break;
            case 'pdf':
                hints = this.extractPdfHints(formatContext);
                break;
            case 'yaml':
                hints = this.extractYamlHints(formatContext);
                break;
            case 'docx':
                hints = this.extractDocxHints(formatContext);
                break;
            case 'json':
                hints = this.extractJsonHints(formatContext);
                break;
            case 'markdown':
                hints = this.extractMarkdownHints(formatContext);
                break;
            default:
                hints = [];
        }
        return hints.filter(hint => typeof hint === 'string' && hint.trim().length > 0);
    }
    /**
     * Extract Excel-specific hints: the column header, any type keyword it
     * contains, and the sheet name.
     */
    extractExcelHints(context) {
        const hints = [];
        if (context.columnHeader) {
            // Headers can arrive as numbers (e.g. a year column); normalise first.
            const header = String(context.columnHeader);
            const headerLower = header.toLowerCase();
            hints.push(header, ...HEADER_TYPE_KEYWORDS.filter(keyword => headerLower.includes(keyword)));
        }
        if (context.sheetName) {
            hints.push(String(context.sheetName));
        }
        return hints;
    }
    /**
     * Extract CSV-specific hints: the column header plus its lower-cased
     * parts when it is written in snake_case or kebab-case.
     */
    extractCsvHints(context) {
        const hints = [];
        if (context.columnHeader) {
            const header = String(context.columnHeader);
            hints.push(header);
            const headerLower = header.toLowerCase();
            if (headerLower.includes('_') || headerLower.includes('-')) {
                hints.push(...headerLower.split(/[_-]/));
            }
        }
        return hints;
    }
    /**
     * Extract PDF-specific hints: the field name and its individual words.
     */
    extractPdfHints(context) {
        if (!context.fieldName) {
            return [];
        }
        return [String(context.fieldName), ...splitIdentifierWords(context.fieldName)];
    }
    /**
     * Extract YAML-specific hints: the key and its individual words.
     */
    extractYamlHints(context) {
        if (!context.yamlKey) {
            return [];
        }
        return [String(context.yamlKey), ...splitIdentifierWords(context.yamlKey)];
    }
    /**
     * Extract DOCX-specific hints from the heading level.
     *
     * Level 1 is treated as a major entity (organization, project) and
     * level 2 as a sub-entity (person, concept); other levels give no hints.
     */
    extractDocxHints(context) {
        if (context.headingLevel === 1) {
            return ['major entity', 'organization', 'project'];
        }
        if (context.headingLevel === 2) {
            return ['sub entity', 'person', 'concept'];
        }
        return [];
    }
    /**
     * Extract JSON-specific hints: the field name and its individual words.
     */
    extractJsonHints(context) {
        if (!context.fieldName) {
            return [];
        }
        return [String(context.fieldName), ...splitIdentifierWords(context.fieldName)];
    }
    /**
     * Extract Markdown-specific hints from the heading level (1 or 2 only).
     */
    extractMarkdownHints(context) {
        if (context.headingLevel === 1) {
            return ['major entity'];
        }
        if (context.headingLevel === 2) {
            return ['sub entity'];
        }
        return [];
    }
    /**
     * Combine signal results using ensemble voting.
     *
     * Each type's score is the sum of `confidence * weight` over the signals
     * voting for it, plus a fixed bonus per additional agreeing signal. The
     * highest score wins (first seen wins ties) if it reaches `minConfidence`.
     *
     * NOTE(review): scores are not normalised by the weights of the signals
     * that voted, so with the default weights and threshold a single signal
     * cannot pass on its own — confirm this is intended.
     *
     * @returns The ensemble result with confidence capped at 1.0, or null
     */
    combineEnsemble(signalResults, formatHints, formatContext) {
        const votes = signalResults.filter(result => result.type !== null);
        if (votes.length === 0) {
            return null;
        }
        // Group votes by type, accumulating the weighted confidence.
        const typeScores = new Map();
        for (const vote of votes) {
            const weighted = vote.confidence * vote.weight;
            const entry = typeScores.get(vote.type);
            if (entry) {
                entry.score += weighted;
                entry.signals.push(vote);
            }
            else {
                typeScores.set(vote.type, { score: weighted, signals: [vote] });
            }
        }
        let bestType = null;
        let bestScore = 0;
        let bestSignals = [];
        for (const [type, { score, signals }] of typeScores) {
            let finalScore = score;
            if (signals.length > 1) {
                finalScore += AGREEMENT_BOOST_PER_SIGNAL * (signals.length - 1);
                // Counted for every boosted type, not only the eventual winner.
                this.stats.agreementBoosts++;
            }
            if (finalScore > bestScore) {
                bestScore = finalScore;
                bestType = type;
                bestSignals = signals;
            }
        }
        if (!bestType || bestScore < this.options.minConfidence) {
            return null;
        }
        const signalNames = bestSignals.map(s => s.signal).join(' + ');
        const evidence = `Ensemble: ${signalNames} (${bestSignals.length} signal${bestSignals.length > 1 ? 's' : ''} agree)`;
        return {
            type: bestType,
            confidence: Math.min(bestScore, 1.0), // Cap at 1.0
            source: 'ensemble',
            evidence,
            metadata: {
                signalResults: bestSignals.map(s => ({
                    signal: s.signal,
                    type: s.type,
                    confidence: s.confidence,
                    weight: s.weight
                })),
                agreementBoost: bestSignals.length > 1
                    ? AGREEMENT_BOOST_PER_SIGNAL * (bestSignals.length - 1)
                    : 0,
                formatHints: formatHints.length > 0 ? formatHints : undefined,
                formatContext
            }
        };
    }
    /**
     * Select best single signal (when ensemble is disabled).
     *
     * Signals whose own confidence reaches `minConfidence` are ranked by
     * `confidence * weight`; the first signal with the highest rank wins.
     * The threshold is applied to the raw confidence (the value that is
     * reported), because a weighted score can never exceed the signal's
     * weight and would make the default threshold unreachable.
     *
     * @returns The winning signal's result, or null when none qualifies
     */
    selectBestSignal(signalResults, formatHints, formatContext) {
        let best = null;
        let bestWeighted = -Infinity;
        for (const candidate of signalResults) {
            if (candidate.type === null || candidate.confidence < this.options.minConfidence) {
                continue;
            }
            const weighted = candidate.confidence * candidate.weight;
            if (weighted > bestWeighted) {
                bestWeighted = weighted;
                best = candidate;
            }
        }
        if (best === null) {
            return null;
        }
        return {
            type: best.type,
            confidence: best.confidence,
            source: best.signal,
            evidence: best.evidence,
            metadata: {
                formatHints: formatHints.length > 0 ? formatHints : undefined,
                formatContext
            }
        };
    }
    /**
     * Update statistics based on a non-null result.
     *
     * Increments the win counter matching `result.source` and folds the
     * result into the running averages. The averages are taken over the
     * results actually recorded here, not over `stats.calls`, because cache
     * hits and null results never contribute a sample.
     */
    updateStatistics(result) {
        const counter = WIN_COUNTER_BY_SOURCE.get(result.source);
        if (counter !== undefined) {
            this.stats[counter]++;
        }
        const previous = this.stats.classifications;
        const total = previous + 1;
        // Ensemble results list their agreeing signals; a single-signal result used one.
        const signalsUsed = result.metadata?.signalResults?.length ?? 1;
        this.stats.averageConfidence =
            (this.stats.averageConfidence * previous + result.confidence) / total;
        this.stats.averageSignalsUsed =
            (this.stats.averageSignalsUsed * previous + signalsUsed) / total;
        this.stats.classifications = total;
    }
    /**
     * Get cache key from candidate and context.
     *
     * The key covers the normalised candidate, the first 50 characters of
     * the definition, and every format-context field that the hint
     * extractors read, so the same text under a different column or field is
     * not answered from another column's cached result.
     *
     * NOTE(review): `allTerms` and `metadata` are still not part of the key.
     */
    getCacheKey(candidate, context) {
        const asText = (part) => (part === undefined || part === null ? '' : String(part));
        const formatContext = context?.formatContext;
        const definition = context?.definition;
        // JSON-encoding the parts keeps the key unambiguous whatever characters they contain.
        return JSON.stringify([
            asText(candidate).toLowerCase().trim(),
            typeof definition === 'string' ? definition.substring(0, 50) : '',
            asText(formatContext?.format),
            asText(formatContext?.columnHeader),
            asText(formatContext?.sheetName),
            asText(formatContext?.fieldName),
            asText(formatContext?.yamlKey),
            asText(formatContext?.headingLevel)
        ]);
    }
    /**
     * Get from LRU cache.
     *
     * @returns `undefined` on a miss; otherwise the cached value, which may
     *   be `null` for a cached negative result
     */
    getFromCache(key) {
        if (!this.cache.has(key)) {
            return undefined;
        }
        const cached = this.cache.get(key);
        // Re-insert so the key becomes the most recently used entry.
        this.cache.delete(key);
        this.cache.set(key, cached);
        return cached ?? null;
    }
    /**
     * Add to LRU cache, evicting least recently used entries over the limit.
     */
    addToCache(key, value) {
        // Drop any existing entry first so re-adding refreshes its recency.
        this.cache.delete(key);
        this.cache.set(key, value);
        while (this.cache.size > this.options.cacheSize) {
            const oldest = this.cache.keys().next().value;
            this.cache.delete(oldest);
        }
    }
    /**
     * Get statistics: the raw counters, derived per-call rates, the current
     * cache size, and each signal's own statistics.
     */
    getStats() {
        const { calls } = this.stats;
        const perCall = (count) => (calls > 0 ? count / calls : 0);
        return {
            ...this.stats,
            cacheSize: this.cache.size,
            cacheHitRate: perCall(this.stats.cacheHits),
            ensembleRate: perCall(this.stats.ensembleWins),
            formatHintRate: perCall(this.stats.formatHintsUsed),
            signalStats: {
                exactMatch: this.exactMatchSignal.getStats(),
                pattern: this.patternSignal.getStats(),
                embedding: this.embeddingSignal.getStats(),
                context: this.contextSignal.getStats()
            }
        };
    }
    /**
     * Reset all statistics, including those of the four signals.
     */
    resetStats() {
        this.stats = createEmptyStats();
        this.exactMatchSignal.resetStats();
        this.patternSignal.resetStats();
        this.embeddingSignal.resetStats();
        this.contextSignal.resetStats();
    }
    /**
     * Clear all caches, including those of the four signals.
     */
    clearCache() {
        this.cache.clear();
        this.exactMatchSignal.clearCache();
        this.patternSignal.clearCache();
        this.embeddingSignal.clearCache();
        this.contextSignal.clearCache();
    }
    /**
     * Add entity to historical data; forwarded to the embedding signal.
     */
    addToHistory(text, type, vector) {
        this.embeddingSignal.addToHistory(text, type, vector);
    }
    /**
     * Clear historical data held by the embedding signal.
     */
    clearHistory() {
        this.embeddingSignal.clearHistory();
    }
}
|
|
584
|
+
/**
 * Convenience factory: construct a SmartExtractor.
 *
 * Both arguments are forwarded unchanged to the SmartExtractor constructor.
 *
 * @param brain First constructor argument
 * @param options Optional second constructor argument
 * @returns The newly constructed SmartExtractor
 */
export function createSmartExtractor(brain, options) {
    const extractor = new SmartExtractor(brain, options);
    return extractor;
}
|
|
592
|
+
//# sourceMappingURL=SmartExtractor.js.map
|