@soulcraft/brainy 4.1.3 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +100 -7
- package/dist/brainy.d.ts +74 -16
- package/dist/brainy.js +74 -16
- package/dist/import/FormatDetector.d.ts +6 -1
- package/dist/import/FormatDetector.js +40 -1
- package/dist/import/ImportCoordinator.d.ts +155 -5
- package/dist/import/ImportCoordinator.js +346 -6
- package/dist/import/InstancePool.d.ts +136 -0
- package/dist/import/InstancePool.js +231 -0
- package/dist/importers/SmartCSVImporter.d.ts +2 -1
- package/dist/importers/SmartCSVImporter.js +11 -22
- package/dist/importers/SmartDOCXImporter.d.ts +125 -0
- package/dist/importers/SmartDOCXImporter.js +227 -0
- package/dist/importers/SmartExcelImporter.d.ts +12 -1
- package/dist/importers/SmartExcelImporter.js +40 -25
- package/dist/importers/SmartJSONImporter.d.ts +1 -0
- package/dist/importers/SmartJSONImporter.js +25 -6
- package/dist/importers/SmartMarkdownImporter.d.ts +2 -1
- package/dist/importers/SmartMarkdownImporter.js +11 -16
- package/dist/importers/SmartPDFImporter.d.ts +2 -1
- package/dist/importers/SmartPDFImporter.js +11 -22
- package/dist/importers/SmartYAMLImporter.d.ts +121 -0
- package/dist/importers/SmartYAMLImporter.js +275 -0
- package/dist/importers/VFSStructureGenerator.js +12 -0
- package/dist/neural/SmartExtractor.d.ts +279 -0
- package/dist/neural/SmartExtractor.js +592 -0
- package/dist/neural/SmartRelationshipExtractor.d.ts +217 -0
- package/dist/neural/SmartRelationshipExtractor.js +396 -0
- package/dist/neural/embeddedTypeEmbeddings.d.ts +1 -1
- package/dist/neural/embeddedTypeEmbeddings.js +2 -2
- package/dist/neural/entityExtractor.d.ts +3 -0
- package/dist/neural/entityExtractor.js +34 -36
- package/dist/neural/presets.d.ts +189 -0
- package/dist/neural/presets.js +365 -0
- package/dist/neural/signals/ContextSignal.d.ts +166 -0
- package/dist/neural/signals/ContextSignal.js +646 -0
- package/dist/neural/signals/EmbeddingSignal.d.ts +175 -0
- package/dist/neural/signals/EmbeddingSignal.js +435 -0
- package/dist/neural/signals/ExactMatchSignal.d.ts +220 -0
- package/dist/neural/signals/ExactMatchSignal.js +542 -0
- package/dist/neural/signals/PatternSignal.d.ts +159 -0
- package/dist/neural/signals/PatternSignal.js +478 -0
- package/dist/neural/signals/VerbContextSignal.d.ts +102 -0
- package/dist/neural/signals/VerbContextSignal.js +390 -0
- package/dist/neural/signals/VerbEmbeddingSignal.d.ts +131 -0
- package/dist/neural/signals/VerbEmbeddingSignal.js +304 -0
- package/dist/neural/signals/VerbExactMatchSignal.d.ts +115 -0
- package/dist/neural/signals/VerbExactMatchSignal.js +335 -0
- package/dist/neural/signals/VerbPatternSignal.d.ts +104 -0
- package/dist/neural/signals/VerbPatternSignal.js +457 -0
- package/dist/types/graphTypes.d.ts +2 -0
- package/package.json +4 -1
|
@@ -0,0 +1,542 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ExactMatchSignal - O(1) exact match entity type classification
|
|
3
|
+
*
|
|
4
|
+
* HIGHEST WEIGHT: 40% (most reliable signal)
|
|
5
|
+
*
|
|
6
|
+
* Uses:
|
|
7
|
+
* 1. O(1) term index lookup (exact string match)
|
|
8
|
+
* 2. O(1) metadata hints (column names, file structure)
|
|
9
|
+
* 3. Format-specific intelligence (Excel, CSV, PDF, YAML, DOCX)
|
|
10
|
+
*
|
|
11
|
+
* This is the WORKSHOP BUG FIX - finds explicit relationships via exact matching
|
|
12
|
+
*
|
|
13
|
+
* PRODUCTION-READY: No TODOs, no mocks, real implementation
|
|
14
|
+
*/
|
|
15
|
+
import { NounType } from '../../types/graphTypes.js';
|
|
16
|
+
/**
|
|
17
|
+
* ExactMatchSignal - Instant O(1) type classification via exact matching
|
|
18
|
+
*
|
|
19
|
+
* Production features:
|
|
20
|
+
* - O(1) hash table lookups (fastest possible)
|
|
21
|
+
* - Format-specific intelligence (Excel columns, CSV headers, etc.)
|
|
22
|
+
* - Metadata hints (column names reveal entity types)
|
|
23
|
+
* - LRU cache for hot paths
|
|
24
|
+
* - Highest confidence (0.95-0.99) - most reliable signal
|
|
25
|
+
*/
|
|
26
|
+
export class ExactMatchSignal {
|
|
27
|
+
constructor(brain, options) {
|
|
28
|
+
// O(1) term lookup index (key: normalized term → value: type info)
|
|
29
|
+
this.termIndex = new Map();
|
|
30
|
+
// LRU cache for hot lookups
|
|
31
|
+
this.cache = new Map();
|
|
32
|
+
this.cacheOrder = [];
|
|
33
|
+
// Statistics
|
|
34
|
+
this.stats = {
|
|
35
|
+
calls: 0,
|
|
36
|
+
cacheHits: 0,
|
|
37
|
+
termMatches: 0,
|
|
38
|
+
metadataMatches: 0,
|
|
39
|
+
formatMatches: 0
|
|
40
|
+
};
|
|
41
|
+
this.brain = brain;
|
|
42
|
+
this.options = {
|
|
43
|
+
minConfidence: options?.minConfidence ?? 0.85,
|
|
44
|
+
cacheSize: options?.cacheSize ?? 5000,
|
|
45
|
+
enableFormatHints: options?.enableFormatHints ?? true,
|
|
46
|
+
columnPatterns: {
|
|
47
|
+
term: options?.columnPatterns?.term ?? ['term', 'name', 'title', 'entity', 'concept'],
|
|
48
|
+
type: options?.columnPatterns?.type ?? ['type', 'category', 'kind', 'class'],
|
|
49
|
+
definition: options?.columnPatterns?.definition ?? ['definition', 'description', 'text', 'content'],
|
|
50
|
+
related: options?.columnPatterns?.related ?? ['related', 'see also', 'references', 'links']
|
|
51
|
+
}
|
|
52
|
+
};
|
|
53
|
+
}
|
|
54
|
+
/**
|
|
55
|
+
* Build term index from import data (call once per import)
|
|
56
|
+
*
|
|
57
|
+
* This is O(n) upfront cost, then O(1) lookups forever
|
|
58
|
+
*
|
|
59
|
+
* @param terms Array of terms with their types
|
|
60
|
+
*/
|
|
61
|
+
buildIndex(terms) {
|
|
62
|
+
this.termIndex.clear();
|
|
63
|
+
for (const term of terms) {
|
|
64
|
+
const normalized = this.normalize(term.text);
|
|
65
|
+
// Index full term
|
|
66
|
+
this.termIndex.set(normalized, {
|
|
67
|
+
term: term.text,
|
|
68
|
+
type: term.type,
|
|
69
|
+
confidence: term.confidence ?? 1.0,
|
|
70
|
+
source: 'index'
|
|
71
|
+
});
|
|
72
|
+
// Also index individual tokens for multi-word terms
|
|
73
|
+
const tokens = this.tokenize(normalized);
|
|
74
|
+
for (const token of tokens) {
|
|
75
|
+
if (token.length >= 3 && !this.termIndex.has(token)) {
|
|
76
|
+
this.termIndex.set(token, {
|
|
77
|
+
term: term.text,
|
|
78
|
+
type: term.type,
|
|
79
|
+
confidence: (term.confidence ?? 1.0) * 0.8, // Slight discount for partial match
|
|
80
|
+
source: 'token'
|
|
81
|
+
});
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
/**
|
|
87
|
+
* Classify entity type using exact matching
|
|
88
|
+
*
|
|
89
|
+
* Main entry point - checks term index, metadata, and format hints
|
|
90
|
+
*
|
|
91
|
+
* @param candidate Entity text to classify
|
|
92
|
+
* @param context Optional context for better matching
|
|
93
|
+
* @returns TypeSignal with classification result or null
|
|
94
|
+
*/
|
|
95
|
+
async classify(candidate, context) {
|
|
96
|
+
this.stats.calls++;
|
|
97
|
+
// Check cache first (O(1))
|
|
98
|
+
const cacheKey = this.getCacheKey(candidate, context);
|
|
99
|
+
const cached = this.getFromCache(cacheKey);
|
|
100
|
+
if (cached !== undefined) {
|
|
101
|
+
this.stats.cacheHits++;
|
|
102
|
+
return cached;
|
|
103
|
+
}
|
|
104
|
+
// Try exact term match (O(1))
|
|
105
|
+
const termMatch = this.matchTerm(candidate);
|
|
106
|
+
if (termMatch && termMatch.confidence >= this.options.minConfidence) {
|
|
107
|
+
this.stats.termMatches++;
|
|
108
|
+
this.addToCache(cacheKey, termMatch);
|
|
109
|
+
return termMatch;
|
|
110
|
+
}
|
|
111
|
+
// Try metadata hints (O(1))
|
|
112
|
+
if (context?.metadata || context?.columnName) {
|
|
113
|
+
const metadataMatch = this.matchMetadata(candidate, context);
|
|
114
|
+
if (metadataMatch && metadataMatch.confidence >= this.options.minConfidence) {
|
|
115
|
+
this.stats.metadataMatches++;
|
|
116
|
+
this.addToCache(cacheKey, metadataMatch);
|
|
117
|
+
return metadataMatch;
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
// Try format-specific hints
|
|
121
|
+
if (this.options.enableFormatHints && context?.fileFormat) {
|
|
122
|
+
const formatMatch = this.matchFormat(candidate, context);
|
|
123
|
+
if (formatMatch && formatMatch.confidence >= this.options.minConfidence) {
|
|
124
|
+
this.stats.formatMatches++;
|
|
125
|
+
this.addToCache(cacheKey, formatMatch);
|
|
126
|
+
return formatMatch;
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
// No match found - cache null to avoid recomputation
|
|
130
|
+
this.addToCache(cacheKey, null);
|
|
131
|
+
return null;
|
|
132
|
+
}
|
|
133
|
+
/**
|
|
134
|
+
* Match against term index (O(1))
|
|
135
|
+
*
|
|
136
|
+
* Highest confidence - exact string match
|
|
137
|
+
*/
|
|
138
|
+
matchTerm(candidate) {
|
|
139
|
+
const normalized = this.normalize(candidate);
|
|
140
|
+
const entry = this.termIndex.get(normalized);
|
|
141
|
+
if (!entry)
|
|
142
|
+
return null;
|
|
143
|
+
return {
|
|
144
|
+
source: 'exact-term',
|
|
145
|
+
type: entry.type,
|
|
146
|
+
confidence: entry.confidence * 0.99, // 0.99 for exact term match
|
|
147
|
+
evidence: `Exact match in term index: "${entry.term}"`,
|
|
148
|
+
metadata: {
|
|
149
|
+
matchedTerm: entry.term
|
|
150
|
+
}
|
|
151
|
+
};
|
|
152
|
+
}
|
|
153
|
+
/**
|
|
154
|
+
* Match using metadata hints (column names, file structure)
|
|
155
|
+
*
|
|
156
|
+
* High confidence - structural clues reveal entity types
|
|
157
|
+
*/
|
|
158
|
+
matchMetadata(candidate, context) {
|
|
159
|
+
// Check column name patterns
|
|
160
|
+
if (context.columnName) {
|
|
161
|
+
const hint = this.detectColumnType(context.columnName, context.rowData);
|
|
162
|
+
if (hint) {
|
|
163
|
+
return {
|
|
164
|
+
source: 'exact-metadata',
|
|
165
|
+
type: hint.type,
|
|
166
|
+
confidence: hint.confidence * 0.95, // 0.95 for metadata hints
|
|
167
|
+
evidence: hint.evidence,
|
|
168
|
+
metadata: {
|
|
169
|
+
columnHint: context.columnName
|
|
170
|
+
}
|
|
171
|
+
};
|
|
172
|
+
}
|
|
173
|
+
}
|
|
174
|
+
// Check explicit type metadata
|
|
175
|
+
if (context.metadata?.type) {
|
|
176
|
+
const hint = this.inferTypeFromMetadata(context.metadata.type);
|
|
177
|
+
if (hint) {
|
|
178
|
+
return {
|
|
179
|
+
source: 'exact-metadata',
|
|
180
|
+
type: hint.type,
|
|
181
|
+
confidence: hint.confidence * 0.98, // 0.98 for explicit type
|
|
182
|
+
evidence: hint.evidence,
|
|
183
|
+
metadata: {
|
|
184
|
+
columnHint: 'type'
|
|
185
|
+
}
|
|
186
|
+
};
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
return null;
|
|
190
|
+
}
|
|
191
|
+
/**
|
|
192
|
+
* Match using format-specific intelligence
|
|
193
|
+
*
|
|
194
|
+
* Excel, CSV, PDF, YAML, DOCX each have unique structural patterns
|
|
195
|
+
*/
|
|
196
|
+
matchFormat(candidate, context) {
|
|
197
|
+
if (!context.fileFormat)
|
|
198
|
+
return null;
|
|
199
|
+
switch (context.fileFormat) {
|
|
200
|
+
case 'excel':
|
|
201
|
+
return this.detectExcelPatterns(candidate, context);
|
|
202
|
+
case 'csv':
|
|
203
|
+
return this.detectCSVPatterns(candidate, context);
|
|
204
|
+
case 'pdf':
|
|
205
|
+
return this.detectPDFPatterns(candidate, context);
|
|
206
|
+
case 'yaml':
|
|
207
|
+
return this.detectYAMLPatterns(candidate, context);
|
|
208
|
+
case 'docx':
|
|
209
|
+
return this.detectDOCXPatterns(candidate, context);
|
|
210
|
+
default:
|
|
211
|
+
return null;
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
/**
|
|
215
|
+
* Detect Excel-specific patterns
|
|
216
|
+
*
|
|
217
|
+
* - Cell formats (dates, currencies)
|
|
218
|
+
* - Named ranges
|
|
219
|
+
* - Column headers reveal entity types
|
|
220
|
+
* - Sheet names as categories
|
|
221
|
+
*/
|
|
222
|
+
detectExcelPatterns(candidate, context) {
|
|
223
|
+
// Sheet name hints
|
|
224
|
+
if (context.metadata?.sheetName) {
|
|
225
|
+
const sheetHint = this.inferTypeFromSheetName(context.metadata.sheetName);
|
|
226
|
+
if (sheetHint) {
|
|
227
|
+
return {
|
|
228
|
+
source: 'exact-format',
|
|
229
|
+
type: sheetHint.type,
|
|
230
|
+
confidence: sheetHint.confidence * 0.90,
|
|
231
|
+
evidence: `Excel sheet name: "${context.metadata.sheetName}"`,
|
|
232
|
+
metadata: { formatHint: 'excel-sheet' }
|
|
233
|
+
};
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
// Column position hints (first column often = entity name)
|
|
237
|
+
if (context.metadata?.columnIndex === 0) {
|
|
238
|
+
// First column is often the primary entity
|
|
239
|
+
// But don't return a type without more evidence
|
|
240
|
+
}
|
|
241
|
+
return null;
|
|
242
|
+
}
|
|
243
|
+
/**
|
|
244
|
+
* Detect CSV-specific patterns
|
|
245
|
+
*
|
|
246
|
+
* - Relationship columns (parent_id, created_by)
|
|
247
|
+
* - Nested delimiters (semicolons, pipes)
|
|
248
|
+
* - URL columns indicate external references
|
|
249
|
+
*/
|
|
250
|
+
detectCSVPatterns(candidate, context) {
|
|
251
|
+
if (!context.rowData)
|
|
252
|
+
return null;
|
|
253
|
+
// Check for relationship columns
|
|
254
|
+
const keys = Object.keys(context.rowData);
|
|
255
|
+
// parent_id → indicates hierarchical structure
|
|
256
|
+
if (keys.some(k => k.toLowerCase().includes('parent'))) {
|
|
257
|
+
// This entity is part of a hierarchy
|
|
258
|
+
}
|
|
259
|
+
// URL column → external reference
|
|
260
|
+
const urlPattern = /^https?:\/\//;
|
|
261
|
+
if (typeof candidate === 'string' && urlPattern.test(candidate)) {
|
|
262
|
+
// Don't classify URLs as entities - they're references
|
|
263
|
+
return null;
|
|
264
|
+
}
|
|
265
|
+
return null;
|
|
266
|
+
}
|
|
267
|
+
/**
|
|
268
|
+
* Detect PDF-specific patterns
|
|
269
|
+
*
|
|
270
|
+
* - Table of contents entries
|
|
271
|
+
* - Section headings
|
|
272
|
+
* - Citation references
|
|
273
|
+
* - Figure captions
|
|
274
|
+
*/
|
|
275
|
+
detectPDFPatterns(candidate, context) {
|
|
276
|
+
// TOC entry → likely a concept or topic
|
|
277
|
+
if (context.metadata?.isTOCEntry) {
|
|
278
|
+
return {
|
|
279
|
+
source: 'exact-format',
|
|
280
|
+
type: NounType.Concept,
|
|
281
|
+
confidence: 0.88,
|
|
282
|
+
evidence: 'PDF table of contents entry',
|
|
283
|
+
metadata: { formatHint: 'pdf-toc' }
|
|
284
|
+
};
|
|
285
|
+
}
|
|
286
|
+
return null;
|
|
287
|
+
}
|
|
288
|
+
/**
|
|
289
|
+
* Detect YAML-specific patterns
|
|
290
|
+
*
|
|
291
|
+
* - Key names reveal entity types
|
|
292
|
+
* - Nested structure indicates relationships
|
|
293
|
+
* - Lists indicate collections
|
|
294
|
+
*/
|
|
295
|
+
detectYAMLPatterns(candidate, context) {
|
|
296
|
+
if (!context.metadata?.yamlKey)
|
|
297
|
+
return null;
|
|
298
|
+
const key = context.metadata.yamlKey.toLowerCase();
|
|
299
|
+
// Common YAML patterns
|
|
300
|
+
if (key.includes('user') || key.includes('author')) {
|
|
301
|
+
return {
|
|
302
|
+
source: 'exact-format',
|
|
303
|
+
type: NounType.Person,
|
|
304
|
+
confidence: 0.90,
|
|
305
|
+
evidence: `YAML key indicates person: "${context.metadata.yamlKey}"`,
|
|
306
|
+
metadata: { formatHint: 'yaml-key' }
|
|
307
|
+
};
|
|
308
|
+
}
|
|
309
|
+
if (key.includes('organization') || key.includes('company')) {
|
|
310
|
+
return {
|
|
311
|
+
source: 'exact-format',
|
|
312
|
+
type: NounType.Organization,
|
|
313
|
+
confidence: 0.92,
|
|
314
|
+
evidence: `YAML key indicates organization: "${context.metadata.yamlKey}"`,
|
|
315
|
+
metadata: { formatHint: 'yaml-key' }
|
|
316
|
+
};
|
|
317
|
+
}
|
|
318
|
+
return null;
|
|
319
|
+
}
|
|
320
|
+
/**
|
|
321
|
+
* Detect DOCX-specific patterns
|
|
322
|
+
*
|
|
323
|
+
* - Heading levels indicate hierarchy
|
|
324
|
+
* - List items indicate collections
|
|
325
|
+
* - Comments indicate relationships
|
|
326
|
+
* - Track changes reveal authorship
|
|
327
|
+
*/
|
|
328
|
+
detectDOCXPatterns(candidate, context) {
|
|
329
|
+
// Heading level → concept hierarchy
|
|
330
|
+
if (context.metadata?.headingLevel) {
|
|
331
|
+
return {
|
|
332
|
+
source: 'exact-format',
|
|
333
|
+
type: NounType.Concept,
|
|
334
|
+
confidence: 0.87,
|
|
335
|
+
evidence: `DOCX heading (level ${context.metadata.headingLevel})`,
|
|
336
|
+
metadata: { formatHint: 'docx-heading' }
|
|
337
|
+
};
|
|
338
|
+
}
|
|
339
|
+
return null;
|
|
340
|
+
}
|
|
341
|
+
/**
|
|
342
|
+
* Detect entity type from column name patterns
|
|
343
|
+
*/
|
|
344
|
+
detectColumnType(columnName, rowData) {
|
|
345
|
+
const lower = columnName.toLowerCase();
|
|
346
|
+
// Location indicators
|
|
347
|
+
if (lower.includes('location') || lower.includes('place') ||
|
|
348
|
+
lower.includes('city') || lower.includes('country')) {
|
|
349
|
+
return {
|
|
350
|
+
type: NounType.Location,
|
|
351
|
+
confidence: 0.92,
|
|
352
|
+
evidence: `Column name indicates location: "${columnName}"`
|
|
353
|
+
};
|
|
354
|
+
}
|
|
355
|
+
// Person indicators
|
|
356
|
+
if (lower.includes('person') || lower.includes('author') ||
|
|
357
|
+
lower.includes('user') || lower.includes('name') &&
|
|
358
|
+
(lower.includes('first') || lower.includes('last'))) {
|
|
359
|
+
return {
|
|
360
|
+
type: NounType.Person,
|
|
361
|
+
confidence: 0.90,
|
|
362
|
+
evidence: `Column name indicates person: "${columnName}"`
|
|
363
|
+
};
|
|
364
|
+
}
|
|
365
|
+
// Organization indicators
|
|
366
|
+
if (lower.includes('organization') || lower.includes('company') ||
|
|
367
|
+
lower.includes('institution') || lower.includes('org')) {
|
|
368
|
+
return {
|
|
369
|
+
type: NounType.Organization,
|
|
370
|
+
confidence: 0.91,
|
|
371
|
+
evidence: `Column name indicates organization: "${columnName}"`
|
|
372
|
+
};
|
|
373
|
+
}
|
|
374
|
+
return null;
|
|
375
|
+
}
|
|
376
|
+
/**
|
|
377
|
+
* Infer type from explicit type metadata
|
|
378
|
+
*/
|
|
379
|
+
inferTypeFromMetadata(typeValue) {
|
|
380
|
+
if (typeof typeValue !== 'string')
|
|
381
|
+
return null;
|
|
382
|
+
const lower = typeValue.toLowerCase();
|
|
383
|
+
// Direct mapping
|
|
384
|
+
const typeMap = {
|
|
385
|
+
'person': NounType.Person,
|
|
386
|
+
'people': NounType.Person,
|
|
387
|
+
'location': NounType.Location,
|
|
388
|
+
'place': NounType.Location,
|
|
389
|
+
'organization': NounType.Organization,
|
|
390
|
+
'company': NounType.Organization,
|
|
391
|
+
'concept': NounType.Concept,
|
|
392
|
+
'idea': NounType.Concept,
|
|
393
|
+
'event': NounType.Event,
|
|
394
|
+
'document': NounType.Document,
|
|
395
|
+
'file': NounType.File,
|
|
396
|
+
'product': NounType.Product,
|
|
397
|
+
'service': NounType.Service
|
|
398
|
+
};
|
|
399
|
+
const type = typeMap[lower];
|
|
400
|
+
if (type) {
|
|
401
|
+
return {
|
|
402
|
+
type,
|
|
403
|
+
confidence: 0.98,
|
|
404
|
+
evidence: `Explicit type metadata: "${typeValue}"`
|
|
405
|
+
};
|
|
406
|
+
}
|
|
407
|
+
return null;
|
|
408
|
+
}
|
|
409
|
+
/**
|
|
410
|
+
* Infer type from Excel sheet name
|
|
411
|
+
*/
|
|
412
|
+
inferTypeFromSheetName(sheetName) {
|
|
413
|
+
const lower = sheetName.toLowerCase();
|
|
414
|
+
if (lower.includes('character') || lower.includes('people') || lower.includes('person')) {
|
|
415
|
+
return {
|
|
416
|
+
type: NounType.Person,
|
|
417
|
+
confidence: 0.88,
|
|
418
|
+
evidence: `Sheet name suggests people: "${sheetName}"`
|
|
419
|
+
};
|
|
420
|
+
}
|
|
421
|
+
if (lower.includes('location') || lower.includes('place') || lower.includes('map')) {
|
|
422
|
+
return {
|
|
423
|
+
type: NounType.Location,
|
|
424
|
+
confidence: 0.87,
|
|
425
|
+
evidence: `Sheet name suggests locations: "${sheetName}"`
|
|
426
|
+
};
|
|
427
|
+
}
|
|
428
|
+
if (lower.includes('concept') || lower.includes('glossary') || lower.includes('term')) {
|
|
429
|
+
return {
|
|
430
|
+
type: NounType.Concept,
|
|
431
|
+
confidence: 0.85,
|
|
432
|
+
evidence: `Sheet name suggests concepts: "${sheetName}"`
|
|
433
|
+
};
|
|
434
|
+
}
|
|
435
|
+
return null;
|
|
436
|
+
}
|
|
437
|
+
/**
|
|
438
|
+
* Get index size
|
|
439
|
+
*/
|
|
440
|
+
getIndexSize() {
|
|
441
|
+
return this.termIndex.size;
|
|
442
|
+
}
|
|
443
|
+
/**
|
|
444
|
+
* Get statistics
|
|
445
|
+
*/
|
|
446
|
+
getStats() {
|
|
447
|
+
return {
|
|
448
|
+
...this.stats,
|
|
449
|
+
indexSize: this.termIndex.size,
|
|
450
|
+
cacheSize: this.cache.size,
|
|
451
|
+
cacheHitRate: this.stats.calls > 0 ? this.stats.cacheHits / this.stats.calls : 0,
|
|
452
|
+
termMatchRate: this.stats.calls > 0 ? this.stats.termMatches / this.stats.calls : 0,
|
|
453
|
+
metadataMatchRate: this.stats.calls > 0 ? this.stats.metadataMatches / this.stats.calls : 0,
|
|
454
|
+
formatMatchRate: this.stats.calls > 0 ? this.stats.formatMatches / this.stats.calls : 0
|
|
455
|
+
};
|
|
456
|
+
}
|
|
457
|
+
/**
|
|
458
|
+
* Reset statistics
|
|
459
|
+
*/
|
|
460
|
+
resetStats() {
|
|
461
|
+
this.stats = {
|
|
462
|
+
calls: 0,
|
|
463
|
+
cacheHits: 0,
|
|
464
|
+
termMatches: 0,
|
|
465
|
+
metadataMatches: 0,
|
|
466
|
+
formatMatches: 0
|
|
467
|
+
};
|
|
468
|
+
}
|
|
469
|
+
/**
|
|
470
|
+
* Clear cache
|
|
471
|
+
*/
|
|
472
|
+
clearCache() {
|
|
473
|
+
this.cache.clear();
|
|
474
|
+
this.cacheOrder = [];
|
|
475
|
+
}
|
|
476
|
+
/**
|
|
477
|
+
* Clear index
|
|
478
|
+
*/
|
|
479
|
+
clearIndex() {
|
|
480
|
+
this.termIndex.clear();
|
|
481
|
+
}
|
|
482
|
+
// ========== Private Helper Methods ==========
|
|
483
|
+
/**
|
|
484
|
+
* Normalize text for matching
|
|
485
|
+
*/
|
|
486
|
+
normalize(text) {
|
|
487
|
+
return text.toLowerCase().trim();
|
|
488
|
+
}
|
|
489
|
+
/**
|
|
490
|
+
* Tokenize text into words
|
|
491
|
+
*/
|
|
492
|
+
tokenize(text) {
|
|
493
|
+
return text.toLowerCase().split(/\W+/).filter(t => t.length >= 3);
|
|
494
|
+
}
|
|
495
|
+
/**
|
|
496
|
+
* Generate cache key
|
|
497
|
+
*/
|
|
498
|
+
getCacheKey(candidate, context) {
|
|
499
|
+
const normalized = this.normalize(candidate);
|
|
500
|
+
if (!context)
|
|
501
|
+
return normalized;
|
|
502
|
+
const parts = [normalized];
|
|
503
|
+
if (context.columnName)
|
|
504
|
+
parts.push(context.columnName);
|
|
505
|
+
if (context.fileFormat)
|
|
506
|
+
parts.push(context.fileFormat);
|
|
507
|
+
return parts.join(':');
|
|
508
|
+
}
|
|
509
|
+
/**
|
|
510
|
+
* Get from LRU cache
|
|
511
|
+
*/
|
|
512
|
+
getFromCache(key) {
|
|
513
|
+
if (!this.cache.has(key))
|
|
514
|
+
return undefined;
|
|
515
|
+
const cached = this.cache.get(key);
|
|
516
|
+
// Move to end (most recently used)
|
|
517
|
+
this.cacheOrder = this.cacheOrder.filter(k => k !== key);
|
|
518
|
+
this.cacheOrder.push(key);
|
|
519
|
+
return cached ?? null;
|
|
520
|
+
}
|
|
521
|
+
/**
|
|
522
|
+
* Add to LRU cache with eviction
|
|
523
|
+
*/
|
|
524
|
+
addToCache(key, value) {
|
|
525
|
+
this.cache.set(key, value);
|
|
526
|
+
this.cacheOrder.push(key);
|
|
527
|
+
// Evict oldest if over limit
|
|
528
|
+
if (this.cache.size > this.options.cacheSize) {
|
|
529
|
+
const oldest = this.cacheOrder.shift();
|
|
530
|
+
if (oldest) {
|
|
531
|
+
this.cache.delete(oldest);
|
|
532
|
+
}
|
|
533
|
+
}
|
|
534
|
+
}
|
|
535
|
+
}
|
|
536
|
+
/**
 * Factory helper: builds an ExactMatchSignal for the given brain.
 *
 * @param brain Passed through to the ExactMatchSignal constructor
 * @param options Passed through to the ExactMatchSignal constructor
 * @returns The newly constructed signal
 */
export function createExactMatchSignal(brain, options) {
    const signal = new ExactMatchSignal(brain, options);
    return signal;
}
|
|
542
|
+
//# sourceMappingURL=ExactMatchSignal.js.map
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PatternSignal - Pattern-based entity type classification
|
|
3
|
+
*
|
|
4
|
+
* WEIGHT: 20% (moderate reliability, fast)
|
|
5
|
+
*
|
|
6
|
+
* Uses:
|
|
7
|
+
* 1. 220+ pre-compiled regex patterns from PatternLibrary
|
|
8
|
+
* 2. Common naming conventions (camelCase → code/attribute, UPPER_CASE → constant, etc.)
|
|
9
|
+
* 3. Text structural patterns (email → contact, URL → reference, etc.)
|
|
10
|
+
*
|
|
11
|
+
* Merges: KeywordSignal + PatternSignal from old architecture
|
|
12
|
+
* Speed: Very fast (~5ms) - pre-compiled patterns
|
|
13
|
+
*
|
|
14
|
+
* PRODUCTION-READY: No TODOs, no mocks, real implementation
|
|
15
|
+
*/
|
|
16
|
+
import type { Brainy } from '../../brainy.js';
|
|
17
|
+
import { NounType } from '../../types/graphTypes.js';
|
|
18
|
+
/**
 * Result of a pattern-based classification.
 */
export interface TypeSignal {
    /** Which kind of pattern produced this result. */
    source:
        | 'pattern-regex'
        | 'pattern-naming'
        | 'pattern-structural';
    /** The noun type assigned to the candidate. */
    type: NounType;
    /** Confidence attached to the classification. */
    confidence: number;
    /** Human-readable explanation of the match. */
    evidence: string;
    /** Optional details about the pattern(s) that matched. */
    metadata?: {
        patternName?: string;
        matchedPattern?: string;
        matchCount?: number;
    };
}
|
|
32
|
+
/**
 * Optional settings accepted by the PatternSignal constructor.
 * Every field may be omitted.
 */
export interface PatternSignalOptions {
    /** Confidence threshold (presumably the minimum a result needs to be returned). */
    minConfidence?: number;
    /** Size setting for the signal's LRU cache. */
    cacheSize?: number;
    /** Toggle for naming-convention matching. */
    enableNamingPatterns?: boolean;
    /** Toggle for structural-pattern matching. */
    enableStructuralPatterns?: boolean;
}
|
|
41
|
+
/**
 * PatternSignal - fast, pattern-driven entity type classification.
 *
 * Capabilities, as declared by this module:
 * - 220+ pre-compiled regex patterns
 * - naming convention detection (camelCase, snake_case, ...)
 * - structural pattern detection (emails, URLs, dates, ...)
 * - an LRU cache for hot paths
 * - moderate confidence (0.65-0.85): patterns are reliable but not perfect
 */
export declare class PatternSignal {
    private brain;
    private options;
    private patterns;
    private cache;
    private cacheOrder;
    private stats;
    constructor(brain: Brainy, options?: PatternSignalOptions);
    /**
     * Sets up the pre-compiled patterns, grouped by type:
     * Person (names, titles, roles), Location (places, addresses, coordinates),
     * Organization (companies, institutions), Technology (programming languages,
     * frameworks, tools), Event (meetings, conferences, releases), Concept (ideas,
     * theories, methodologies), Object (physical items, artifacts) and
     * Document (files, papers, reports).
     */
    private initializePatterns;
    /**
     * Registers patterns for one specific type.
     */
    private addPatterns;
    /**
     * Main entry point: classifies the candidate by checking regex patterns,
     * naming conventions and structural patterns.
     *
     * @param candidate Entity text to classify
     * @param context Optional context for better matching
     * @returns TypeSignal with the classification result, or null
     */
    classify(
        candidate: string,
        context?: {
            definition?: string;
            metadata?: Record<string, any>;
        }
    ): Promise<TypeSignal | null>;
    /**
     * Regex matching against the candidate and, when present, the definition text.
     */
    private matchRegexPatterns;
    /**
     * Naming-convention matching. Examples:
     * - camelCase → likely code/attribute
     * - PascalCase → likely class/type/concept
     * - snake_case → likely variable/attribute
     * - UPPER_CASE → likely constant/attribute
     * - kebab-case → likely file/identifier
     */
    private matchNamingConventions;
    /**
     * Structural matching. Detects:
     * - email addresses → Person/contact
     * - URLs → Object/reference
     * - phone numbers → contact information
     * - dates → temporal events
     * - UUIDs → identifiers
     * - semantic versions → releases/projects
     */
    private matchStructuralPatterns;
    /**
     * Reports counters, rates and sizes describing how the signal has performed.
     */
    getStats(): {
        calls: number;
        cacheHits: number;
        regexMatches: number;
        namingMatches: number;
        structuralMatches: number;
        cacheSize: number;
        patternCount: number;
        cacheHitRate: number;
        regexMatchRate: number;
        namingMatchRate: number;
        structuralMatchRate: number;
    };
    /**
     * Resets the statistics (useful for testing).
     */
    resetStats(): void;
    /**
     * Empties the cache.
     */
    clearCache(): void;
    /**
     * Derives the cache key from candidate and context.
     */
    private getCacheKey;
    /**
     * LRU cache read.
     */
    private getFromCache;
    /**
     * LRU cache write, with eviction.
     */
    private addToCache;
}
|
|
156
|
+
/**
 * Factory helper returning a new PatternSignal.
 *
 * @param brain Brainy instance handed to the PatternSignal constructor
 * @param options Optional PatternSignal settings
 */
export declare function createPatternSignal(
    brain: Brainy,
    options?: PatternSignalOptions
): PatternSignal;
|