@soulcraft/brainy 4.1.4 → 4.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +35 -0
- package/dist/import/FormatDetector.d.ts +6 -1
- package/dist/import/FormatDetector.js +40 -1
- package/dist/import/ImportCoordinator.d.ts +102 -4
- package/dist/import/ImportCoordinator.js +248 -6
- package/dist/import/InstancePool.d.ts +136 -0
- package/dist/import/InstancePool.js +231 -0
- package/dist/importers/SmartCSVImporter.d.ts +2 -1
- package/dist/importers/SmartCSVImporter.js +11 -22
- package/dist/importers/SmartDOCXImporter.d.ts +125 -0
- package/dist/importers/SmartDOCXImporter.js +227 -0
- package/dist/importers/SmartExcelImporter.d.ts +12 -1
- package/dist/importers/SmartExcelImporter.js +40 -25
- package/dist/importers/SmartJSONImporter.d.ts +1 -0
- package/dist/importers/SmartJSONImporter.js +25 -6
- package/dist/importers/SmartMarkdownImporter.d.ts +2 -1
- package/dist/importers/SmartMarkdownImporter.js +11 -16
- package/dist/importers/SmartPDFImporter.d.ts +2 -1
- package/dist/importers/SmartPDFImporter.js +11 -22
- package/dist/importers/SmartYAMLImporter.d.ts +121 -0
- package/dist/importers/SmartYAMLImporter.js +275 -0
- package/dist/importers/VFSStructureGenerator.js +12 -0
- package/dist/neural/SmartExtractor.d.ts +279 -0
- package/dist/neural/SmartExtractor.js +592 -0
- package/dist/neural/SmartRelationshipExtractor.d.ts +217 -0
- package/dist/neural/SmartRelationshipExtractor.js +396 -0
- package/dist/neural/embeddedTypeEmbeddings.d.ts +1 -1
- package/dist/neural/embeddedTypeEmbeddings.js +2 -2
- package/dist/neural/entityExtractor.d.ts +3 -0
- package/dist/neural/entityExtractor.js +34 -36
- package/dist/neural/presets.d.ts +189 -0
- package/dist/neural/presets.js +365 -0
- package/dist/neural/signals/ContextSignal.d.ts +166 -0
- package/dist/neural/signals/ContextSignal.js +646 -0
- package/dist/neural/signals/EmbeddingSignal.d.ts +175 -0
- package/dist/neural/signals/EmbeddingSignal.js +435 -0
- package/dist/neural/signals/ExactMatchSignal.d.ts +220 -0
- package/dist/neural/signals/ExactMatchSignal.js +542 -0
- package/dist/neural/signals/PatternSignal.d.ts +159 -0
- package/dist/neural/signals/PatternSignal.js +478 -0
- package/dist/neural/signals/VerbContextSignal.d.ts +102 -0
- package/dist/neural/signals/VerbContextSignal.js +390 -0
- package/dist/neural/signals/VerbEmbeddingSignal.d.ts +131 -0
- package/dist/neural/signals/VerbEmbeddingSignal.js +304 -0
- package/dist/neural/signals/VerbExactMatchSignal.d.ts +115 -0
- package/dist/neural/signals/VerbExactMatchSignal.js +335 -0
- package/dist/neural/signals/VerbPatternSignal.d.ts +104 -0
- package/dist/neural/signals/VerbPatternSignal.js +457 -0
- package/dist/types/graphTypes.d.ts +2 -0
- package/dist/utils/metadataIndex.d.ts +22 -0
- package/dist/utils/metadataIndex.js +76 -0
- package/package.json +4 -1
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Smart YAML Importer
|
|
3
|
+
*
|
|
4
|
+
* Extracts entities and relationships from YAML files using:
|
|
5
|
+
* - YAML parsing to JSON-like structure
|
|
6
|
+
* - Recursive traversal of nested structures
|
|
7
|
+
* - NeuralEntityExtractor for entity extraction from text values
|
|
8
|
+
* - SmartRelationshipExtractor for relationship inference (NaturalLanguageProcessor is only initialized, in init())
|
|
9
|
+
* - Hierarchical relationship creation (parent-child, contains, etc.)
|
|
10
|
+
*
|
|
11
|
+
* v4.2.0: New format handler
|
|
12
|
+
* NO MOCKS - Production-ready implementation
|
|
13
|
+
*/
|
|
14
|
+
import { Brainy } from '../brainy.js';
|
|
15
|
+
import { NounType, VerbType } from '../types/graphTypes.js';
|
|
16
|
+
/**
 * Tuning knobs accepted by `SmartYAMLImporter.extract`.
 *
 * Every field is optional; the defaults listed below are the ones the
 * importer applies when a field is omitted.
 */
export interface SmartYAMLOptions {
    /** Run the neural entity extractor over string values. Enabled unless set to `false`. */
    enableNeuralExtraction?: boolean;
    /** Emit parent/child relationships that mirror the YAML nesting. Enabled unless set to `false`. */
    enableHierarchicalRelationships?: boolean;
    /** Extract concepts for tagging. Enabled unless set to `false`. */
    enableConceptExtraction?: boolean;
    /** Minimum confidence (0-1) handed to the entity extractor. Default: 0.6. */
    confidenceThreshold?: number;
    /** Deepest nesting level that is still visited (the document root is depth 0). Default: 10. */
    maxDepth?: number;
    /** Strings shorter than this are not sent to entity extraction. Default: 3. */
    minStringLength?: number;
    /** Keys whose string value names an entity. Default: `name`, `title`, `label`, `id`. */
    nameKeys?: Array<string>;
    /** Keys whose string value describes an entity. Default: `description`, `desc`, `summary`, `value`. */
    descriptionKeys?: Array<string>;
    /** Keys whose string value states an explicit entity type. Default: `type`, `kind`, `category`. */
    typeKeys?: Array<string>;
    /**
     * Progress hook. Invoked after every tenth visited node and once more
     * when traversal has finished, with running totals.
     */
    onProgress?: (stats: {
        processed: number;
        entities: number;
        relationships: number;
    }) => void;
}
|
|
42
|
+
/**
 * One entity produced by the YAML importer, either from an object node or
 * from a string value that the entity extractor recognised.
 */
export interface ExtractedYAMLEntity {
    /**
     * Identifier of the entity. Object nodes use their YAML path; entities
     * found inside a string value use `<path>:<matched text>`.
     */
    id: string;
    /** Display name: a name-key value, the last path segment, or the matched text. */
    name: string;
    /** Classified entity type. */
    type: NounType;
    /** Description text (for string-derived entities, the full source string). */
    description: string;
    /** Classifier confidence; object nodes fall back to 0.5 when nothing was classified. */
    confidence: number;
    /** Weight/importance score. */
    weight?: number;
    /** YAML path of the node this entity was extracted from (root is `$`). */
    path: string;
    /** Path of the structurally enclosing node, or `null` at the root. */
    parentPath: string | null;
    /** Extra details recorded by the importer (e.g. depth and keys, or match position). */
    metadata: Record<string, any>;
}
|
|
62
|
+
/**
 * A directed relationship between two extracted entities.
 */
export interface ExtractedYAMLRelationship {
    /** ID of the source (parent) entity. */
    from: string;
    /** ID of the target (child) entity. */
    to: string;
    /** Relationship type. */
    type: VerbType;
    /** Confidence attached to the relationship. */
    confidence: number;
    /** Weight/importance score. */
    weight?: number;
    /** Human-readable justification for the relationship. */
    evidence: string;
}
|
|
70
|
+
/**
 * Everything `SmartYAMLImporter.extract` reports back for one YAML document.
 */
export interface SmartYAMLResult {
    /** Number of YAML nodes visited during traversal. */
    nodesProcessed: number;
    /** Number of entries in `entities`. */
    entitiesExtracted: number;
    /** Number of entries in `relationships`. */
    relationshipsInferred: number;
    /** Every extracted entity, in traversal order. */
    entities: Array<ExtractedYAMLEntity>;
    /** Every relationship that was created. */
    relationships: Array<ExtractedYAMLRelationship>;
    /**
     * Lookup from key to entity ID. Object nodes are keyed by their YAML
     * path; string-derived entities are keyed by their own ID.
     */
    entityMap: Map<string, string>;
    /** Wall-clock duration of the extraction in milliseconds. */
    processingTime: number;
    /** Aggregate counters gathered while traversing. */
    stats: {
        /** Entity count per entity type. */
        byType: Record<string, number>;
        /** Visited-node count per nesting depth. */
        byDepth: Record<number, number>;
        /** Entity count per confidence bucket: high (> 0.8), medium (>= 0.6), low (below 0.6). */
        byConfidence: {
            high: number;
            medium: number;
            low: number;
        };
    };
}
|
|
96
|
+
/**
 * SmartYAMLImporter - turns a YAML document into entities and relationships.
 */
export declare class SmartYAMLImporter {
    private brain;
    private extractor;
    private nlp;
    private relationshipExtractor;
    /**
     * @param brain Brainy instance the importer works against.
     */
    constructor(brain: Brainy);
    /**
     * Prepare the importer for use. Await this before calling `extract`.
     */
    init(): Promise<void>;
    /**
     * Parse YAML text and extract entities and relationships from it.
     *
     * @param yamlContent YAML source; a Buffer is decoded as UTF-8.
     * @param options Extraction options; omitted fields use their defaults.
     * @returns Extracted entities, relationships and statistics.
     * @throws Error with a `Failed to parse YAML: ...` message when the input is not valid YAML.
     */
    extract(yamlContent: string | Buffer, options?: SmartYAMLOptions): Promise<SmartYAMLResult>;
    /**
     * Walks already-parsed YAML data and collects entities and relationships.
     */
    private extractFromData;
    /**
     * Builds the entity for a single YAML object node.
     */
    private extractEntityFromObject;
}
|
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Smart YAML Importer
|
|
3
|
+
*
|
|
4
|
+
* Extracts entities and relationships from YAML files using:
|
|
5
|
+
* - YAML parsing to JSON-like structure
|
|
6
|
+
* - Recursive traversal of nested structures
|
|
7
|
+
* - NeuralEntityExtractor for entity extraction from text values
|
|
8
|
+
* - SmartRelationshipExtractor for relationship inference (NaturalLanguageProcessor is only initialized, in init())
|
|
9
|
+
* - Hierarchical relationship creation (parent-child, contains, etc.)
|
|
10
|
+
*
|
|
11
|
+
* v4.2.0: New format handler
|
|
12
|
+
* NO MOCKS - Production-ready implementation
|
|
13
|
+
*/
|
|
14
|
+
import { NeuralEntityExtractor } from '../neural/entityExtractor.js';
|
|
15
|
+
import { NaturalLanguageProcessor } from '../neural/naturalLanguageProcessor.js';
|
|
16
|
+
import { SmartRelationshipExtractor } from '../neural/SmartRelationshipExtractor.js';
|
|
17
|
+
import { NounType, VerbType } from '../types/graphTypes.js';
|
|
18
|
+
import * as yaml from 'js-yaml';
|
|
19
|
+
/**
 * SmartYAMLImporter - Extracts structured knowledge from YAML files.
 *
 * The document is parsed with js-yaml and walked recursively. Every object
 * node becomes an entity, string values are optionally run through the
 * neural entity extractor, and each object entity is linked to the nearest
 * enclosing object entity with an inferred relationship.
 */
export class SmartYAMLImporter {
    /**
     * @param brain Brainy instance passed to the entity extractor, the
     *   natural language processor and the relationship extractor.
     */
    constructor(brain) {
        this.brain = brain;
        this.extractor = new NeuralEntityExtractor(brain);
        this.nlp = new NaturalLanguageProcessor(brain);
        this.relationshipExtractor = new SmartRelationshipExtractor(brain);
    }
    /**
     * Initialize the importer (initializes the natural language processor).
     */
    async init() {
        await this.nlp.init();
    }
    /**
     * Extract entities and relationships from a YAML string or buffer.
     *
     * @param yamlContent YAML source; a Buffer is decoded as UTF-8.
     * @param options Extraction options (see SmartYAMLOptions).
     * @returns The extraction result with `processingTime` filled in.
     * @throws Error prefixed with "Failed to parse YAML: " when parsing fails.
     */
    async extract(yamlContent, options = {}) {
        const startTime = Date.now();
        const yamlString = typeof yamlContent === 'string'
            ? yamlContent
            : yamlContent.toString('utf-8');
        let data;
        try {
            // NOTE(review): yaml.load only restricts itself to safe types on
            // js-yaml >= 4; on 3.x it can instantiate arbitrary JS types and
            // safeLoad is the safe entry point. Confirm the pinned version
            // before feeding untrusted documents through this importer.
            data = yaml.load(yamlString);
        }
        catch (error) {
            // A thrown value is not guaranteed to be an Error instance.
            const reason = error instanceof Error ? error.message : String(error);
            throw new Error(`Failed to parse YAML: ${reason}`);
        }
        // Process as JSON-like structure
        const result = await this.extractFromData(data, options);
        result.processingTime = Date.now() - startTime;
        return result;
    }
    /**
     * Extract entities and relationships from parsed YAML data.
     *
     * @param data Value returned by the YAML parser (any JSON-like value).
     * @param options Caller options; missing fields receive defaults.
     * @returns Result object whose `processingTime` is 0 (set by `extract`).
     */
    async extractFromData(data, options) {
        const opts = {
            enableNeuralExtraction: options.enableNeuralExtraction !== false,
            enableHierarchicalRelationships: options.enableHierarchicalRelationships !== false,
            enableConceptExtraction: options.enableConceptExtraction !== false,
            // `??` rather than `||`: an explicit 0 is a legitimate value for
            // all three numeric options and must not be replaced by the default.
            confidenceThreshold: options.confidenceThreshold ?? 0.6,
            maxDepth: options.maxDepth ?? 10,
            minStringLength: options.minStringLength ?? 3,
            nameKeys: options.nameKeys ?? ['name', 'title', 'label', 'id'],
            descriptionKeys: options.descriptionKeys ?? ['description', 'desc', 'summary', 'value'],
            typeKeys: options.typeKeys ?? ['type', 'kind', 'category'],
            onProgress: options.onProgress
        };
        const entities = [];
        const relationships = [];
        const entityMap = new Map();
        let nodesProcessed = 0;
        const stats = {
            byType: {},
            byDepth: {},
            byConfidence: { high: 0, medium: 0, low: 0 }
        };
        // Sends the running totals to the progress callback, if one was given.
        const reportProgress = () => {
            if (opts.onProgress) {
                opts.onProgress({
                    processed: nodesProcessed,
                    entities: entities.length,
                    relationships: relationships.length
                });
            }
        };
        // Stores an entity under `mapKey` and updates the type/confidence counters.
        const recordEntity = (entity, mapKey) => {
            entities.push(entity);
            entityMap.set(mapKey, entity.id);
            stats.byType[entity.type] = (stats.byType[entity.type] || 0) + 1;
            if (entity.confidence > 0.8)
                stats.byConfidence.high++;
            else if (entity.confidence >= 0.6)
                stats.byConfidence.medium++;
            else
                stats.byConfidence.low++;
        };
        // Links `entity` to the object entity registered at `ownerPath`.
        const linkToOwner = async (entity, ownerPath) => {
            const parentId = entityMap.get(ownerPath);
            if (!parentId)
                return;
            // Human-readable label from the last path segment, e.g.
            // "items[0]" -> "items item 0".
            const parentName = ownerPath.split('.').pop()?.replace(/\[(\d+)\]/g, ' item $1') || 'parent';
            const childName = entity.name;
            const context = `Hierarchical YAML structure: ${parentName} contains ${childName}. Parent path: ${ownerPath}, Child path: ${entity.path}`;
            const inferredRelationship = await this.relationshipExtractor.infer(parentName, childName, context, {
                objectType: entity.type // Pass child entity type as hint
            });
            relationships.push({
                from: parentId,
                to: entity.id,
                type: inferredRelationship?.type || VerbType.Contains, // Fallback to Contains for hierarchical relationships
                confidence: inferredRelationship?.confidence || 0.9,
                weight: inferredRelationship?.weight || 1.0,
                evidence: inferredRelationship?.evidence || 'Hierarchical parent-child relationship in YAML structure'
            });
        };
        // Recursive walk. `parentPath` is the structurally enclosing node;
        // `ownerPath` is the nearest enclosing node that produced an entity.
        // They differ for array items: arrays never become entities, so
        // looking the parent up by the array's path would never find one and
        // list items would stay unlinked.
        const traverse = async (obj, path = '$', depth = 0, parentPath = null, ownerPath = null) => {
            if (depth > opts.maxDepth)
                return;
            nodesProcessed++;
            stats.byDepth[depth] = (stats.byDepth[depth] || 0) + 1;
            if (nodesProcessed % 10 === 0) {
                reportProgress();
            }
            if (obj === null || obj === undefined) {
                return;
            }
            if (Array.isArray(obj)) {
                for (let i = 0; i < obj.length; i++) {
                    // Arrays are transparent for ownership: pass ownerPath through.
                    await traverse(obj[i], `${path}[${i}]`, depth + 1, path, ownerPath);
                }
                return;
            }
            if (typeof obj === 'object') {
                const entity = await this.extractEntityFromObject(obj, path, parentPath, depth, opts);
                if (entity) {
                    recordEntity(entity, path);
                    if (opts.enableHierarchicalRelationships && ownerPath) {
                        await linkToOwner(entity, ownerPath);
                    }
                }
                const childOwner = entity ? path : ownerPath;
                for (const [key, value] of Object.entries(obj)) {
                    await traverse(value, `${path}.${key}`, depth + 1, path, childOwner);
                }
                return;
            }
            // Primitive values: only sufficiently long strings are analysed.
            if (typeof obj === 'string' && obj.length >= opts.minStringLength) {
                if (opts.enableNeuralExtraction) {
                    const extractedEntities = await this.extractor.extract(obj, {
                        confidence: opts.confidenceThreshold
                    });
                    for (const extracted of extractedEntities) {
                        const entityId = `${path}:${extracted.text}`;
                        recordEntity({
                            id: entityId,
                            name: extracted.text,
                            type: extracted.type,
                            description: obj,
                            confidence: extracted.confidence,
                            weight: extracted.weight || 1.0,
                            path,
                            parentPath,
                            metadata: {
                                position: extracted.position,
                                extractedFrom: 'string-value'
                            }
                        }, entityId);
                    }
                }
            }
        };
        await traverse(data);
        // Final progress report
        reportProgress();
        return {
            nodesProcessed,
            entitiesExtracted: entities.length,
            relationshipsInferred: relationships.length,
            entities,
            relationships,
            entityMap,
            processingTime: 0, // Will be set by caller
            stats
        };
    }
    /**
     * Extract an entity from a YAML object node.
     *
     * @param obj The object node.
     * @param path YAML path of the node; also used as the entity ID.
     * @param parentPath Path of the structurally enclosing node, or null.
     * @param depth Nesting depth of the node (root is 0).
     * @param opts Resolved options (key lists and confidence threshold).
     * @returns The entity describing this node.
     */
    async extractEntityFromObject(obj, path, parentPath, depth, opts) {
        // First non-empty string stored under any of `keys`, else null.
        const firstStringValue = (keys) => {
            for (const key of keys) {
                const candidate = obj[key];
                if (candidate && typeof candidate === 'string') {
                    return candidate;
                }
            }
            return null;
        };
        // Name: explicit name key, otherwise the last path segment.
        let name = firstStringValue(opts.nameKeys);
        if (!name) {
            const segments = path.split('.');
            name = segments[segments.length - 1];
            if (name === '$')
                name = 'root';
        }
        // Description falls back to the name when no description key is present.
        const description = firstStringValue(opts.descriptionKeys) ?? name;
        // The explicit type is recorded in metadata only; it does not
        // override the classifier's type.
        const explicitType = firstStringValue(opts.typeKeys);
        // Classify the entity type with the neural entity extractor.
        const classification = await this.extractor.extract(description, {
            confidence: opts.confidenceThreshold
        });
        const best = classification.length > 0 ? classification[0] : undefined;
        return {
            id: path,
            name,
            type: best ? best.type : NounType.Thing,
            description,
            confidence: best ? best.confidence : 0.5,
            weight: best ? best.weight || 1.0 : 1.0,
            path,
            parentPath,
            metadata: {
                depth,
                explicitType,
                yamlKeys: Object.keys(obj)
            }
        };
    }
}
|
|
275
|
+
//# sourceMappingURL=SmartYAMLImporter.js.map
|
|
@@ -187,6 +187,14 @@ export class VFSStructureGenerator {
|
|
|
187
187
|
*/
|
|
188
188
|
groupEntities(importResult, options) {
|
|
189
189
|
const groups = new Map();
|
|
190
|
+
// Handle sheet-based grouping (v4.2.0)
|
|
191
|
+
if (options.groupBy === 'sheet' && importResult.sheets && importResult.sheets.length > 0) {
|
|
192
|
+
for (const sheet of importResult.sheets) {
|
|
193
|
+
groups.set(sheet.name, sheet.rows);
|
|
194
|
+
}
|
|
195
|
+
return groups;
|
|
196
|
+
}
|
|
197
|
+
// Handle other grouping strategies
|
|
190
198
|
for (const extracted of importResult.rows) {
|
|
191
199
|
let groupName;
|
|
192
200
|
switch (options.groupBy) {
|
|
@@ -201,6 +209,10 @@ export class VFSStructureGenerator {
|
|
|
201
209
|
options.customGrouping(extracted.entity) :
|
|
202
210
|
'entities';
|
|
203
211
|
break;
|
|
212
|
+
case 'sheet':
|
|
213
|
+
// Fallback if sheets data not available
|
|
214
|
+
groupName = 'entities';
|
|
215
|
+
break;
|
|
204
216
|
default:
|
|
205
217
|
groupName = 'entities';
|
|
206
218
|
}
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SmartExtractor - Unified entity type extraction using ensemble of neural signals
|
|
3
|
+
*
|
|
4
|
+
* PRODUCTION-READY: Single orchestration class for all entity type classification
|
|
5
|
+
*
|
|
6
|
+
* Design Philosophy:
|
|
7
|
+
* - Simplicity over complexity (KISS principle)
|
|
8
|
+
* - One class instead of multiple strategy layers
|
|
9
|
+
* - Clear execution path for debugging
|
|
10
|
+
* - Comprehensive format intelligence built-in
|
|
11
|
+
*
|
|
12
|
+
* Ensemble Architecture:
|
|
13
|
+
* - ExactMatchSignal (40%) - Explicit patterns and exact keywords
|
|
14
|
+
* - EmbeddingSignal (35%) - Neural similarity with type embeddings
|
|
15
|
+
* - PatternSignal (20%) - Regex patterns and naming conventions
|
|
16
|
+
* - ContextSignal (5%) - Relationship-based inference
|
|
17
|
+
*
|
|
18
|
+
* Format Intelligence:
|
|
19
|
+
* Supports 7 major formats with automatic hint extraction:
|
|
20
|
+
* - Excel (.xlsx): Column headers, sheet names, "Related Terms" detection
|
|
21
|
+
* - CSV (.csv): Header row patterns, naming conventions
|
|
22
|
+
* - PDF (.pdf): Form field names and labels
|
|
23
|
+
* - YAML (.yaml, .yml): Semantic key names
|
|
24
|
+
* - DOCX (.docx): Heading levels and structure
|
|
25
|
+
* - JSON (.json): Field name patterns
|
|
26
|
+
* - Markdown (.md): Heading hierarchy
|
|
27
|
+
*
|
|
28
|
+
* Performance:
|
|
29
|
+
* - Parallel signal execution (~15ms total)
|
|
30
|
+
* - LRU caching for hot entities
|
|
31
|
+
* - Confidence boosting when signals agree
|
|
32
|
+
* - Graceful degradation on errors
|
|
33
|
+
*/
|
|
34
|
+
import type { Brainy } from '../brainy.js';
|
|
35
|
+
import type { NounType } from '../types/graphTypes.js';
|
|
36
|
+
/**
 * Outcome of one classification, including where the verdict came from.
 */
export interface ExtractionResult {
    /** Entity type that was selected. */
    type: NounType;
    /** Confidence attached to the selected type. */
    confidence: number;
    /** Origin of the verdict: the combined ensemble or one individual signal. */
    source: 'ensemble' | 'exact-match' | 'pattern' | 'embedding' | 'context';
    /** Human-readable explanation of the verdict. */
    evidence: string;
    /** Optional trace data for debugging and observability. */
    metadata?: {
        /** Per-signal verdicts that fed into the result. */
        signalResults?: Array<{
            signal: string;
            type: NounType;
            confidence: number;
            weight: number;
        }>;
        /** Boost value recorded for the result; presumably applied when signals agree, confirm in the implementation. */
        agreementBoost?: number;
        /** Format hint strings recorded for the result. */
        formatHints?: string[];
        /** Format context recorded for the result. */
        formatContext?: FormatContext;
    };
}
|
|
56
|
+
/**
 * Describes where a candidate came from so that format-specific hints can
 * be derived for classification. All fields are optional.
 */
export interface FormatContext {
    /** Source document format. */
    format?: 'excel' | 'csv' | 'pdf' | 'yaml' | 'docx' | 'json' | 'markdown';
    /** Column header the value was found under. */
    columnHeader?: string;
    /** Field name associated with the value. */
    fieldName?: string;
    /** YAML key associated with the value. */
    yamlKey?: string;
    /** Heading level associated with the value. */
    headingLevel?: number;
    /** Name of the sheet the value was found in. */
    sheetName?: string;
    /** Free-form additional context. */
    metadata?: Record<string, any>;
}
|
|
68
|
+
/**
 * Construction options for SmartExtractor. All fields are optional; the
 * defaults live in the implementation and are not visible in this
 * declaration file.
 */
export interface SmartExtractorOptions {
    /** Minimum confidence setting. */
    minConfidence?: number;
    /** Toggle for format-specific hint extraction. */
    enableFormatHints?: boolean;
    /** Toggle for ensemble combination; when disabled the best single signal is selected. */
    enableEnsemble?: boolean;
    /** Size setting for the extractor's cache. */
    cacheSize?: number;
    /** Per-signal weight settings. */
    weights?: {
        exactMatch?: number;
        embedding?: number;
        pattern?: number;
        context?: number;
    };
}
|
|
83
|
+
/**
 * SmartExtractor - single entry point for entity type classification.
 *
 * Orchestrates the four signals (exact match, pattern, embedding, context),
 * applies format-specific hints and merges the signal verdicts with
 * ensemble weighting.
 *
 * Production features:
 * - Signals run in parallel
 * - Hints are extracted per source format
 * - Ensemble voting boosts confidence when signals agree
 * - Statistics are exposed for observability
 * - Hot paths are served from an LRU cache
 * - Errors degrade gracefully
 */
export declare class SmartExtractor {
    private brain;
    private options;
    private exactMatchSignal;
    private patternSignal;
    private embeddingSignal;
    private contextSignal;
    private cache;
    private cacheOrder;
    private stats;
    /**
     * @param brain Brainy instance the extractor works against.
     * @param options Optional extractor configuration.
     */
    constructor(brain: Brainy, options?: SmartExtractorOptions);
    /**
     * Classify a candidate using the ensemble of signals.
     *
     * This is the main entry point: it runs the signals and combines
     * their results.
     *
     * @param candidate Entity text to classify
     * @param context Classification context, including format hints
     * @returns The extraction result with type and confidence, or `null`
     */
    extract(candidate: string, context?: {
        definition?: string;
        formatContext?: FormatContext;
        allTerms?: string[];
        metadata?: any;
    }): Promise<ExtractionResult | null>;
    /**
     * Derive format-specific hint strings from the context; the hints
     * assist classification.
     */
    private extractFormatHints;
    /** Hints specific to Excel input. */
    private extractExcelHints;
    /** Hints specific to CSV input. */
    private extractCsvHints;
    /** Hints specific to PDF input. */
    private extractPdfHints;
    /** Hints specific to YAML input. */
    private extractYamlHints;
    /** Hints specific to DOCX input. */
    private extractDocxHints;
    /** Hints specific to JSON input. */
    private extractJsonHints;
    /** Hints specific to Markdown input. */
    private extractMarkdownHints;
    /**
     * Merge signal results by weighted ensemble voting, boosting
     * confidence when signals agree.
     */
    private combineEnsemble;
    /** Pick the best individual signal (used when the ensemble is disabled). */
    private selectBestSignal;
    /** Fold a result into the running statistics. */
    private updateStatistics;
    /** Build the cache key for a candidate and its context. */
    private getCacheKey;
    /** Look a key up in the LRU cache. */
    private getFromCache;
    /** Insert into the LRU cache, evicting when necessary. */
    private addToCache;
    /**
     * Snapshot of the extractor's statistics, including the per-signal
     * statistics under `signalStats`.
     */
    getStats(): {
        cacheSize: number;
        cacheHitRate: number;
        ensembleRate: number;
        formatHintRate: number;
        signalStats: {
            exactMatch: {
                indexSize: number;
                cacheSize: number;
                cacheHitRate: number;
                termMatchRate: number;
                metadataMatchRate: number;
                formatMatchRate: number;
                calls: number;
                cacheHits: number;
                termMatches: number;
                metadataMatches: number;
                formatMatches: number;
            };
            pattern: {
                cacheSize: number;
                patternCount: number;
                cacheHitRate: number;
                regexMatchRate: number;
                namingMatchRate: number;
                structuralMatchRate: number;
                calls: number;
                cacheHits: number;
                regexMatches: number;
                namingMatches: number;
                structuralMatches: number;
            };
            embedding: {
                cacheSize: number;
                historySize: number;
                cacheHitRate: number;
                typeMatchRate: number;
                graphMatchRate: number;
                historyMatchRate: number;
                calls: number;
                cacheHits: number;
                typeMatches: number;
                graphMatches: number;
                historyMatches: number;
                combinedBoosts: number;
            };
            context: {
                cacheSize: number;
                cacheHitRate: number;
                relationshipMatchRate: number;
                attributeMatchRate: number;
                calls: number;
                cacheHits: number;
                relationshipMatches: number;
                attributeMatches: number;
                combinedMatches: number;
            };
        };
        calls: number;
        cacheHits: number;
        exactMatchWins: number;
        patternWins: number;
        embeddingWins: number;
        contextWins: number;
        ensembleWins: number;
        agreementBoosts: number;
        formatHintsUsed: number;
        averageConfidence: number;
        averageSignalsUsed: number;
    };
    /** Reset every statistic. */
    resetStats(): void;
    /** Empty every cache. */
    clearCache(): void;
    /**
     * Record an entity in the historical data used by the embedding
     * signal's temporal boosting.
     */
    addToHistory(text: string, type: NounType, vector: number[]): void;
    /** Discard the historical data. */
    clearHistory(): void;
}
|
|
274
|
+
/**
 * Convenience factory that returns a new SmartExtractor.
 *
 * @param brain Brainy instance the extractor works against.
 * @param options Optional extractor configuration.
 * @returns A SmartExtractor instance.
 */
export declare function createSmartExtractor(brain: Brainy, options?: SmartExtractorOptions): SmartExtractor;
|