@soulcraft/brainy 4.1.3 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +100 -7
- package/dist/brainy.d.ts +74 -16
- package/dist/brainy.js +74 -16
- package/dist/import/FormatDetector.d.ts +6 -1
- package/dist/import/FormatDetector.js +40 -1
- package/dist/import/ImportCoordinator.d.ts +155 -5
- package/dist/import/ImportCoordinator.js +346 -6
- package/dist/import/InstancePool.d.ts +136 -0
- package/dist/import/InstancePool.js +231 -0
- package/dist/importers/SmartCSVImporter.d.ts +2 -1
- package/dist/importers/SmartCSVImporter.js +11 -22
- package/dist/importers/SmartDOCXImporter.d.ts +125 -0
- package/dist/importers/SmartDOCXImporter.js +227 -0
- package/dist/importers/SmartExcelImporter.d.ts +12 -1
- package/dist/importers/SmartExcelImporter.js +40 -25
- package/dist/importers/SmartJSONImporter.d.ts +1 -0
- package/dist/importers/SmartJSONImporter.js +25 -6
- package/dist/importers/SmartMarkdownImporter.d.ts +2 -1
- package/dist/importers/SmartMarkdownImporter.js +11 -16
- package/dist/importers/SmartPDFImporter.d.ts +2 -1
- package/dist/importers/SmartPDFImporter.js +11 -22
- package/dist/importers/SmartYAMLImporter.d.ts +121 -0
- package/dist/importers/SmartYAMLImporter.js +275 -0
- package/dist/importers/VFSStructureGenerator.js +12 -0
- package/dist/neural/SmartExtractor.d.ts +279 -0
- package/dist/neural/SmartExtractor.js +592 -0
- package/dist/neural/SmartRelationshipExtractor.d.ts +217 -0
- package/dist/neural/SmartRelationshipExtractor.js +396 -0
- package/dist/neural/embeddedTypeEmbeddings.d.ts +1 -1
- package/dist/neural/embeddedTypeEmbeddings.js +2 -2
- package/dist/neural/entityExtractor.d.ts +3 -0
- package/dist/neural/entityExtractor.js +34 -36
- package/dist/neural/presets.d.ts +189 -0
- package/dist/neural/presets.js +365 -0
- package/dist/neural/signals/ContextSignal.d.ts +166 -0
- package/dist/neural/signals/ContextSignal.js +646 -0
- package/dist/neural/signals/EmbeddingSignal.d.ts +175 -0
- package/dist/neural/signals/EmbeddingSignal.js +435 -0
- package/dist/neural/signals/ExactMatchSignal.d.ts +220 -0
- package/dist/neural/signals/ExactMatchSignal.js +542 -0
- package/dist/neural/signals/PatternSignal.d.ts +159 -0
- package/dist/neural/signals/PatternSignal.js +478 -0
- package/dist/neural/signals/VerbContextSignal.d.ts +102 -0
- package/dist/neural/signals/VerbContextSignal.js +390 -0
- package/dist/neural/signals/VerbEmbeddingSignal.d.ts +131 -0
- package/dist/neural/signals/VerbEmbeddingSignal.js +304 -0
- package/dist/neural/signals/VerbExactMatchSignal.d.ts +115 -0
- package/dist/neural/signals/VerbExactMatchSignal.js +335 -0
- package/dist/neural/signals/VerbPatternSignal.d.ts +104 -0
- package/dist/neural/signals/VerbPatternSignal.js +457 -0
- package/dist/types/graphTypes.d.ts +2 -0
- package/package.json +4 -1

package/dist/import/InstancePool.js
@@ -0,0 +1,231 @@
+/**
+ * InstancePool - Shared instance management for memory efficiency
+ *
+ * Production-grade instance pooling to prevent memory leaks during imports.
+ * Critical for scaling to billions of entities.
+ *
+ * Problem: Creating new NLP/Extractor instances in loops → memory leak
+ * Solution: Reuse shared instances across entire import session
+ *
+ * Memory savings:
+ * - Without pooling: 100K rows × 50MB per instance = 5TB RAM (OOM!)
+ * - With pooling: 50MB total (shared across all rows)
+ */
+import { NaturalLanguageProcessor } from '../neural/naturalLanguageProcessor.js';
+import { NeuralEntityExtractor } from '../neural/entityExtractor.js';
+/**
+ * InstancePool - Manages shared instances for memory efficiency
+ *
+ * Lifecycle:
+ * 1. Create pool at import start
+ * 2. Reuse instances across all rows
+ * 3. Pool is garbage collected when import completes
+ *
+ * Thread safety: Not thread-safe (single import session per pool)
+ */
+export class InstancePool {
+    constructor(brain) {
+        // Shared instances (created lazily)
+        this.nlpInstance = null;
+        this.extractorInstance = null;
+        // Initialization state
+        this.nlpInitialized = false;
+        this.initializationPromise = null;
+        // Statistics
+        this.stats = {
+            nlpReuses: 0,
+            extractorReuses: 0,
+            creationTime: 0
+        };
+        this.brain = brain;
+    }
+    /**
+     * Get shared NaturalLanguageProcessor instance
+     *
+     * Lazy initialization - created on first access
+     * All subsequent calls return same instance
+     *
+     * @returns Shared NLP instance
+     */
+    async getNLP() {
+        if (!this.nlpInstance) {
+            const startTime = Date.now();
+            this.nlpInstance = new NaturalLanguageProcessor(this.brain);
+            this.stats.creationTime += Date.now() - startTime;
+        }
+        // Ensure initialized before returning
+        if (!this.nlpInitialized) {
+            await this.ensureNLPInitialized();
+        }
+        this.stats.nlpReuses++;
+        return this.nlpInstance;
+    }
+    /**
+     * Get shared NeuralEntityExtractor instance
+     *
+     * Lazy initialization - created on first access
+     * All subsequent calls return same instance
+     *
+     * @returns Shared extractor instance
+     */
+    getExtractor() {
+        if (!this.extractorInstance) {
+            const startTime = Date.now();
+            this.extractorInstance = new NeuralEntityExtractor(this.brain);
+            this.stats.creationTime += Date.now() - startTime;
+        }
+        this.stats.extractorReuses++;
+        return this.extractorInstance;
+    }
+    /**
+     * Get shared NLP instance (synchronous, may return uninitialized)
+     *
+     * Use when you need NLP synchronously and will handle initialization yourself.
+     * Prefer getNLP() for async code.
+     *
+     * @returns Shared NLP instance (possibly uninitialized)
+     */
+    getNLPSync() {
+        if (!this.nlpInstance) {
+            this.nlpInstance = new NaturalLanguageProcessor(this.brain);
+        }
+        this.stats.nlpReuses++;
+        return this.nlpInstance;
+    }
+    /**
+     * Initialize all instances upfront
+     *
+     * Call at start of import to avoid lazy initialization overhead
+     * during processing. Improves predictability and first-row performance.
+     *
+     * @returns Promise that resolves when all instances are ready
+     */
+    async init() {
+        // Prevent duplicate initialization
+        if (this.initializationPromise) {
+            return this.initializationPromise;
+        }
+        this.initializationPromise = this.initializeInternal();
+        return this.initializationPromise;
+    }
+    /**
+     * Internal initialization implementation
+     */
+    async initializeInternal() {
+        const startTime = Date.now();
+        // Create instances
+        if (!this.nlpInstance) {
+            this.nlpInstance = new NaturalLanguageProcessor(this.brain);
+        }
+        if (!this.extractorInstance) {
+            this.extractorInstance = new NeuralEntityExtractor(this.brain);
+        }
+        // Initialize NLP (loads pattern library)
+        await this.ensureNLPInitialized();
+        this.stats.creationTime = Date.now() - startTime;
+    }
+    /**
+     * Ensure NLP is initialized (loads 220 patterns)
+     *
+     * Handles concurrent initialization requests safely
+     */
+    async ensureNLPInitialized() {
+        if (this.nlpInitialized) {
+            return;
+        }
+        if (!this.nlpInstance) {
+            throw new Error('NLP instance not created yet');
+        }
+        await this.nlpInstance.init();
+        this.nlpInitialized = true;
+    }
+    /**
+     * Check if instances are initialized
+     *
+     * @returns True if NLP is initialized and ready to use
+     */
+    isInitialized() {
+        return this.nlpInitialized && this.nlpInstance !== null;
+    }
+    /**
+     * Get pool statistics
+     *
+     * Useful for performance monitoring and memory leak detection
+     *
+     * @returns Statistics about instance reuse
+     */
+    getStats() {
+        return {
+            ...this.stats,
+            nlpCreated: this.nlpInstance !== null,
+            extractorCreated: this.extractorInstance !== null,
+            initialized: this.isInitialized(),
+            // Memory savings estimate
+            memorySaved: this.calculateMemorySaved()
+        };
+    }
+    /**
+     * Calculate estimated memory saved by pooling
+     *
+     * Assumes ~50MB per NLP instance, ~10MB per extractor instance
+     *
+     * @returns Estimated memory saved in bytes
+     */
+    calculateMemorySaved() {
+        const nlpSize = 50 * 1024 * 1024; // 50MB per instance
+        const extractorSize = 10 * 1024 * 1024; // 10MB per instance
+        // Without pooling: size × reuses
+        // With pooling: size × 1
+        // Saved: size × (reuses - 1)
+        const nlpSaved = nlpSize * Math.max(0, this.stats.nlpReuses - 1);
+        const extractorSaved = extractorSize * Math.max(0, this.stats.extractorReuses - 1);
+        return nlpSaved + extractorSaved;
+    }
+    /**
+     * Reset statistics (useful for testing)
+     */
+    resetStats() {
+        this.stats = {
+            nlpReuses: 0,
+            extractorReuses: 0,
+            creationTime: 0
+        };
+    }
+    /**
+     * Get string representation (for debugging)
+     */
+    toString() {
+        const stats = this.getStats();
+        return `InstancePool(nlp=${stats.nlpCreated}, extractor=${stats.extractorCreated}, initialized=${stats.initialized}, nlpReuses=${stats.nlpReuses}, extractorReuses=${stats.extractorReuses})`;
+    }
+    /**
+     * Cleanup method (for explicit resource management)
+     *
+     * Note: Usually not needed - pool is garbage collected when import completes.
+     * Use only if you need explicit cleanup for some reason.
+     */
+    cleanup() {
+        // Clear references to allow garbage collection
+        this.nlpInstance = null;
+        this.extractorInstance = null;
+        this.nlpInitialized = false;
+        this.initializationPromise = null;
+    }
+}
+/**
+ * Create a new instance pool
+ *
+ * Convenience factory function
+ *
+ * @param brain Brainy instance
+ * @param autoInit Whether to initialize instances immediately
+ * @returns Instance pool
+ */
+export async function createInstancePool(brain, autoInit = true) {
+    const pool = new InstancePool(brain);
+    if (autoInit) {
+        await pool.init();
+    }
+    return pool;
+}
+//# sourceMappingURL=InstancePool.js.map
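For orientation, a minimal usage sketch of the new pool, based only on the API shown in the hunk above (createInstancePool, getNLP, getExtractor, getStats, cleanup). The import path, the `brain` setup, and the surrounding import loop are illustrative assumptions, not part of the package diff.

```ts
// Hypothetical usage sketch for the v4.2.0 InstancePool (assumptions noted above).
import { createInstancePool } from '@soulcraft/brainy/dist/import/InstancePool.js'; // assumed path

async function importRows(brain: any, rows: string[]): Promise<void> {
  // One pool per import session: heavy instances are created once and reused.
  const pool = await createInstancePool(brain, true);
  try {
    for (const row of rows) {
      const nlp = await pool.getNLP();        // same shared NLP instance every iteration
      const extractor = pool.getExtractor();  // same shared extractor, no per-row allocation
      // ... run extraction on `row` with nlp/extractor ...
    }
    console.log(pool.getStats()); // nlpReuses, extractorReuses, memorySaved, ...
  } finally {
    pool.cleanup(); // optional; the pool is otherwise garbage collected after the import
  }
}
```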
package/dist/importers/SmartCSVImporter.d.ts
@@ -101,6 +101,7 @@ export declare class SmartCSVImporter {
     private brain;
     private extractor;
     private nlp;
+    private relationshipExtractor;
     private csvHandler;
     constructor(brain: Brainy);
     /**
@@ -124,7 +125,7 @@ export declare class SmartCSVImporter {
      */
     private mapTypeString;
     /**
-     * Infer relationship type from context
+     * Infer relationship type from context using SmartRelationshipExtractor
      */
     private inferRelationship;
     /**
package/dist/importers/SmartCSVImporter.js
@@ -12,6 +12,7 @@
  */
 import { NeuralEntityExtractor } from '../neural/entityExtractor.js';
 import { NaturalLanguageProcessor } from '../neural/naturalLanguageProcessor.js';
+import { SmartRelationshipExtractor } from '../neural/SmartRelationshipExtractor.js';
 import { NounType, VerbType } from '../types/graphTypes.js';
 import { CSVHandler } from '../augmentations/intelligentImport/handlers/csvHandler.js';
 /**
@@ -22,6 +23,7 @@ export class SmartCSVImporter {
         this.brain = brain;
         this.extractor = new NeuralEntityExtractor(brain);
         this.nlp = new NaturalLanguageProcessor(brain);
+        this.relationshipExtractor = new SmartRelationshipExtractor(brain);
         this.csvHandler = new CSVHandler();
     }
     /**
@@ -266,29 +268,16 @@ export class SmartCSVImporter {
         return mapping[normalized] || NounType.Thing;
     }
     /**
-     * Infer relationship type from context
+     * Infer relationship type from context using SmartRelationshipExtractor
      */
-    async inferRelationship(fromTerm, toTerm, context) {
-
-
-
-
-
-
-
-            [new RegExp(`capital.*${toTerm}`, 'i'), VerbType.Contains],
-            [new RegExp(`created by.*${toTerm}`, 'i'), VerbType.CreatedBy],
-            [new RegExp(`authored by.*${toTerm}`, 'i'), VerbType.CreatedBy],
-            [new RegExp(`part of.*${toTerm}`, 'i'), VerbType.PartOf],
-            [new RegExp(`related to.*${toTerm}`, 'i'), VerbType.RelatedTo]
-        ];
-        for (const [pattern, verbType] of patterns) {
-            if (pattern.test(lowerContext)) {
-                return verbType;
-            }
-        }
-        // Default to RelatedTo
-        return VerbType.RelatedTo;
+    async inferRelationship(fromTerm, toTerm, context, fromType, toType) {
+        // Use SmartRelationshipExtractor for robust relationship classification
+        const result = await this.relationshipExtractor.infer(fromTerm, toTerm, context, {
+            subjectType: fromType,
+            objectType: toType
+        });
+        // Return inferred type or fallback to RelatedTo
+        return result?.type || VerbType.RelatedTo;
     }
     /**
      * Generate consistent entity ID from name
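The rewritten inferRelationship above now delegates to the new SmartRelationshipExtractor instead of hard-coded regex patterns. A hedged sketch of that call shape, using only the argument and result fields visible in this diff; the import paths and the example strings are illustrative assumptions.

```ts
// Sketch of calling SmartRelationshipExtractor directly (paths and inputs assumed).
import { SmartRelationshipExtractor } from '@soulcraft/brainy/dist/neural/SmartRelationshipExtractor.js'; // assumed path
import { NounType, VerbType } from '@soulcraft/brainy/dist/types/graphTypes.js'; // assumed path

async function classify(brain: any): Promise<VerbType> {
  const rex = new SmartRelationshipExtractor(brain);
  // Same call shape as the rewritten inferRelationship above.
  const result = await rex.infer('Paris', 'France', 'Paris is the capital of France', {
    subjectType: NounType.Thing, // placeholder type, for illustration only
    objectType: NounType.Thing
  });
  // Fall back to RelatedTo, exactly as the importers do.
  return result?.type || VerbType.RelatedTo;
}
```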
package/dist/importers/SmartDOCXImporter.d.ts
@@ -0,0 +1,125 @@
+/**
+ * Smart DOCX Importer
+ *
+ * Extracts entities and relationships from Word documents using:
+ * - Mammoth parser for DOCX → HTML/text conversion
+ * - Heading extraction for document structure
+ * - Table extraction for structured data
+ * - NeuralEntityExtractor for entity extraction from paragraphs
+ * - NaturalLanguageProcessor for relationship inference
+ * - Hierarchical relationship creation based on heading hierarchy
+ *
+ * v4.2.0: New format handler
+ * NO MOCKS - Production-ready implementation
+ */
+import { Brainy } from '../brainy.js';
+import { NounType, VerbType } from '../types/graphTypes.js';
+export interface SmartDOCXOptions {
+    /** Enable neural entity extraction from paragraphs */
+    enableNeuralExtraction?: boolean;
+    /** Enable hierarchical relationship creation based on headings */
+    enableHierarchicalRelationships?: boolean;
+    /** Enable concept extraction for tagging */
+    enableConceptExtraction?: boolean;
+    /** Confidence threshold for entities (0-1) */
+    confidenceThreshold?: number;
+    /** Minimum paragraph length to process */
+    minParagraphLength?: number;
+    /** Progress callback */
+    onProgress?: (stats: {
+        processed: number;
+        entities: number;
+        relationships: number;
+    }) => void;
+}
+export interface ExtractedDOCXEntity {
+    /** Entity ID */
+    id: string;
+    /** Entity name */
+    name: string;
+    /** Entity type */
+    type: NounType;
+    /** Entity description/context */
+    description: string;
+    /** Confidence score */
+    confidence: number;
+    /** Weight/importance score */
+    weight?: number;
+    /** Section/heading context */
+    section: string | null;
+    /** Paragraph index in document */
+    paragraphIndex: number;
+    /** Metadata */
+    metadata: Record<string, any>;
+}
+export interface ExtractedDOCXRelationship {
+    from: string;
+    to: string;
+    type: VerbType;
+    confidence: number;
+    weight?: number;
+    evidence: string;
+}
+export interface SmartDOCXResult {
+    /** Total paragraphs processed */
+    paragraphsProcessed: number;
+    /** Entities extracted */
+    entitiesExtracted: number;
+    /** Relationships inferred */
+    relationshipsInferred: number;
+    /** All extracted entities */
+    entities: ExtractedDOCXEntity[];
+    /** All relationships */
+    relationships: ExtractedDOCXRelationship[];
+    /** Entity ID mapping (index -> ID) */
+    entityMap: Map<string, string>;
+    /** Processing time in ms */
+    processingTime: number;
+    /** Document structure */
+    structure: {
+        headings: Array<{
+            level: number;
+            text: string;
+            index: number;
+        }>;
+        paragraphCount: number;
+        tableCount: number;
+    };
+    /** Extraction statistics */
+    stats: {
+        byType: Record<string, number>;
+        bySection: Record<string, number>;
+        byConfidence: {
+            high: number;
+            medium: number;
+            low: number;
+        };
+    };
+}
+/**
+ * SmartDOCXImporter - Extracts structured knowledge from Word documents
+ */
+export declare class SmartDOCXImporter {
+    private brain;
+    private extractor;
+    private nlp;
+    private relationshipExtractor;
+    private mammothLoaded;
+    constructor(brain: Brainy);
+    /**
+     * Initialize the importer
+     */
+    init(): Promise<void>;
+    /**
+     * Extract entities and relationships from DOCX buffer
+     */
+    extract(buffer: Buffer, options?: SmartDOCXOptions): Promise<SmartDOCXResult>;
+    /**
+     * Extract entities and relationships from parsed DOCX content
+     */
+    private extractFromContent;
+    /**
+     * Parse document structure from HTML
+     */
+    private parseStructure;
+}
package/dist/importers/SmartDOCXImporter.js
@@ -0,0 +1,227 @@
+/**
+ * Smart DOCX Importer
+ *
+ * Extracts entities and relationships from Word documents using:
+ * - Mammoth parser for DOCX → HTML/text conversion
+ * - Heading extraction for document structure
+ * - Table extraction for structured data
+ * - NeuralEntityExtractor for entity extraction from paragraphs
+ * - NaturalLanguageProcessor for relationship inference
+ * - Hierarchical relationship creation based on heading hierarchy
+ *
+ * v4.2.0: New format handler
+ * NO MOCKS - Production-ready implementation
+ */
+import { NeuralEntityExtractor } from '../neural/entityExtractor.js';
+import { NaturalLanguageProcessor } from '../neural/naturalLanguageProcessor.js';
+import { SmartRelationshipExtractor } from '../neural/SmartRelationshipExtractor.js';
+import { VerbType } from '../types/graphTypes.js';
+// Dynamic import for mammoth (ESM compatibility)
+let mammoth;
+/**
+ * SmartDOCXImporter - Extracts structured knowledge from Word documents
+ */
+export class SmartDOCXImporter {
+    constructor(brain) {
+        this.mammothLoaded = false;
+        this.brain = brain;
+        this.extractor = new NeuralEntityExtractor(brain);
+        this.nlp = new NaturalLanguageProcessor(brain);
+        this.relationshipExtractor = new SmartRelationshipExtractor(brain);
+    }
+    /**
+     * Initialize the importer
+     */
+    async init() {
+        await this.nlp.init();
+        // Lazy load mammoth
+        if (!this.mammothLoaded) {
+            try {
+                mammoth = await import('mammoth');
+                this.mammothLoaded = true;
+            }
+            catch (error) {
+                throw new Error(`Failed to load mammoth parser: ${error.message}`);
+            }
+        }
+    }
+    /**
+     * Extract entities and relationships from DOCX buffer
+     */
+    async extract(buffer, options = {}) {
+        const startTime = Date.now();
+        // Ensure mammoth is loaded
+        if (!this.mammothLoaded) {
+            await this.init();
+        }
+        // Extract raw text for entity extraction
+        const textResult = await mammoth.extractRawText({ buffer });
+        // Extract HTML for structure analysis (headings, tables)
+        const htmlResult = await mammoth.convertToHtml({ buffer });
+        // Process the document
+        const result = await this.extractFromContent(textResult.value, htmlResult.value, options);
+        result.processingTime = Date.now() - startTime;
+        return result;
+    }
+    /**
+     * Extract entities and relationships from parsed DOCX content
+     */
+    async extractFromContent(rawText, html, options) {
+        const opts = {
+            enableNeuralExtraction: options.enableNeuralExtraction !== false,
+            enableHierarchicalRelationships: options.enableHierarchicalRelationships !== false,
+            enableConceptExtraction: options.enableConceptExtraction !== false,
+            confidenceThreshold: options.confidenceThreshold || 0.6,
+            minParagraphLength: options.minParagraphLength || 20
+        };
+        const entities = [];
+        const relationships = [];
+        const entityMap = new Map();
+        const stats = {
+            byType: {},
+            bySection: {},
+            byConfidence: { high: 0, medium: 0, low: 0 }
+        };
+        // Parse document structure from HTML
+        const structure = this.parseStructure(html);
+        // Split into paragraphs
+        const paragraphs = rawText.split(/\n\n+/).filter(p => p.trim().length >= opts.minParagraphLength);
+        let currentSection = 'Introduction';
+        let headingIndex = 0;
+        // Process each paragraph
+        for (let i = 0; i < paragraphs.length; i++) {
+            const paragraph = paragraphs[i].trim();
+            // Check if this paragraph is a heading
+            if (headingIndex < structure.headings.length) {
+                const heading = structure.headings[headingIndex];
+                if (paragraph.startsWith(heading.text) || heading.text.includes(paragraph.substring(0, 50))) {
+                    currentSection = heading.text;
+                    headingIndex++;
+                    stats.bySection[currentSection] = 0;
+                    continue;
+                }
+            }
+            // Extract entities from paragraph
+            if (opts.enableNeuralExtraction) {
+                const extractedEntities = await this.extractor.extract(paragraph, {
+                    confidence: opts.confidenceThreshold
+                });
+                for (const extracted of extractedEntities) {
+                    const entityId = `para${i}:${extracted.text}`;
+                    const entity = {
+                        id: entityId,
+                        name: extracted.text,
+                        type: extracted.type,
+                        description: paragraph,
+                        confidence: extracted.confidence,
+                        weight: extracted.weight || 1.0,
+                        section: currentSection,
+                        paragraphIndex: i,
+                        metadata: {
+                            position: extracted.position,
+                            headingContext: currentSection
+                        }
+                    };
+                    entities.push(entity);
+                    entityMap.set(entityId, entityId);
+                    // Update stats
+                    stats.byType[entity.type] = (stats.byType[entity.type] || 0) + 1;
+                    stats.bySection[currentSection] = (stats.bySection[currentSection] || 0) + 1;
+                    if (entity.confidence > 0.8)
+                        stats.byConfidence.high++;
+                    else if (entity.confidence >= 0.6)
+                        stats.byConfidence.medium++;
+                    else
+                        stats.byConfidence.low++;
+                }
+            }
+            // Report progress
+            if (options.onProgress && i % 10 === 0) {
+                options.onProgress({
+                    processed: i,
+                    entities: entities.length,
+                    relationships: relationships.length
+                });
+            }
+        }
+        // Create hierarchical relationships based on sections
+        if (opts.enableHierarchicalRelationships) {
+            const entitiesBySection = new Map();
+            for (const entity of entities) {
+                const section = entity.section || 'Unknown';
+                if (!entitiesBySection.has(section)) {
+                    entitiesBySection.set(section, []);
+                }
+                entitiesBySection.get(section).push(entity);
+            }
+            // Create relationships within sections
+            for (const [section, sectionEntities] of entitiesBySection) {
+                for (let i = 0; i < sectionEntities.length - 1; i++) {
+                    for (let j = i + 1; j < Math.min(i + 3, sectionEntities.length); j++) {
+                        const entityA = sectionEntities[i];
+                        const entityB = sectionEntities[j];
+                        // Infer relationship type using SmartRelationshipExtractor
+                        // Combine entity descriptions for better context
+                        const context = `In section "${section}": ${entityA.description.substring(0, 150)}... ${entityB.description.substring(0, 150)}...`;
+                        const inferredRelationship = await this.relationshipExtractor.infer(entityA.name, entityB.name, context, {
+                            subjectType: entityA.type,
+                            objectType: entityB.type
+                        });
+                        relationships.push({
+                            from: entityA.id,
+                            to: entityB.id,
+                            type: inferredRelationship?.type || VerbType.RelatedTo, // Fallback to RelatedTo for co-occurrence
+                            confidence: inferredRelationship?.confidence || 0.7,
+                            weight: inferredRelationship?.weight || 0.8,
+                            evidence: inferredRelationship?.evidence || `Both entities appear in section: ${section}`
+                        });
+                    }
+                }
+            }
+        }
+        // Final progress report
+        if (options.onProgress) {
+            options.onProgress({
+                processed: paragraphs.length,
+                entities: entities.length,
+                relationships: relationships.length
+            });
+        }
+        return {
+            paragraphsProcessed: paragraphs.length,
+            entitiesExtracted: entities.length,
+            relationshipsInferred: relationships.length,
+            entities,
+            relationships,
+            entityMap,
+            processingTime: 0, // Will be set by caller
+            structure,
+            stats
+        };
+    }
+    /**
+     * Parse document structure from HTML
+     */
+    parseStructure(html) {
+        const headings = [];
+        // Extract headings (h1-h6)
+        const headingRegex = /<h([1-6])>(.*?)<\/h\1>/gi;
+        let match;
+        let index = 0;
+        while ((match = headingRegex.exec(html)) !== null) {
+            const level = parseInt(match[1]);
+            const text = match[2].replace(/<[^>]+>/g, '').trim(); // Strip HTML tags
+            headings.push({ level, text, index: index++ });
+        }
+        // Count paragraphs
+        const paragraphCount = (html.match(/<p>/g) || []).length;
+        // Count tables
+        const tableCount = (html.match(/<table>/g) || []).length;
+        return {
+            headings,
+            paragraphCount,
+            tableCount
+        };
+    }
+}
+//# sourceMappingURL=SmartDOCXImporter.js.map
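A minimal sketch of driving the new DOCX importer end to end, assuming an already-constructed Brainy instance. It only touches the surface declared in SmartDOCXImporter.d.ts above; the import path is an assumption.

```ts
// Hypothetical end-to-end use of the v4.2.0 SmartDOCXImporter (paths assumed).
import { readFile } from 'node:fs/promises';
import { SmartDOCXImporter } from '@soulcraft/brainy/dist/importers/SmartDOCXImporter.js'; // assumed path

async function importDocx(brain: any, path: string) {
  const importer = new SmartDOCXImporter(brain);
  await importer.init(); // initializes NLP and lazy-loads mammoth
  const buffer = await readFile(path);
  const result = await importer.extract(buffer, {
    confidenceThreshold: 0.7,
    onProgress: ({ processed, entities, relationships }) =>
      console.log(`${processed} paragraphs → ${entities} entities, ${relationships} relationships`)
  });
  return result; // SmartDOCXResult: entities, relationships, structure, stats
}
```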
package/dist/importers/SmartExcelImporter.d.ts
@@ -88,6 +88,16 @@ export interface SmartExcelResult {
             low: number;
         };
     };
+    /** Sheet-specific data for VFS extraction (v4.2.0) */
+    sheets?: Array<{
+        name: string;
+        rows: ExtractedRow[];
+        stats: {
+            rowCount: number;
+            entityCount: number;
+            relationshipCount: number;
+        };
+    }>;
 }
 /**
  * SmartExcelImporter - Extracts structured knowledge from Excel files
@@ -96,6 +106,7 @@ export declare class SmartExcelImporter {
     private brain;
     private extractor;
     private nlp;
+    private relationshipExtractor;
     private excelHandler;
     constructor(brain: Brainy);
     /**
@@ -119,7 +130,7 @@ export declare class SmartExcelImporter {
      */
     private mapTypeString;
     /**
-     * Infer relationship type from context
+     * Infer relationship type from context using SmartRelationshipExtractor
      */
     private inferRelationship;
     /**