@soulcraft/brainy 3.27.1 → 3.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +5 -0
- package/dist/brainy.d.ts +50 -0
- package/dist/brainy.js +36 -0
- package/dist/import/EntityDeduplicator.d.ts +84 -0
- package/dist/import/EntityDeduplicator.js +255 -0
- package/dist/import/FormatDetector.d.ts +65 -0
- package/dist/import/FormatDetector.js +263 -0
- package/dist/import/ImportCoordinator.d.ts +160 -0
- package/dist/import/ImportCoordinator.js +498 -0
- package/dist/import/ImportHistory.d.ts +92 -0
- package/dist/import/ImportHistory.js +183 -0
- package/dist/import/index.d.ts +16 -0
- package/dist/import/index.js +14 -0
- package/dist/importers/SmartCSVImporter.d.ts +136 -0
- package/dist/importers/SmartCSVImporter.js +308 -0
- package/dist/importers/SmartExcelImporter.d.ts +131 -0
- package/dist/importers/SmartExcelImporter.js +302 -0
- package/dist/importers/SmartImportOrchestrator.d.ts +125 -0
- package/dist/importers/SmartImportOrchestrator.js +531 -0
- package/dist/importers/SmartJSONImporter.d.ts +135 -0
- package/dist/importers/SmartJSONImporter.js +325 -0
- package/dist/importers/SmartMarkdownImporter.d.ts +159 -0
- package/dist/importers/SmartMarkdownImporter.js +369 -0
- package/dist/importers/SmartPDFImporter.d.ts +154 -0
- package/dist/importers/SmartPDFImporter.js +337 -0
- package/dist/importers/VFSStructureGenerator.d.ts +82 -0
- package/dist/importers/VFSStructureGenerator.js +260 -0
- package/dist/importers/index.d.ts +28 -0
- package/dist/importers/index.js +29 -0
- package/package.json +1 -1
|
@@ -0,0 +1,337 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Smart PDF Importer
|
|
3
|
+
*
|
|
4
|
+
* Extracts entities and relationships from PDF files using:
|
|
5
|
+
* - NeuralEntityExtractor for entity extraction
|
|
6
|
+
* - NaturalLanguageProcessor for relationship inference
|
|
7
|
+
* - brain.extractConcepts() for tagging
|
|
8
|
+
* - Section-based organization (by page or detected structure)
|
|
9
|
+
*
|
|
10
|
+
* NO MOCKS - Production-ready implementation
|
|
11
|
+
*/
|
|
12
|
+
import { NeuralEntityExtractor } from '../neural/entityExtractor.js';
|
|
13
|
+
import { NaturalLanguageProcessor } from '../neural/naturalLanguageProcessor.js';
|
|
14
|
+
import { VerbType } from '../types/graphTypes.js';
|
|
15
|
+
import { PDFHandler } from '../augmentations/intelligentImport/handlers/pdfHandler.js';
|
|
16
|
+
/**
 * Escape regular-expression metacharacters so `value` matches literally when it
 * is interpolated into a dynamically built pattern.
 *
 * @param {*} value - Text to embed in a RegExp source (coerced with String()).
 * @returns {string} The escaped text.
 */
function escapeRegExpLiteral(value) {
    return String(value).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}
/**
 * Build the text of one section from its items and update the per-source counters.
 *
 * Paragraph items contribute `item.text` when it is non-empty and at least
 * `options.minParagraphLength` characters long (default 50; an explicit 0 is honoured).
 * Table-row items contribute the space-joined values of every key that does not
 * start with `_`, but only when `options.extractFromTables` is truthy.
 *
 * @param {Array<object>} items - Section items carrying a `_type` discriminator.
 * @param {object} options - Extraction options.
 * @param {object} stats - Running statistics; `stats.bySource` is mutated.
 * @returns {string} The section text, pieces separated by blank lines.
 */
function collectSectionText(items, options, stats) {
    const minLength = options.minParagraphLength ?? 50;
    const pieces = [];
    for (const item of items) {
        if (item._type === 'paragraph') {
            const text = item.text || '';
            if (text.length > 0 && text.length >= minLength) {
                pieces.push(text);
                stats.bySource.paragraphs++;
            }
        }
        else if (item._type === 'table_row' && options.extractFromTables) {
            const values = Object.entries(item)
                .filter(([key, value]) => !key.startsWith('_') && value != null) // skip internal keys and empty cells
                .map(([, value]) => String(value))
                .filter(Boolean);
            if (values.length > 0) {
                pieces.push(values.join(' '));
                stats.bySource.tables++;
            }
        }
    }
    return pieces.join('\n\n');
}
/**
 * SmartPDFImporter - Extracts structured knowledge from PDF files
 *
 * Parses a PDF through PDFHandler, groups the parsed items into sections, runs
 * NeuralEntityExtractor over each section's text, optionally asks the brain for
 * concepts, and links entities that appear close together using keyword patterns.
 */
export class SmartPDFImporter {
    /**
     * @param {object} brain - Brainy instance; passed to the extractor and NLP helper
     *   and used directly for `extractConcepts()`.
     */
    constructor(brain) {
        this.brain = brain;
        this.extractor = new NeuralEntityExtractor(brain);
        this.nlp = new NaturalLanguageProcessor(brain);
        this.pdfHandler = new PDFHandler();
    }
    /**
     * Initialize the importer (initialises the natural-language processor).
     *
     * @returns {Promise<void>}
     */
    async init() {
        await this.nlp.init();
    }
    /**
     * Extract entities and relationships from a PDF file.
     *
     * @param {*} buffer - PDF content, handed unchanged to `PDFHandler.process`.
     * @param {object} [options] - Overrides for the defaults listed in the body;
     *   `onProgress` is invoked once per processed section.
     * @returns {Promise<object>} Summary counts, per-section results, the
     *   name -> entity-id map, statistics and selected PDF metadata.
     */
    async extract(buffer, options = {}) {
        const startTime = Date.now();
        // Defaults first so caller-supplied values win.
        const opts = {
            enableNeuralExtraction: true,
            enableRelationshipInference: true,
            enableConceptExtraction: true,
            confidenceThreshold: 0.6,
            minParagraphLength: 50,
            extractFromTables: true,
            groupBy: 'document',
            onProgress: () => { },
            ...options
        };
        // Parse PDF using existing handler (receives the caller's raw options).
        const processedData = await this.pdfHandler.process(buffer, options);
        const data = processedData.data ?? [];
        const pdfMetadata = processedData.metadata?.additionalInfo?.pdfMetadata || {};
        if (data.length === 0) {
            return this.emptyResult(startTime, pdfMetadata);
        }
        const grouped = this.groupData(data, opts);
        const sections = [];
        const entityMap = new Map();
        const stats = {
            byType: {},
            byConfidence: { high: 0, medium: 0, low: 0 },
            bySource: { paragraphs: 0, tables: 0 }
        };
        // Running totals avoid re-summing every section on each progress tick.
        let entityTotal = 0;
        let relationshipTotal = 0;
        for (const group of grouped) {
            const sectionResult = await this.processSection(group, opts, stats, entityMap);
            sections.push(sectionResult);
            entityTotal += sectionResult.entities.length;
            relationshipTotal += sectionResult.relationships.length;
            // Optional call: an explicit `onProgress: undefined` overrides the default no-op.
            opts.onProgress?.({
                processed: sections.length,
                total: grouped.length,
                entities: entityTotal,
                relationships: relationshipTotal
            });
        }
        // Items without a page number count as page 1, matching groupData().
        const pagesProcessed = new Set(data.map(item => item._page || 1)).size;
        return {
            sectionsProcessed: sections.length,
            pagesProcessed,
            entitiesExtracted: entityTotal,
            relationshipsInferred: relationshipTotal,
            sections,
            entityMap,
            processingTime: Date.now() - startTime,
            stats,
            pdfMetadata: {
                pageCount: pdfMetadata.pageCount || pagesProcessed,
                title: pdfMetadata.title,
                author: pdfMetadata.author,
                subject: pdfMetadata.subject
            }
        };
    }
    /**
     * Group data by strategy.
     *
     * @param {Array<object>} data - Parsed items; `_page` is read when grouping by page.
     * @param {object} options - Only `options.groupBy` is consulted.
     * @returns {Array<{id: string, type: string, items: Array<object>}>} One group per
     *   page (in first-seen order) when `groupBy === 'page'`, otherwise a single
     *   `document` group holding every item.
     */
    groupData(data, options) {
        if (options.groupBy !== 'page') {
            return [{
                    id: 'document',
                    type: 'paragraph',
                    items: data
                }];
        }
        const pageGroups = new Map();
        for (const item of data) {
            const page = item._page || 1;
            const bucket = pageGroups.get(page);
            if (bucket) {
                bucket.push(item);
            }
            else {
                pageGroups.set(page, [item]);
            }
        }
        return Array.from(pageGroups, ([page, items]) => ({
            id: `page_${page}`,
            type: 'page',
            items
        }));
    }
    /**
     * Process a single section: build its text, extract entities and concepts,
     * and infer relationships between entities that appear near each other.
     *
     * @param {{id: string, type: string, items: Array<object>}} group - Section to process.
     * @param {object} options - Extraction options (see extract()).
     * @param {object} stats - Running statistics, mutated in place.
     * @param {Map<string, string>} entityMap - Lower-cased entity text -> entity id; mutated.
     *   A later mention of the same text replaces the earlier id.
     * @returns {Promise<object>} Section id/type, entities, relationships, concepts and
     *   the first 1000 characters of the section text.
     */
    async processSection(group, options, stats, entityMap) {
        const combinedText = collectSectionText(group.items, options, stats);
        const hasText = combinedText.length > 0;
        let extractedEntities = [];
        if (options.enableNeuralExtraction && hasText) {
            extractedEntities = await this.extractor.extract(combinedText, {
                confidence: options.confidenceThreshold ?? 0.6, // an explicit 0 is a valid threshold
                neuralMatching: true,
                cache: { enabled: true }
            });
        }
        let concepts = [];
        if (options.enableConceptExtraction && hasText) {
            try {
                concepts = await this.brain.extractConcepts(combinedText, { limit: 15 });
            }
            catch (error) {
                // Concept tagging is best-effort: a failure leaves the section without concepts.
                concepts = [];
            }
        }
        const entities = [];
        for (const found of extractedEntities) {
            const entityId = this.generateEntityId(found.text, group.id);
            entityMap.set(found.text.toLowerCase(), entityId);
            this.updateStats(stats, found.type, found.confidence);
            entities.push({
                id: entityId,
                name: found.text,
                type: found.type,
                description: this.extractContextAroundEntity(combinedText, found.text),
                confidence: found.confidence,
                metadata: {
                    source: 'pdf',
                    section: group.id,
                    sectionType: group.type,
                    extractedAt: Date.now()
                }
            });
        }
        const relationships = [];
        if (options.enableRelationshipInference && entities.length > 1) {
            // Examine every unordered pair once; the earlier entity is the `from` side.
            for (let i = 0; i < entities.length; i++) {
                for (let j = i + 1; j < entities.length; j++) {
                    const entity1 = entities[i];
                    const entity2 = entities[j];
                    if (!this.entitiesAreRelated(combinedText, entity1.name, entity2.name)) {
                        continue;
                    }
                    const verbType = await this.inferRelationship(entity1.name, entity2.name, combinedText);
                    relationships.push({
                        from: entity1.id,
                        to: entity2.id,
                        type: verbType,
                        // The link is never more certain than its weaker endpoint, discounted by 10%.
                        confidence: Math.min(entity1.confidence, entity2.confidence) * 0.9,
                        evidence: this.extractRelationshipContext(combinedText, entity1.name, entity2.name)
                    });
                }
            }
        }
        return {
            sectionId: group.id,
            sectionType: group.type,
            entities,
            relationships,
            concepts,
            text: combinedText.substring(0, 1000) // Store first 1000 chars as preview
        };
    }
    /**
     * Extract context around an entity mention.
     *
     * @param {string} text - Text to search (case-insensitive, first occurrence).
     * @param {string} entityName - Entity text to locate.
     * @param {number} [contextLength=200] - Total context budget; half goes before the
     *   mention and half after it.
     * @returns {string} The trimmed window around the mention, or the first
     *   `contextLength` characters of `text` when the entity is not found.
     */
    extractContextAroundEntity(text, entityName, contextLength = 200) {
        const index = text.toLowerCase().indexOf(entityName.toLowerCase());
        if (index === -1) {
            return text.substring(0, contextLength);
        }
        const half = contextLength / 2;
        const start = Math.max(0, index - half);
        const end = Math.min(text.length, index + entityName.length + half);
        return text.substring(start, end).trim();
    }
    /**
     * Check if two entities are related based on proximity in text.
     *
     * Only the first occurrence of each entity is considered.
     *
     * @param {string} text - Text to search (case-insensitive).
     * @param {string} entity1 - First entity text.
     * @param {string} entity2 - Second entity text.
     * @returns {boolean} True when both are found and their first occurrences start
     *   fewer than 500 characters apart.
     */
    entitiesAreRelated(text, entity1, entity2) {
        const lowerText = text.toLowerCase();
        const index1 = lowerText.indexOf(entity1.toLowerCase());
        const index2 = lowerText.indexOf(entity2.toLowerCase());
        if (index1 === -1 || index2 === -1) {
            return false;
        }
        return Math.abs(index1 - index2) < 500;
    }
    /**
     * Extract context showing relationship between entities.
     *
     * @param {string} text - Text to search (case-insensitive, first occurrences).
     * @param {string} entity1 - First entity text.
     * @param {string} entity2 - Second entity text.
     * @returns {string} The trimmed span from the earlier mention to the end of the
     *   later one plus up to 100 trailing characters, or '' when either is missing.
     */
    extractRelationshipContext(text, entity1, entity2) {
        const lowerText = text.toLowerCase();
        const index1 = lowerText.indexOf(entity1.toLowerCase());
        const index2 = lowerText.indexOf(entity2.toLowerCase());
        if (index1 === -1 || index2 === -1) {
            return '';
        }
        const start = Math.min(index1, index2);
        const end = Math.max(index1 + entity1.length, index2 + entity2.length);
        return text.substring(start, end + 100).trim();
    }
    /**
     * Infer relationship type from context.
     *
     * Tries a fixed, ordered list of keyword patterns and returns the verb of the
     * first one that matches. Entity names are escaped before being placed in the
     * pattern, so names containing characters such as `+`, `(` or `.` are matched
     * literally instead of breaking or distorting the expression.
     *
     * @param {string} fromEntity - Source entity text.
     * @param {string} toEntity - Target entity text.
     * @param {string} context - Text in which to look for the pattern.
     * @returns {Promise<*>} The matching VerbType, or VerbType.RelatedTo when no
     *   pattern matches.
     */
    async inferRelationship(fromEntity, toEntity, context) {
        const lowerContext = context.toLowerCase();
        const from = escapeRegExpLiteral(fromEntity);
        const to = escapeRegExpLiteral(toEntity);
        // Order matters: the first matching pattern wins.
        const patterns = [
            [new RegExp(`${to}.*of.*${from}`, 'i'), VerbType.PartOf],
            [new RegExp(`${from}.*contains.*${to}`, 'i'), VerbType.Contains],
            [new RegExp(`${from}.*in.*${to}`, 'i'), VerbType.LocatedAt],
            [new RegExp(`${from}.*by.*${to}`, 'i'), VerbType.CreatedBy],
            [new RegExp(`${from}.*created.*${to}`, 'i'), VerbType.Creates],
            [new RegExp(`${from}.*authored.*${to}`, 'i'), VerbType.CreatedBy],
            [new RegExp(`${from}.*part of.*${to}`, 'i'), VerbType.PartOf],
            [new RegExp(`${from}.*related to.*${to}`, 'i'), VerbType.RelatedTo],
            [new RegExp(`${from}.*and.*${to}`, 'i'), VerbType.RelatedTo]
        ];
        const match = patterns.find(([pattern]) => pattern.test(lowerContext));
        return match ? match[1] : VerbType.RelatedTo;
    }
    /**
     * Generate an entity ID from the entity name, its section and the current time.
     *
     * The trailing timestamp means the same name yields a different id on a later call.
     *
     * @param {string} name - Entity text.
     * @param {string} section - Section id.
     * @returns {string} `ent_<name>_<section>_<Date.now()>` with whitespace runs
     *   replaced by underscores.
     */
    generateEntityId(name, section) {
        const normalized = name.toLowerCase().trim().replace(/\s+/g, '_');
        const sectionNorm = section.replace(/\s+/g, '_');
        return `ent_${normalized}_${sectionNorm}_${Date.now()}`;
    }
    /**
     * Update statistics.
     *
     * @param {object} stats - Statistics object; `byType` and `byConfidence` are mutated.
     * @param {*} type - Entity type (stringified to form the counter key).
     * @param {number} confidence - above 0.8 is high, 0.6 to 0.8 is medium, otherwise low.
     * @returns {void}
     */
    updateStats(stats, type, confidence) {
        const typeName = String(type);
        stats.byType[typeName] = (stats.byType[typeName] || 0) + 1;
        if (confidence > 0.8) {
            stats.byConfidence.high++;
        }
        else if (confidence >= 0.6) {
            stats.byConfidence.medium++;
        }
        else {
            stats.byConfidence.low++;
        }
    }
    /**
     * Create empty result (used when the parsed PDF yields no items).
     *
     * @param {number} startTime - `Date.now()` captured when extraction began.
     * @param {object} [pdfMetadata] - PDF metadata to echo back.
     * @returns {object} A result with zero counts and empty collections.
     */
    emptyResult(startTime, pdfMetadata = {}) {
        return {
            sectionsProcessed: 0,
            pagesProcessed: 0,
            entitiesExtracted: 0,
            relationshipsInferred: 0,
            sections: [],
            entityMap: new Map(),
            processingTime: Date.now() - startTime,
            stats: {
                byType: {},
                byConfidence: { high: 0, medium: 0, low: 0 },
                bySource: { paragraphs: 0, tables: 0 }
            },
            pdfMetadata: {
                pageCount: pdfMetadata.pageCount || 0,
                title: pdfMetadata.title,
                author: pdfMetadata.author,
                subject: pdfMetadata.subject
            }
        };
    }
}
|
|
337
|
+
//# sourceMappingURL=SmartPDFImporter.js.map
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* VFS Structure Generator
|
|
3
|
+
*
|
|
4
|
+
* Organizes imported entities into structured VFS directories
|
|
5
|
+
* - Type-based grouping (Place/, Character/, Concept/)
|
|
6
|
+
* - Metadata files (_metadata.json, _relationships.json)
|
|
7
|
+
* - Source file preservation
|
|
8
|
+
*
|
|
9
|
+
* NO MOCKS - Production-ready implementation
|
|
10
|
+
*/
|
|
11
|
+
import { Brainy } from '../brainy.js';
|
|
12
|
+
import type { SmartExcelResult } from './SmartExcelImporter.js';
|
|
13
|
+
export interface VFSStructureOptions {
    /** VFS path under which the whole import is written. */
    rootPath: string;
    /**
     * How entities are distributed into sub-directories.
     * The generator's grouping switch has explicit branches for 'type', 'flat' and
     * 'custom'; any other value (including 'sheet') falls through to its default branch.
     */
    groupBy: 'type' | 'sheet' | 'flat' | 'custom';
    /** Maps an entity to its group name; consulted when `groupBy` is 'custom'. */
    customGrouping?: (entity: any) => string;
    /** When true (and a buffer and filename are supplied) the original file is stored too. */
    preserveSource?: boolean;
    /** Raw bytes of the original file, needed for source preservation. */
    sourceBuffer?: Buffer;
    /** Name of the original file; also recorded in the generated JSON files. */
    sourceFilename?: string;
    /** Set to false to skip writing the relationships file. */
    createRelationshipFile?: boolean;
    /** Set to false to skip writing the metadata file. */
    createMetadataFile?: boolean;
}
|
|
31
|
+
export interface VFSStructureResult {
    /** The root path the structure was generated under. */
    rootPath: string;
    /** Every directory path the generator created or found already present. */
    directories: string[];
    /** One entry per file written, tagged with the role the file plays. */
    files: {
        path: string;
        entityId?: string;
        type: 'entity' | 'metadata' | 'source' | 'relationships';
    }[];
    /** Count of directory creations and file writes performed. */
    operations: number;
    /** Wall-clock duration of the generation run, in milliseconds. */
    duration: number;
}
|
|
47
|
+
/**
 * VFSStructureGenerator - writes an import result into the virtual file system
 * as per-group directories of entity files plus optional summary files.
 */
export declare class VFSStructureGenerator {
    private brain;
    private vfs;
    constructor(brain: Brainy);
    /**
     * Prepare the generator for use.
     */
    init(): Promise<void>;
    /**
     * Write the VFS structure for `importResult` according to `options`
     * and report what was created.
     */
    generate(importResult: SmartExcelResult, options: VFSStructureOptions): Promise<VFSStructureResult>;
    /**
     * Bucket entities according to the selected grouping strategy.
     */
    private groupEntities;
    /**
     * Resolve the directory name used for an entity type.
     */
    private getTypeGroupName;
    /**
     * Turn an entity name into a safe file name.
     */
    private sanitizeFilename;
    /**
     * Derive the extension of a file name.
     */
    private getExtension;
    /**
     * Count items per value of a given property.
     */
    private groupByType;
}
|
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* VFS Structure Generator
|
|
3
|
+
*
|
|
4
|
+
* Organizes imported entities into structured VFS directories
|
|
5
|
+
* - Type-based grouping (Place/, Character/, Concept/)
|
|
6
|
+
* - Metadata files (_metadata.json, _relationships.json)
|
|
7
|
+
* - Source file preservation
|
|
8
|
+
*
|
|
9
|
+
* NO MOCKS - Production-ready implementation
|
|
10
|
+
*/
|
|
11
|
+
import { VirtualFileSystem } from '../vfs/VirtualFileSystem.js';
|
|
12
|
+
import { NounType } from '../types/graphTypes.js';
|
|
13
|
+
/**
 * Create `path` in `vfs` and record it in `result`.
 *
 * A successful creation counts as one operation. An `EEXIST` failure is tolerated:
 * the directory is still recorded, but no operation is counted. Any other error
 * is rethrown.
 *
 * @param {object} vfs - Object exposing `mkdir(path, options)`.
 * @param {string} path - Directory to create (recursively).
 * @param {{directories: string[], operations: number}} result - Mutated in place.
 * @returns {Promise<void>}
 */
async function createTrackedDirectory(vfs, path, result) {
    try {
        await vfs.mkdir(path, { recursive: true });
        result.operations++;
    }
    catch (error) {
        if (error?.code !== 'EEXIST') {
            throw error;
        }
    }
    result.directories.push(path);
}
/**
 * Count relationships per confidence band in a single pass.
 *
 * @param {Array<{confidence: number}>} relationships - Relationships to classify.
 * @returns {{high: number, medium: number, low: number}} above 0.8 is high,
 *   0.6 to 0.8 inclusive is medium, below 0.6 is low; non-comparable values are not counted.
 */
function countByConfidenceBand(relationships) {
    const bands = { high: 0, medium: 0, low: 0 };
    for (const { confidence } of relationships) {
        if (confidence > 0.8) {
            bands.high++;
        }
        else if (confidence >= 0.6) {
            bands.medium++;
        }
        else if (confidence < 0.6) {
            bands.low++;
        }
    }
    return bands;
}
/**
 * VFSStructureGenerator - Organizes imported data into VFS
 *
 * Writes one JSON file per imported entity into a group directory below a root
 * path, and optionally a copy of the source file, a `_relationships.json`
 * summary and a `_metadata.json` description of the import.
 */
export class VFSStructureGenerator {
    /**
     * @param {object} brain - Brainy instance backing the virtual file system.
     */
    constructor(brain) {
        this.brain = brain;
        this.vfs = new VirtualFileSystem(brain);
    }
    /**
     * Initialize the generator.
     *
     * Probes the VFS root; if the probe throws for any reason the VFS is initialised.
     *
     * @returns {Promise<void>}
     */
    async init() {
        try {
            await this.vfs.stat('/');
        }
        catch (error) {
            // A failed probe is treated as "not initialised yet".
            await this.vfs.init();
        }
    }
    /**
     * Generate VFS structure from import result.
     *
     * @param {object} importResult - Import result; `rows` (each with `entity` and
     *   optionally `relationships`, `concepts`, `relatedEntities`) and the summary
     *   counters/`stats` are read.
     * @param {object} options - Structure options; `rootPath` and `groupBy` drive the
     *   layout, the remaining flags toggle the optional files.
     * @returns {Promise<{rootPath: string, directories: string[], files: Array<object>, operations: number, duration: number}>}
     *   What was created and how long it took.
     * @throws Rethrows any directory-creation error other than `EEXIST`, and any
     *   error raised while writing a file.
     */
    async generate(importResult, options) {
        const startTime = Date.now();
        const { rootPath } = options;
        const result = {
            rootPath,
            directories: [],
            files: [],
            operations: 0,
            duration: 0
        };
        await this.init();
        await createTrackedDirectory(this.vfs, rootPath, result);
        // Preserve source file if requested
        if (options.preserveSource && options.sourceBuffer && options.sourceFilename) {
            const sourcePath = `${rootPath}/_source${this.getExtension(options.sourceFilename)}`;
            await this.vfs.writeFile(sourcePath, options.sourceBuffer);
            result.files.push({ path: sourcePath, type: 'source' });
            result.operations++;
        }
        const groups = this.groupEntities(importResult, options);
        for (const [groupName, members] of groups) {
            const groupPath = `${rootPath}/${groupName}`;
            await createTrackedDirectory(this.vfs, groupPath, result);
            for (const extracted of members) {
                const { entity } = extracted;
                // NOTE(review): two entities in one group whose names sanitize to the same
                // string are written to the same path, so the later one replaces the earlier.
                const entityPath = `${groupPath}/${this.sanitizeFilename(entity.name)}.json`;
                const entityJson = {
                    id: entity.id,
                    name: entity.name,
                    type: entity.type,
                    description: entity.description,
                    confidence: entity.confidence,
                    metadata: entity.metadata,
                    concepts: extracted.concepts || [],
                    relatedEntities: extracted.relatedEntities,
                    // A row without a relationships array is written with an empty list.
                    relationships: (extracted.relationships ?? []).map(rel => ({
                        from: rel.from,
                        to: rel.to,
                        type: rel.type,
                        confidence: rel.confidence,
                        evidence: rel.evidence
                    }))
                };
                await this.vfs.writeFile(entityPath, JSON.stringify(entityJson, null, 2));
                result.files.push({
                    path: entityPath,
                    entityId: entity.id,
                    type: 'entity'
                });
                result.operations++;
            }
        }
        // Create relationships file (written unless explicitly disabled)
        if (options.createRelationshipFile !== false) {
            const relationshipsPath = `${rootPath}/_relationships.json`;
            const allRelationships = importResult.rows.flatMap(row => row.relationships ?? []);
            const relationshipsJson = {
                source: options.sourceFilename || 'unknown',
                count: allRelationships.length,
                relationships: allRelationships,
                stats: {
                    byType: this.groupByType(allRelationships, 'type'),
                    byConfidence: countByConfidenceBand(allRelationships)
                }
            };
            await this.vfs.writeFile(relationshipsPath, JSON.stringify(relationshipsJson, null, 2));
            result.files.push({ path: relationshipsPath, type: 'relationships' });
            result.operations++;
        }
        // Create metadata file (written unless explicitly disabled)
        if (options.createMetadataFile !== false) {
            const metadataPath = `${rootPath}/_metadata.json`;
            const metadataJson = {
                import: {
                    timestamp: new Date().toISOString(),
                    source: {
                        filename: options.sourceFilename || 'unknown',
                        format: 'excel'
                    },
                    options: {
                        groupBy: options.groupBy,
                        preserveSource: options.preserveSource
                    },
                    stats: {
                        rowsProcessed: importResult.rowsProcessed,
                        entitiesExtracted: importResult.entitiesExtracted,
                        relationshipsInferred: importResult.relationshipsInferred,
                        processingTime: importResult.processingTime,
                        byType: importResult.stats.byType,
                        byConfidence: importResult.stats.byConfidence
                    }
                },
                structure: {
                    rootPath,
                    groupingStrategy: options.groupBy,
                    directories: result.directories,
                    // Counted before this metadata file itself is recorded.
                    fileCount: result.files.length
                }
            };
            await this.vfs.writeFile(metadataPath, JSON.stringify(metadataJson, null, 2));
            result.files.push({ path: metadataPath, type: 'metadata' });
            result.operations++;
        }
        result.duration = Date.now() - startTime;
        return result;
    }
    /**
     * Group entities by strategy.
     *
     * @param {object} importResult - Import result whose `rows` are grouped.
     * @param {object} options - `groupBy` selects the strategy: 'type' maps the entity
     *   type to a directory name, 'custom' calls `customGrouping(entity)` when it is
     *   provided, and every other value puts all rows into `entities`.
     * @returns {Map<string, Array<object>>} Group name -> rows, in first-seen order.
     */
    groupEntities(importResult, options) {
        const groups = new Map();
        for (const extracted of importResult.rows) {
            let groupName = 'entities';
            if (options.groupBy === 'type') {
                groupName = this.getTypeGroupName(extracted.entity.type);
            }
            else if (options.groupBy === 'custom' && options.customGrouping) {
                groupName = options.customGrouping(extracted.entity);
            }
            const bucket = groups.get(groupName);
            if (bucket) {
                bucket.push(extracted);
            }
            else {
                groups.set(groupName, [extracted]);
            }
        }
        return groups;
    }
    /**
     * Get directory name for entity type.
     *
     * @param {*} type - A NounType value.
     * @returns {string} The mapped directory name, or 'Other' for unmapped types.
     */
    getTypeGroupName(type) {
        const typeMap = {
            [NounType.Person]: 'Characters',
            [NounType.Location]: 'Places',
            [NounType.Organization]: 'Organizations',
            [NounType.Concept]: 'Concepts',
            [NounType.Event]: 'Events',
            [NounType.Product]: 'Items',
            [NounType.Document]: 'Documents',
            [NounType.Project]: 'Projects',
            [NounType.Thing]: 'Other'
        };
        return typeMap[type] || 'Other';
    }
    /**
     * Sanitize filename.
     *
     * Replaces `< > : " / \ | ? *` and whitespace runs with underscores, collapses
     * repeated underscores and truncates to 200 characters. Non-string names are
     * stringified, and a name that ends up empty becomes 'unnamed' so the file
     * never has an empty base name.
     *
     * @param {*} name - Entity name.
     * @returns {string} A non-empty file-name-safe string of at most 200 characters.
     */
    sanitizeFilename(name) {
        const cleaned = String(name ?? '')
            .replace(/[<>:"/\\|?*]/g, '_')
            .replace(/\s+/g, '_')
            .replace(/_{2,}/g, '_')
            .substring(0, 200);
        return cleaned === '' ? 'unnamed' : cleaned;
    }
    /**
     * Get file extension.
     *
     * Only the final path segment is inspected, so a dot in a directory name is
     * ignored. A leading dot (dotfile) or a trailing dot is not an extension.
     *
     * @param {string} filename - File name or path.
     * @returns {string} The extension including its dot, or '.bin' when there is none.
     */
    getExtension(filename) {
        const name = String(filename);
        const segmentStart = Math.max(name.lastIndexOf('/'), name.lastIndexOf('\\')) + 1;
        const base = name.substring(segmentStart);
        const lastDot = base.lastIndexOf('.');
        if (lastDot <= 0 || lastDot === base.length - 1) {
            return '.bin';
        }
        return base.substring(lastDot);
    }
    /**
     * Group items by property.
     *
     * @param {Array<object>} items - Items to tally.
     * @param {string} property - Property whose stringified value forms the key.
     * @returns {Object<string, number>} Occurrence count per property value.
     */
    groupByType(items, property) {
        const groups = {};
        for (const item of items) {
            const key = String(item[property]);
            groups[key] = (groups[key] || 0) + 1;
        }
        return groups;
    }
}
|
|
260
|
+
//# sourceMappingURL=VFSStructureGenerator.js.map
|