@soulcraft/brainy 3.27.1 → 3.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. package/CHANGELOG.md +12 -0
  2. package/dist/brainy.d.ts +50 -0
  3. package/dist/brainy.js +54 -2
  4. package/dist/config/storageAutoConfig.d.ts +2 -1
  5. package/dist/config/storageAutoConfig.js +5 -4
  6. package/dist/import/EntityDeduplicator.d.ts +84 -0
  7. package/dist/import/EntityDeduplicator.js +255 -0
  8. package/dist/import/FormatDetector.d.ts +65 -0
  9. package/dist/import/FormatDetector.js +263 -0
  10. package/dist/import/ImportCoordinator.d.ts +160 -0
  11. package/dist/import/ImportCoordinator.js +498 -0
  12. package/dist/import/ImportHistory.d.ts +92 -0
  13. package/dist/import/ImportHistory.js +183 -0
  14. package/dist/import/index.d.ts +16 -0
  15. package/dist/import/index.js +14 -0
  16. package/dist/importers/SmartCSVImporter.d.ts +136 -0
  17. package/dist/importers/SmartCSVImporter.js +308 -0
  18. package/dist/importers/SmartExcelImporter.d.ts +131 -0
  19. package/dist/importers/SmartExcelImporter.js +302 -0
  20. package/dist/importers/SmartImportOrchestrator.d.ts +125 -0
  21. package/dist/importers/SmartImportOrchestrator.js +531 -0
  22. package/dist/importers/SmartJSONImporter.d.ts +135 -0
  23. package/dist/importers/SmartJSONImporter.js +325 -0
  24. package/dist/importers/SmartMarkdownImporter.d.ts +159 -0
  25. package/dist/importers/SmartMarkdownImporter.js +369 -0
  26. package/dist/importers/SmartPDFImporter.d.ts +154 -0
  27. package/dist/importers/SmartPDFImporter.js +337 -0
  28. package/dist/importers/VFSStructureGenerator.d.ts +82 -0
  29. package/dist/importers/VFSStructureGenerator.js +260 -0
  30. package/dist/importers/index.d.ts +28 -0
  31. package/dist/importers/index.js +29 -0
  32. package/package.json +1 -1
@@ -0,0 +1,369 @@
1
+ /**
2
+ * Smart Markdown Importer
3
+ *
4
+ * Extracts entities and relationships from Markdown files using:
5
+ * - Heading structure for entity organization
6
+ * - Link relationships
7
+ * - NeuralEntityExtractor for entity extraction from text
8
+ * - Section-based grouping
9
+ *
10
+ * NO MOCKS - Production-ready implementation
11
+ */
12
+ import { NeuralEntityExtractor } from '../neural/entityExtractor.js';
13
+ import { NaturalLanguageProcessor } from '../neural/naturalLanguageProcessor.js';
14
+ import { NounType, VerbType } from '../types/graphTypes.js';
15
+ /**
16
+ * SmartMarkdownImporter - Extracts structured knowledge from Markdown files
17
+ */
18
+ export class SmartMarkdownImporter {
19
+ constructor(brain) {
20
+ this.brain = brain;
21
+ this.extractor = new NeuralEntityExtractor(brain);
22
+ this.nlp = new NaturalLanguageProcessor(brain);
23
+ }
24
+ /**
25
+ * Initialize the importer
26
+ */
27
+ async init() {
28
+ await this.nlp.init();
29
+ }
30
+ /**
31
+ * Extract entities and relationships from Markdown content
32
+ */
33
+ async extract(markdown, options = {}) {
34
+ const startTime = Date.now();
35
+ // Set defaults
36
+ const opts = {
37
+ enableNeuralExtraction: true,
38
+ enableRelationshipInference: true,
39
+ enableConceptExtraction: true,
40
+ confidenceThreshold: 0.6,
41
+ extractCodeBlocks: true,
42
+ minSectionLength: 50,
43
+ groupByHeading: true,
44
+ onProgress: () => { },
45
+ ...options
46
+ };
47
+ // Parse markdown into sections
48
+ const parsedSections = this.parseMarkdown(markdown, opts);
49
+ // Process each section
50
+ const sections = [];
51
+ const entityMap = new Map();
52
+ const stats = {
53
+ byType: {},
54
+ byHeadingLevel: {},
55
+ byConfidence: { high: 0, medium: 0, low: 0 },
56
+ linksFound: 0,
57
+ codeBlocksFound: 0
58
+ };
59
+ for (let i = 0; i < parsedSections.length; i++) {
60
+ const parsed = parsedSections[i];
61
+ const section = await this.processSection(parsed, opts, stats, entityMap);
62
+ sections.push(section);
63
+ opts.onProgress({
64
+ processed: i + 1,
65
+ total: parsedSections.length,
66
+ entities: sections.reduce((sum, s) => sum + s.entities.length, 0),
67
+ relationships: sections.reduce((sum, s) => sum + s.relationships.length, 0)
68
+ });
69
+ }
70
+ return {
71
+ sectionsProcessed: sections.length,
72
+ entitiesExtracted: sections.reduce((sum, s) => sum + s.entities.length, 0),
73
+ relationshipsInferred: sections.reduce((sum, s) => sum + s.relationships.length, 0),
74
+ sections,
75
+ entityMap,
76
+ processingTime: Date.now() - startTime,
77
+ stats
78
+ };
79
+ }
80
+ /**
81
+ * Parse markdown into sections
82
+ */
83
+ parseMarkdown(markdown, options) {
84
+ const lines = markdown.split('\n');
85
+ const sections = [];
86
+ let currentSection = {
87
+ heading: null,
88
+ level: 0,
89
+ lines: []
90
+ };
91
+ let sectionCounter = 0;
92
+ for (const line of lines) {
93
+ // Check for heading
94
+ const headingMatch = line.match(/^(#{1,6})\s+(.+)$/);
95
+ if (headingMatch) {
96
+ // Save current section if it has content
97
+ if (currentSection.lines.length > 0) {
98
+ const content = currentSection.lines.join('\n').trim();
99
+ if (content.length >= (options.minSectionLength || 50)) {
100
+ sections.push({
101
+ id: `section_${sectionCounter++}`,
102
+ heading: currentSection.heading,
103
+ level: currentSection.level,
104
+ content
105
+ });
106
+ }
107
+ }
108
+ // Start new section
109
+ const level = headingMatch[1].length;
110
+ const heading = headingMatch[2].trim();
111
+ currentSection = {
112
+ heading,
113
+ level,
114
+ lines: []
115
+ };
116
+ }
117
+ else {
118
+ currentSection.lines.push(line);
119
+ }
120
+ }
121
+ // Add last section
122
+ if (currentSection.lines.length > 0) {
123
+ const content = currentSection.lines.join('\n').trim();
124
+ if (content.length >= (options.minSectionLength || 50)) {
125
+ sections.push({
126
+ id: `section_${sectionCounter}`,
127
+ heading: currentSection.heading,
128
+ level: currentSection.level,
129
+ content
130
+ });
131
+ }
132
+ }
133
+ return sections;
134
+ }
135
+ /**
136
+ * Process a single section
137
+ */
138
+ async processSection(parsed, options, stats, entityMap) {
139
+ // Track heading level
140
+ stats.byHeadingLevel[parsed.level] = (stats.byHeadingLevel[parsed.level] || 0) + 1;
141
+ // Extract links
142
+ const links = this.extractLinks(parsed.content);
143
+ stats.linksFound += links.length;
144
+ // Extract code blocks
145
+ const codeBlocks = options.extractCodeBlocks ? this.extractCodeBlocks(parsed.content) : [];
146
+ stats.codeBlocksFound += codeBlocks.length;
147
+ // Remove code blocks from content for entity extraction
148
+ const contentWithoutCode = this.removeCodeBlocks(parsed.content);
149
+ // Extract entities
150
+ let extractedEntities = [];
151
+ if (options.enableNeuralExtraction && contentWithoutCode.length > 0) {
152
+ extractedEntities = await this.extractor.extract(contentWithoutCode, {
153
+ confidence: options.confidenceThreshold || 0.6,
154
+ neuralMatching: true,
155
+ cache: { enabled: true }
156
+ });
157
+ }
158
+ // If section has a heading, treat it as an entity
159
+ if (parsed.heading) {
160
+ const headingEntity = {
161
+ text: parsed.heading,
162
+ type: this.inferTypeFromHeading(parsed.heading, parsed.level),
163
+ confidence: 0.9,
164
+ position: { start: 0, end: parsed.heading.length }
165
+ };
166
+ extractedEntities.unshift(headingEntity);
167
+ }
168
+ // Extract concepts
169
+ let concepts = [];
170
+ if (options.enableConceptExtraction && contentWithoutCode.length > 0) {
171
+ try {
172
+ concepts = await this.brain.extractConcepts(contentWithoutCode, { limit: 10 });
173
+ }
174
+ catch (error) {
175
+ concepts = [];
176
+ }
177
+ }
178
+ // Create entity objects
179
+ const entities = extractedEntities.map(e => {
180
+ const entityId = this.generateEntityId(e.text, parsed.id);
181
+ entityMap.set(e.text.toLowerCase(), entityId);
182
+ // Update statistics
183
+ this.updateStats(stats, e.type, e.confidence);
184
+ return {
185
+ id: entityId,
186
+ name: e.text,
187
+ type: e.type,
188
+ description: contentWithoutCode.substring(0, 200),
189
+ confidence: e.confidence,
190
+ metadata: {
191
+ source: 'markdown',
192
+ section: parsed.id,
193
+ heading: parsed.heading,
194
+ level: parsed.level,
195
+ extractedAt: Date.now()
196
+ }
197
+ };
198
+ });
199
+ // Infer relationships
200
+ const relationships = [];
201
+ // Link-based relationships
202
+ if (options.enableRelationshipInference) {
203
+ for (const link of links) {
204
+ // Find entity that might be the source
205
+ const sourceEntity = entities.find(e => contentWithoutCode.toLowerCase().includes(e.name.toLowerCase()));
206
+ if (sourceEntity) {
207
+ // Create relationship to linked entity
208
+ const targetId = this.generateEntityId(link.text, 'link');
209
+ relationships.push({
210
+ from: sourceEntity.id,
211
+ to: link.text,
212
+ type: VerbType.References,
213
+ confidence: 0.85,
214
+ evidence: `Markdown link: [${link.text}](${link.url})`
215
+ });
216
+ }
217
+ }
218
+ // Entity proximity-based relationships
219
+ for (let i = 0; i < entities.length; i++) {
220
+ for (let j = i + 1; j < entities.length; j++) {
221
+ const entity1 = entities[i];
222
+ const entity2 = entities[j];
223
+ if (this.entitiesAreRelated(contentWithoutCode, entity1.name, entity2.name)) {
224
+ const verbType = await this.inferRelationship(entity1.name, entity2.name, contentWithoutCode);
225
+ relationships.push({
226
+ from: entity1.id,
227
+ to: entity2.id,
228
+ type: verbType,
229
+ confidence: Math.min(entity1.confidence, entity2.confidence) * 0.8,
230
+ evidence: `Co-occurrence in section: ${parsed.heading || parsed.id}`
231
+ });
232
+ }
233
+ }
234
+ }
235
+ }
236
+ return {
237
+ id: parsed.id,
238
+ heading: parsed.heading,
239
+ level: parsed.level,
240
+ content: parsed.content,
241
+ entities,
242
+ links,
243
+ codeBlocks,
244
+ relationships,
245
+ concepts
246
+ };
247
+ }
248
+ /**
249
+ * Extract markdown links
250
+ */
251
+ extractLinks(content) {
252
+ const links = [];
253
+ const linkRegex = /\[([^\]]+)\]\(([^)]+)\)/g;
254
+ let match;
255
+ while ((match = linkRegex.exec(content)) !== null) {
256
+ const text = match[1];
257
+ const url = match[2];
258
+ const type = url.startsWith('http') ? 'external' : 'internal';
259
+ links.push({ text, url, type });
260
+ }
261
+ return links;
262
+ }
263
+ /**
264
+ * Extract code blocks
265
+ */
266
+ extractCodeBlocks(content) {
267
+ const codeBlocks = [];
268
+ const codeBlockRegex = /```(\w+)?\n([\s\S]*?)```/g;
269
+ let match;
270
+ while ((match = codeBlockRegex.exec(content)) !== null) {
271
+ const language = match[1] || 'text';
272
+ const code = match[2].trim();
273
+ codeBlocks.push({ language, code });
274
+ }
275
+ return codeBlocks;
276
+ }
277
+ /**
278
+ * Remove code blocks from content
279
+ */
280
+ removeCodeBlocks(content) {
281
+ return content.replace(/```[\s\S]*?```/g, '');
282
+ }
283
+ /**
284
+ * Infer type from heading
285
+ */
286
+ inferTypeFromHeading(heading, level) {
287
+ const lower = heading.toLowerCase();
288
+ if (lower.includes('person') || lower.includes('people') || lower.includes('author') || lower.includes('user')) {
289
+ return NounType.Person;
290
+ }
291
+ if (lower.includes('location') || lower.includes('place')) {
292
+ return NounType.Location;
293
+ }
294
+ if (lower.includes('organization') || lower.includes('company')) {
295
+ return NounType.Organization;
296
+ }
297
+ if (lower.includes('event')) {
298
+ return NounType.Event;
299
+ }
300
+ if (lower.includes('project')) {
301
+ return NounType.Project;
302
+ }
303
+ if (lower.includes('document') || lower.includes('file')) {
304
+ return NounType.Document;
305
+ }
306
+ // Top-level headings are often concepts/topics
307
+ if (level <= 2) {
308
+ return NounType.Concept;
309
+ }
310
+ return NounType.Thing;
311
+ }
312
+ /**
313
+ * Check if entities are related by proximity
314
+ */
315
+ entitiesAreRelated(text, entity1, entity2) {
316
+ const lowerText = text.toLowerCase();
317
+ const index1 = lowerText.indexOf(entity1.toLowerCase());
318
+ const index2 = lowerText.indexOf(entity2.toLowerCase());
319
+ if (index1 === -1 || index2 === -1)
320
+ return false;
321
+ return Math.abs(index1 - index2) < 300;
322
+ }
323
+ /**
324
+ * Infer relationship type from context
325
+ */
326
+ async inferRelationship(fromEntity, toEntity, context) {
327
+ const lowerContext = context.toLowerCase();
328
+ const patterns = [
329
+ [new RegExp(`${toEntity}.*of.*${fromEntity}`, 'i'), VerbType.PartOf],
330
+ [new RegExp(`${fromEntity}.*contains.*${toEntity}`, 'i'), VerbType.Contains],
331
+ [new RegExp(`${fromEntity}.*in.*${toEntity}`, 'i'), VerbType.LocatedAt],
332
+ [new RegExp(`${fromEntity}.*created.*${toEntity}`, 'i'), VerbType.Creates],
333
+ [new RegExp(`${fromEntity}.*and.*${toEntity}`, 'i'), VerbType.RelatedTo]
334
+ ];
335
+ for (const [pattern, verbType] of patterns) {
336
+ if (pattern.test(lowerContext)) {
337
+ return verbType;
338
+ }
339
+ }
340
+ return VerbType.RelatedTo;
341
+ }
342
+ /**
343
+ * Generate consistent entity ID
344
+ */
345
+ generateEntityId(name, section) {
346
+ const normalized = name.toLowerCase().trim().replace(/\s+/g, '_');
347
+ const sectionNorm = section.replace(/\s+/g, '_');
348
+ return `ent_${normalized}_${sectionNorm}_${Date.now()}`;
349
+ }
350
+ /**
351
+ * Update statistics
352
+ */
353
+ updateStats(stats, type, confidence) {
354
+ // Track by type
355
+ const typeName = String(type);
356
+ stats.byType[typeName] = (stats.byType[typeName] || 0) + 1;
357
+ // Track by confidence
358
+ if (confidence > 0.8) {
359
+ stats.byConfidence.high++;
360
+ }
361
+ else if (confidence >= 0.6) {
362
+ stats.byConfidence.medium++;
363
+ }
364
+ else {
365
+ stats.byConfidence.low++;
366
+ }
367
+ }
368
+ }
369
+ //# sourceMappingURL=SmartMarkdownImporter.js.map
@@ -0,0 +1,154 @@
1
+ /**
2
+ * Smart PDF Importer
3
+ *
4
+ * Extracts entities and relationships from PDF files using:
5
+ * - NeuralEntityExtractor for entity extraction
6
+ * - NaturalLanguageProcessor for relationship inference
7
+ * - brain.extractConcepts() for tagging
8
+ * - Section-based organization (by page or detected structure)
9
+ *
10
+ * NO MOCKS - Production-ready implementation
11
+ */
12
+ import { Brainy } from '../brainy.js';
13
+ import { NounType, VerbType } from '../types/graphTypes.js';
14
+ import type { FormatHandlerOptions } from '../augmentations/intelligentImport/types.js';
15
export interface SmartPDFOptions extends FormatHandlerOptions {
    /** Turns neural entity extraction on or off. */
    enableNeuralExtraction?: boolean;
    /** Turns relationship inference from text on or off. */
    enableRelationshipInference?: boolean;
    /** Turns concept extraction (used for tagging) on or off. */
    enableConceptExtraction?: boolean;
    /** Confidence threshold for entities, in the range 0-1. */
    confidenceThreshold?: number;
    /** Minimum paragraph length, in characters, for a paragraph to be processed. */
    minParagraphLength?: number;
    /** Whether to extract from tables as well. */
    extractFromTables?: boolean;
    /** Grouping strategy: per page, or the document as a whole. */
    groupBy?: 'page' | 'document';
    /** Progress callback; receives running counters for the extraction. */
    onProgress?: (stats: {
        processed: number;
        total: number;
        entities: number;
        relationships: number;
    }) => void;
}
38
export interface ExtractedSection {
    /** Identifier of the section (a page number or a section name). */
    sectionId: string;
    /** Kind of section. */
    sectionType: 'page' | 'paragraph' | 'table';
    /** Entities extracted from this section. */
    entities: {
        id: string;
        name: string;
        type: NounType;
        description: string;
        confidence: number;
        metadata: Record<string, any>;
    }[];
    /** Relationships inferred within this section. */
    relationships: {
        from: string;
        to: string;
        type: VerbType;
        confidence: number;
        evidence: string;
    }[];
    /** Concepts extracted for this section, when present. */
    concepts?: string[];
    /** The section's original text. */
    text: string;
}
65
export interface SmartPDFResult {
    /** Number of sections processed. */
    sectionsProcessed: number;
    /** Number of pages processed. */
    pagesProcessed: number;
    /** Number of entities extracted. */
    entitiesExtracted: number;
    /** Number of relationships inferred. */
    relationshipsInferred: number;
    /** Every extracted section. */
    sections: ExtractedSection[];
    /** Mapping from entity name to entity ID. */
    entityMap: Map<string, string>;
    /** Processing time in milliseconds. */
    processingTime: number;
    /** Extraction statistics, broken down by type, confidence bucket and source. */
    stats: {
        byType: Record<string, number>;
        byConfidence: { high: number; medium: number; low: number; };
        bySource: { paragraphs: number; tables: number; };
    };
    /** Metadata of the PDF itself. */
    pdfMetadata: {
        pageCount: number;
        title?: string;
        author?: string;
        subject?: string;
    };
}
101
/**
 * SmartPDFImporter - Extracts structured knowledge from PDF files.
 *
 * Declaration only: this file describes the class's shape; the
 * implementation is in the accompanying compiled JavaScript module.
 */
export declare class SmartPDFImporter {
    private brain;
    private extractor;
    private nlp;
    private pdfHandler;
    constructor(brain: Brainy);
    /** Initializes the importer; resolves once initialization is complete. */
    init(): Promise<void>;
    /**
     * Extracts entities and relationships from a PDF file.
     *
     * @param buffer The PDF file contents.
     * @param options Optional extraction settings.
     * @returns A promise for the extraction result.
     */
    extract(buffer: Buffer, options?: SmartPDFOptions): Promise<SmartPDFResult>;
    /** Groups data according to the grouping strategy. */
    private groupData;
    /** Processes a single section. */
    private processSection;
    /** Extracts the context surrounding a mention of an entity. */
    private extractContextAroundEntity;
    /** Decides whether two entities are related, based on proximity in the text. */
    private entitiesAreRelated;
    /** Extracts the context that shows the relationship between entities. */
    private extractRelationshipContext;
    /** Infers the relationship type from context. */
    private inferRelationship;
    /** Generates an entity ID. */
    private generateEntityId;
    /** Updates the statistics. */
    private updateStats;
    /** Creates an empty result. */
    private emptyResult;
}