@soulcraft/brainy 3.27.1 → 3.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/dist/brainy.d.ts +50 -0
- package/dist/brainy.js +54 -2
- package/dist/config/storageAutoConfig.d.ts +2 -1
- package/dist/config/storageAutoConfig.js +5 -4
- package/dist/import/EntityDeduplicator.d.ts +84 -0
- package/dist/import/EntityDeduplicator.js +255 -0
- package/dist/import/FormatDetector.d.ts +65 -0
- package/dist/import/FormatDetector.js +263 -0
- package/dist/import/ImportCoordinator.d.ts +160 -0
- package/dist/import/ImportCoordinator.js +498 -0
- package/dist/import/ImportHistory.d.ts +92 -0
- package/dist/import/ImportHistory.js +183 -0
- package/dist/import/index.d.ts +16 -0
- package/dist/import/index.js +14 -0
- package/dist/importers/SmartCSVImporter.d.ts +136 -0
- package/dist/importers/SmartCSVImporter.js +308 -0
- package/dist/importers/SmartExcelImporter.d.ts +131 -0
- package/dist/importers/SmartExcelImporter.js +302 -0
- package/dist/importers/SmartImportOrchestrator.d.ts +125 -0
- package/dist/importers/SmartImportOrchestrator.js +531 -0
- package/dist/importers/SmartJSONImporter.d.ts +135 -0
- package/dist/importers/SmartJSONImporter.js +325 -0
- package/dist/importers/SmartMarkdownImporter.d.ts +159 -0
- package/dist/importers/SmartMarkdownImporter.js +369 -0
- package/dist/importers/SmartPDFImporter.d.ts +154 -0
- package/dist/importers/SmartPDFImporter.js +337 -0
- package/dist/importers/VFSStructureGenerator.d.ts +82 -0
- package/dist/importers/VFSStructureGenerator.js +260 -0
- package/dist/importers/index.d.ts +28 -0
- package/dist/importers/index.js +29 -0
- package/package.json +1 -1
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Smart Markdown Importer
|
|
3
|
+
*
|
|
4
|
+
* Extracts entities and relationships from Markdown files using:
|
|
5
|
+
* - Heading structure for entity organization
|
|
6
|
+
* - Link relationships
|
|
7
|
+
* - NeuralEntityExtractor for entity extraction from text
|
|
8
|
+
* - Section-based grouping
|
|
9
|
+
*
|
|
10
|
+
* NO MOCKS - Production-ready implementation
|
|
11
|
+
*/
|
|
12
|
+
import { NeuralEntityExtractor } from '../neural/entityExtractor.js';
|
|
13
|
+
import { NaturalLanguageProcessor } from '../neural/naturalLanguageProcessor.js';
|
|
14
|
+
import { NounType, VerbType } from '../types/graphTypes.js';
|
|
15
|
+
/**
|
|
16
|
+
* SmartMarkdownImporter - Extracts structured knowledge from Markdown files
|
|
17
|
+
*/
|
|
18
|
+
export class SmartMarkdownImporter {
|
|
19
|
+
constructor(brain) {
|
|
20
|
+
this.brain = brain;
|
|
21
|
+
this.extractor = new NeuralEntityExtractor(brain);
|
|
22
|
+
this.nlp = new NaturalLanguageProcessor(brain);
|
|
23
|
+
}
|
|
24
|
+
/**
|
|
25
|
+
* Initialize the importer
|
|
26
|
+
*/
|
|
27
|
+
async init() {
|
|
28
|
+
await this.nlp.init();
|
|
29
|
+
}
|
|
30
|
+
/**
|
|
31
|
+
* Extract entities and relationships from Markdown content
|
|
32
|
+
*/
|
|
33
|
+
async extract(markdown, options = {}) {
|
|
34
|
+
const startTime = Date.now();
|
|
35
|
+
// Set defaults
|
|
36
|
+
const opts = {
|
|
37
|
+
enableNeuralExtraction: true,
|
|
38
|
+
enableRelationshipInference: true,
|
|
39
|
+
enableConceptExtraction: true,
|
|
40
|
+
confidenceThreshold: 0.6,
|
|
41
|
+
extractCodeBlocks: true,
|
|
42
|
+
minSectionLength: 50,
|
|
43
|
+
groupByHeading: true,
|
|
44
|
+
onProgress: () => { },
|
|
45
|
+
...options
|
|
46
|
+
};
|
|
47
|
+
// Parse markdown into sections
|
|
48
|
+
const parsedSections = this.parseMarkdown(markdown, opts);
|
|
49
|
+
// Process each section
|
|
50
|
+
const sections = [];
|
|
51
|
+
const entityMap = new Map();
|
|
52
|
+
const stats = {
|
|
53
|
+
byType: {},
|
|
54
|
+
byHeadingLevel: {},
|
|
55
|
+
byConfidence: { high: 0, medium: 0, low: 0 },
|
|
56
|
+
linksFound: 0,
|
|
57
|
+
codeBlocksFound: 0
|
|
58
|
+
};
|
|
59
|
+
for (let i = 0; i < parsedSections.length; i++) {
|
|
60
|
+
const parsed = parsedSections[i];
|
|
61
|
+
const section = await this.processSection(parsed, opts, stats, entityMap);
|
|
62
|
+
sections.push(section);
|
|
63
|
+
opts.onProgress({
|
|
64
|
+
processed: i + 1,
|
|
65
|
+
total: parsedSections.length,
|
|
66
|
+
entities: sections.reduce((sum, s) => sum + s.entities.length, 0),
|
|
67
|
+
relationships: sections.reduce((sum, s) => sum + s.relationships.length, 0)
|
|
68
|
+
});
|
|
69
|
+
}
|
|
70
|
+
return {
|
|
71
|
+
sectionsProcessed: sections.length,
|
|
72
|
+
entitiesExtracted: sections.reduce((sum, s) => sum + s.entities.length, 0),
|
|
73
|
+
relationshipsInferred: sections.reduce((sum, s) => sum + s.relationships.length, 0),
|
|
74
|
+
sections,
|
|
75
|
+
entityMap,
|
|
76
|
+
processingTime: Date.now() - startTime,
|
|
77
|
+
stats
|
|
78
|
+
};
|
|
79
|
+
}
|
|
80
|
+
/**
|
|
81
|
+
* Parse markdown into sections
|
|
82
|
+
*/
|
|
83
|
+
parseMarkdown(markdown, options) {
|
|
84
|
+
const lines = markdown.split('\n');
|
|
85
|
+
const sections = [];
|
|
86
|
+
let currentSection = {
|
|
87
|
+
heading: null,
|
|
88
|
+
level: 0,
|
|
89
|
+
lines: []
|
|
90
|
+
};
|
|
91
|
+
let sectionCounter = 0;
|
|
92
|
+
for (const line of lines) {
|
|
93
|
+
// Check for heading
|
|
94
|
+
const headingMatch = line.match(/^(#{1,6})\s+(.+)$/);
|
|
95
|
+
if (headingMatch) {
|
|
96
|
+
// Save current section if it has content
|
|
97
|
+
if (currentSection.lines.length > 0) {
|
|
98
|
+
const content = currentSection.lines.join('\n').trim();
|
|
99
|
+
if (content.length >= (options.minSectionLength || 50)) {
|
|
100
|
+
sections.push({
|
|
101
|
+
id: `section_${sectionCounter++}`,
|
|
102
|
+
heading: currentSection.heading,
|
|
103
|
+
level: currentSection.level,
|
|
104
|
+
content
|
|
105
|
+
});
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
// Start new section
|
|
109
|
+
const level = headingMatch[1].length;
|
|
110
|
+
const heading = headingMatch[2].trim();
|
|
111
|
+
currentSection = {
|
|
112
|
+
heading,
|
|
113
|
+
level,
|
|
114
|
+
lines: []
|
|
115
|
+
};
|
|
116
|
+
}
|
|
117
|
+
else {
|
|
118
|
+
currentSection.lines.push(line);
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
// Add last section
|
|
122
|
+
if (currentSection.lines.length > 0) {
|
|
123
|
+
const content = currentSection.lines.join('\n').trim();
|
|
124
|
+
if (content.length >= (options.minSectionLength || 50)) {
|
|
125
|
+
sections.push({
|
|
126
|
+
id: `section_${sectionCounter}`,
|
|
127
|
+
heading: currentSection.heading,
|
|
128
|
+
level: currentSection.level,
|
|
129
|
+
content
|
|
130
|
+
});
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
return sections;
|
|
134
|
+
}
|
|
135
|
+
/**
|
|
136
|
+
* Process a single section
|
|
137
|
+
*/
|
|
138
|
+
async processSection(parsed, options, stats, entityMap) {
|
|
139
|
+
// Track heading level
|
|
140
|
+
stats.byHeadingLevel[parsed.level] = (stats.byHeadingLevel[parsed.level] || 0) + 1;
|
|
141
|
+
// Extract links
|
|
142
|
+
const links = this.extractLinks(parsed.content);
|
|
143
|
+
stats.linksFound += links.length;
|
|
144
|
+
// Extract code blocks
|
|
145
|
+
const codeBlocks = options.extractCodeBlocks ? this.extractCodeBlocks(parsed.content) : [];
|
|
146
|
+
stats.codeBlocksFound += codeBlocks.length;
|
|
147
|
+
// Remove code blocks from content for entity extraction
|
|
148
|
+
const contentWithoutCode = this.removeCodeBlocks(parsed.content);
|
|
149
|
+
// Extract entities
|
|
150
|
+
let extractedEntities = [];
|
|
151
|
+
if (options.enableNeuralExtraction && contentWithoutCode.length > 0) {
|
|
152
|
+
extractedEntities = await this.extractor.extract(contentWithoutCode, {
|
|
153
|
+
confidence: options.confidenceThreshold || 0.6,
|
|
154
|
+
neuralMatching: true,
|
|
155
|
+
cache: { enabled: true }
|
|
156
|
+
});
|
|
157
|
+
}
|
|
158
|
+
// If section has a heading, treat it as an entity
|
|
159
|
+
if (parsed.heading) {
|
|
160
|
+
const headingEntity = {
|
|
161
|
+
text: parsed.heading,
|
|
162
|
+
type: this.inferTypeFromHeading(parsed.heading, parsed.level),
|
|
163
|
+
confidence: 0.9,
|
|
164
|
+
position: { start: 0, end: parsed.heading.length }
|
|
165
|
+
};
|
|
166
|
+
extractedEntities.unshift(headingEntity);
|
|
167
|
+
}
|
|
168
|
+
// Extract concepts
|
|
169
|
+
let concepts = [];
|
|
170
|
+
if (options.enableConceptExtraction && contentWithoutCode.length > 0) {
|
|
171
|
+
try {
|
|
172
|
+
concepts = await this.brain.extractConcepts(contentWithoutCode, { limit: 10 });
|
|
173
|
+
}
|
|
174
|
+
catch (error) {
|
|
175
|
+
concepts = [];
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
// Create entity objects
|
|
179
|
+
const entities = extractedEntities.map(e => {
|
|
180
|
+
const entityId = this.generateEntityId(e.text, parsed.id);
|
|
181
|
+
entityMap.set(e.text.toLowerCase(), entityId);
|
|
182
|
+
// Update statistics
|
|
183
|
+
this.updateStats(stats, e.type, e.confidence);
|
|
184
|
+
return {
|
|
185
|
+
id: entityId,
|
|
186
|
+
name: e.text,
|
|
187
|
+
type: e.type,
|
|
188
|
+
description: contentWithoutCode.substring(0, 200),
|
|
189
|
+
confidence: e.confidence,
|
|
190
|
+
metadata: {
|
|
191
|
+
source: 'markdown',
|
|
192
|
+
section: parsed.id,
|
|
193
|
+
heading: parsed.heading,
|
|
194
|
+
level: parsed.level,
|
|
195
|
+
extractedAt: Date.now()
|
|
196
|
+
}
|
|
197
|
+
};
|
|
198
|
+
});
|
|
199
|
+
// Infer relationships
|
|
200
|
+
const relationships = [];
|
|
201
|
+
// Link-based relationships
|
|
202
|
+
if (options.enableRelationshipInference) {
|
|
203
|
+
for (const link of links) {
|
|
204
|
+
// Find entity that might be the source
|
|
205
|
+
const sourceEntity = entities.find(e => contentWithoutCode.toLowerCase().includes(e.name.toLowerCase()));
|
|
206
|
+
if (sourceEntity) {
|
|
207
|
+
// Create relationship to linked entity
|
|
208
|
+
const targetId = this.generateEntityId(link.text, 'link');
|
|
209
|
+
relationships.push({
|
|
210
|
+
from: sourceEntity.id,
|
|
211
|
+
to: link.text,
|
|
212
|
+
type: VerbType.References,
|
|
213
|
+
confidence: 0.85,
|
|
214
|
+
evidence: `Markdown link: [${link.text}](${link.url})`
|
|
215
|
+
});
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
// Entity proximity-based relationships
|
|
219
|
+
for (let i = 0; i < entities.length; i++) {
|
|
220
|
+
for (let j = i + 1; j < entities.length; j++) {
|
|
221
|
+
const entity1 = entities[i];
|
|
222
|
+
const entity2 = entities[j];
|
|
223
|
+
if (this.entitiesAreRelated(contentWithoutCode, entity1.name, entity2.name)) {
|
|
224
|
+
const verbType = await this.inferRelationship(entity1.name, entity2.name, contentWithoutCode);
|
|
225
|
+
relationships.push({
|
|
226
|
+
from: entity1.id,
|
|
227
|
+
to: entity2.id,
|
|
228
|
+
type: verbType,
|
|
229
|
+
confidence: Math.min(entity1.confidence, entity2.confidence) * 0.8,
|
|
230
|
+
evidence: `Co-occurrence in section: ${parsed.heading || parsed.id}`
|
|
231
|
+
});
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
return {
|
|
237
|
+
id: parsed.id,
|
|
238
|
+
heading: parsed.heading,
|
|
239
|
+
level: parsed.level,
|
|
240
|
+
content: parsed.content,
|
|
241
|
+
entities,
|
|
242
|
+
links,
|
|
243
|
+
codeBlocks,
|
|
244
|
+
relationships,
|
|
245
|
+
concepts
|
|
246
|
+
};
|
|
247
|
+
}
|
|
248
|
+
/**
|
|
249
|
+
* Extract markdown links
|
|
250
|
+
*/
|
|
251
|
+
extractLinks(content) {
|
|
252
|
+
const links = [];
|
|
253
|
+
const linkRegex = /\[([^\]]+)\]\(([^)]+)\)/g;
|
|
254
|
+
let match;
|
|
255
|
+
while ((match = linkRegex.exec(content)) !== null) {
|
|
256
|
+
const text = match[1];
|
|
257
|
+
const url = match[2];
|
|
258
|
+
const type = url.startsWith('http') ? 'external' : 'internal';
|
|
259
|
+
links.push({ text, url, type });
|
|
260
|
+
}
|
|
261
|
+
return links;
|
|
262
|
+
}
|
|
263
|
+
/**
|
|
264
|
+
* Extract code blocks
|
|
265
|
+
*/
|
|
266
|
+
extractCodeBlocks(content) {
|
|
267
|
+
const codeBlocks = [];
|
|
268
|
+
const codeBlockRegex = /```(\w+)?\n([\s\S]*?)```/g;
|
|
269
|
+
let match;
|
|
270
|
+
while ((match = codeBlockRegex.exec(content)) !== null) {
|
|
271
|
+
const language = match[1] || 'text';
|
|
272
|
+
const code = match[2].trim();
|
|
273
|
+
codeBlocks.push({ language, code });
|
|
274
|
+
}
|
|
275
|
+
return codeBlocks;
|
|
276
|
+
}
|
|
277
|
+
/**
|
|
278
|
+
* Remove code blocks from content
|
|
279
|
+
*/
|
|
280
|
+
removeCodeBlocks(content) {
|
|
281
|
+
return content.replace(/```[\s\S]*?```/g, '');
|
|
282
|
+
}
|
|
283
|
+
/**
|
|
284
|
+
* Infer type from heading
|
|
285
|
+
*/
|
|
286
|
+
inferTypeFromHeading(heading, level) {
|
|
287
|
+
const lower = heading.toLowerCase();
|
|
288
|
+
if (lower.includes('person') || lower.includes('people') || lower.includes('author') || lower.includes('user')) {
|
|
289
|
+
return NounType.Person;
|
|
290
|
+
}
|
|
291
|
+
if (lower.includes('location') || lower.includes('place')) {
|
|
292
|
+
return NounType.Location;
|
|
293
|
+
}
|
|
294
|
+
if (lower.includes('organization') || lower.includes('company')) {
|
|
295
|
+
return NounType.Organization;
|
|
296
|
+
}
|
|
297
|
+
if (lower.includes('event')) {
|
|
298
|
+
return NounType.Event;
|
|
299
|
+
}
|
|
300
|
+
if (lower.includes('project')) {
|
|
301
|
+
return NounType.Project;
|
|
302
|
+
}
|
|
303
|
+
if (lower.includes('document') || lower.includes('file')) {
|
|
304
|
+
return NounType.Document;
|
|
305
|
+
}
|
|
306
|
+
// Top-level headings are often concepts/topics
|
|
307
|
+
if (level <= 2) {
|
|
308
|
+
return NounType.Concept;
|
|
309
|
+
}
|
|
310
|
+
return NounType.Thing;
|
|
311
|
+
}
|
|
312
|
+
/**
|
|
313
|
+
* Check if entities are related by proximity
|
|
314
|
+
*/
|
|
315
|
+
entitiesAreRelated(text, entity1, entity2) {
|
|
316
|
+
const lowerText = text.toLowerCase();
|
|
317
|
+
const index1 = lowerText.indexOf(entity1.toLowerCase());
|
|
318
|
+
const index2 = lowerText.indexOf(entity2.toLowerCase());
|
|
319
|
+
if (index1 === -1 || index2 === -1)
|
|
320
|
+
return false;
|
|
321
|
+
return Math.abs(index1 - index2) < 300;
|
|
322
|
+
}
|
|
323
|
+
/**
|
|
324
|
+
* Infer relationship type from context
|
|
325
|
+
*/
|
|
326
|
+
async inferRelationship(fromEntity, toEntity, context) {
|
|
327
|
+
const lowerContext = context.toLowerCase();
|
|
328
|
+
const patterns = [
|
|
329
|
+
[new RegExp(`${toEntity}.*of.*${fromEntity}`, 'i'), VerbType.PartOf],
|
|
330
|
+
[new RegExp(`${fromEntity}.*contains.*${toEntity}`, 'i'), VerbType.Contains],
|
|
331
|
+
[new RegExp(`${fromEntity}.*in.*${toEntity}`, 'i'), VerbType.LocatedAt],
|
|
332
|
+
[new RegExp(`${fromEntity}.*created.*${toEntity}`, 'i'), VerbType.Creates],
|
|
333
|
+
[new RegExp(`${fromEntity}.*and.*${toEntity}`, 'i'), VerbType.RelatedTo]
|
|
334
|
+
];
|
|
335
|
+
for (const [pattern, verbType] of patterns) {
|
|
336
|
+
if (pattern.test(lowerContext)) {
|
|
337
|
+
return verbType;
|
|
338
|
+
}
|
|
339
|
+
}
|
|
340
|
+
return VerbType.RelatedTo;
|
|
341
|
+
}
|
|
342
|
+
/**
|
|
343
|
+
* Generate consistent entity ID
|
|
344
|
+
*/
|
|
345
|
+
generateEntityId(name, section) {
|
|
346
|
+
const normalized = name.toLowerCase().trim().replace(/\s+/g, '_');
|
|
347
|
+
const sectionNorm = section.replace(/\s+/g, '_');
|
|
348
|
+
return `ent_${normalized}_${sectionNorm}_${Date.now()}`;
|
|
349
|
+
}
|
|
350
|
+
/**
|
|
351
|
+
* Update statistics
|
|
352
|
+
*/
|
|
353
|
+
updateStats(stats, type, confidence) {
|
|
354
|
+
// Track by type
|
|
355
|
+
const typeName = String(type);
|
|
356
|
+
stats.byType[typeName] = (stats.byType[typeName] || 0) + 1;
|
|
357
|
+
// Track by confidence
|
|
358
|
+
if (confidence > 0.8) {
|
|
359
|
+
stats.byConfidence.high++;
|
|
360
|
+
}
|
|
361
|
+
else if (confidence >= 0.6) {
|
|
362
|
+
stats.byConfidence.medium++;
|
|
363
|
+
}
|
|
364
|
+
else {
|
|
365
|
+
stats.byConfidence.low++;
|
|
366
|
+
}
|
|
367
|
+
}
|
|
368
|
+
}
|
|
369
|
+
//# sourceMappingURL=SmartMarkdownImporter.js.map
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Smart PDF Importer
|
|
3
|
+
*
|
|
4
|
+
* Extracts entities and relationships from PDF files using:
|
|
5
|
+
* - NeuralEntityExtractor for entity extraction
|
|
6
|
+
* - NaturalLanguageProcessor for relationship inference
|
|
7
|
+
* - brain.extractConcepts() for tagging
|
|
8
|
+
* - Section-based organization (by page or detected structure)
|
|
9
|
+
*
|
|
10
|
+
* NO MOCKS - Production-ready implementation
|
|
11
|
+
*/
|
|
12
|
+
import { Brainy } from '../brainy.js';
|
|
13
|
+
import { NounType, VerbType } from '../types/graphTypes.js';
|
|
14
|
+
import type { FormatHandlerOptions } from '../augmentations/intelligentImport/types.js';
|
|
15
|
+
export interface SmartPDFOptions extends FormatHandlerOptions {
    /** Turn neural entity extraction on or off. */
    enableNeuralExtraction?: boolean;
    /** Turn text-based relationship inference on or off. */
    enableRelationshipInference?: boolean;
    /** Turn concept extraction (used for tagging) on or off. */
    enableConceptExtraction?: boolean;
    /** Entity confidence threshold, in the range 0-1. */
    confidenceThreshold?: number;
    /** Paragraphs shorter than this many characters are not processed. */
    minParagraphLength?: number;
    /** Whether tables are also used as an extraction source. */
    extractFromTables?: boolean;
    /** Grouping strategy: per page, or the document as a whole. */
    groupBy?: 'page' | 'document';
    /** Callback that receives progress counters. */
    onProgress?: (stats: { processed: number; total: number; entities: number; relationships: number }) => void;
}
|
|
38
|
+
export interface ExtractedSection {
    /** Identifier of the section: a page number or a section name. */
    sectionId: string;
    /** Kind of section. */
    sectionType: 'page' | 'paragraph' | 'table';
    /** Entities found in this section. */
    entities: {
        id: string;
        name: string;
        type: NounType;
        description: string;
        confidence: number;
        metadata: Record<string, any>;
    }[];
    /** Relationships inferred within this section. */
    relationships: {
        from: string;
        to: string;
        type: VerbType;
        confidence: number;
        evidence: string;
    }[];
    /** Extracted concepts, when available. */
    concepts?: string[];
    /** The section's original text. */
    text: string;
}
|
|
65
|
+
export interface SmartPDFResult {
    /** Number of sections processed. */
    sectionsProcessed: number;
    /** Number of pages processed. */
    pagesProcessed: number;
    /** Number of entities extracted. */
    entitiesExtracted: number;
    /** Number of relationships inferred. */
    relationshipsInferred: number;
    /** Every extracted section. */
    sections: ExtractedSection[];
    /** Mapping from entity name to entity ID. */
    entityMap: Map<string, string>;
    /** Processing time, in milliseconds. */
    processingTime: number;
    /** Statistics about the extraction. */
    stats: {
        byType: Record<string, number>;
        byConfidence: { high: number; medium: number; low: number };
        bySource: { paragraphs: number; tables: number };
    };
    /** Metadata of the PDF. */
    pdfMetadata: {
        pageCount: number;
        title?: string;
        author?: string;
        subject?: string;
    };
}
|
|
101
|
+
/**
 * SmartPDFImporter - turns PDF files into structured knowledge
 * (entities and relationships).
 */
export declare class SmartPDFImporter {
    private brain;
    private extractor;
    private nlp;
    private pdfHandler;
    constructor(brain: Brainy);
    /** Initializes the importer. */
    init(): Promise<void>;
    /** Extracts entities and relationships from a PDF file buffer. */
    extract(buffer: Buffer, options?: SmartPDFOptions): Promise<SmartPDFResult>;
    /** Groups data according to the grouping strategy. */
    private groupData;
    /** Processes one section. */
    private processSection;
    /** Extracts the context surrounding an entity mention. */
    private extractContextAroundEntity;
    /** Checks whether two entities are related, based on proximity in text. */
    private entitiesAreRelated;
    /** Extracts the context that shows a relationship between entities. */
    private extractRelationshipContext;
    /** Infers the relationship type from context. */
    private inferRelationship;
    /** Generates a consistent entity ID. */
    private generateEntityId;
    /** Updates the statistics. */
    private updateStats;
    /** Creates an empty result. */
    private emptyResult;
}
|