@soulcraft/brainy 3.27.0 → 3.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +10 -0
- package/dist/brainy.d.ts +50 -0
- package/dist/brainy.js +36 -0
- package/dist/import/EntityDeduplicator.d.ts +84 -0
- package/dist/import/EntityDeduplicator.js +255 -0
- package/dist/import/FormatDetector.d.ts +65 -0
- package/dist/import/FormatDetector.js +263 -0
- package/dist/import/ImportCoordinator.d.ts +160 -0
- package/dist/import/ImportCoordinator.js +498 -0
- package/dist/import/ImportHistory.d.ts +92 -0
- package/dist/import/ImportHistory.js +183 -0
- package/dist/import/index.d.ts +16 -0
- package/dist/import/index.js +14 -0
- package/dist/importers/SmartCSVImporter.d.ts +136 -0
- package/dist/importers/SmartCSVImporter.js +308 -0
- package/dist/importers/SmartExcelImporter.d.ts +131 -0
- package/dist/importers/SmartExcelImporter.js +302 -0
- package/dist/importers/SmartImportOrchestrator.d.ts +125 -0
- package/dist/importers/SmartImportOrchestrator.js +531 -0
- package/dist/importers/SmartJSONImporter.d.ts +135 -0
- package/dist/importers/SmartJSONImporter.js +325 -0
- package/dist/importers/SmartMarkdownImporter.d.ts +159 -0
- package/dist/importers/SmartMarkdownImporter.js +369 -0
- package/dist/importers/SmartPDFImporter.d.ts +154 -0
- package/dist/importers/SmartPDFImporter.js +337 -0
- package/dist/importers/VFSStructureGenerator.d.ts +82 -0
- package/dist/importers/VFSStructureGenerator.js +260 -0
- package/dist/importers/index.d.ts +28 -0
- package/dist/importers/index.js +29 -0
- package/package.json +1 -1
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Format Detector
|
|
3
|
+
*
|
|
4
|
+
* Unified format detection for all import types using:
|
|
5
|
+
* - Magic byte signatures (PDF, Excel, images)
|
|
6
|
+
* - File extensions
|
|
7
|
+
* - Content analysis (JSON, Markdown, CSV)
|
|
8
|
+
*
|
|
9
|
+
* NO MOCKS - Production-ready implementation
|
|
10
|
+
*/
|
|
11
|
+
/**
|
|
12
|
+
* FormatDetector - Detect file format from various inputs
|
|
13
|
+
*/
|
|
14
|
+
export class FormatDetector {
|
|
15
|
+
/**
|
|
16
|
+
* Detect format from buffer
|
|
17
|
+
*/
|
|
18
|
+
detectFromBuffer(buffer) {
|
|
19
|
+
// Check magic bytes first (most reliable)
|
|
20
|
+
const magicResult = this.detectByMagicBytes(buffer);
|
|
21
|
+
if (magicResult)
|
|
22
|
+
return magicResult;
|
|
23
|
+
// Try content analysis
|
|
24
|
+
const contentResult = this.detectByContent(buffer);
|
|
25
|
+
if (contentResult)
|
|
26
|
+
return contentResult;
|
|
27
|
+
return null;
|
|
28
|
+
}
|
|
29
|
+
/**
|
|
30
|
+
* Detect format from file path
|
|
31
|
+
*/
|
|
32
|
+
detectFromPath(path) {
|
|
33
|
+
const ext = this.getExtension(path).toLowerCase();
|
|
34
|
+
const extensionMap = {
|
|
35
|
+
'.xlsx': 'excel',
|
|
36
|
+
'.xls': 'excel',
|
|
37
|
+
'.pdf': 'pdf',
|
|
38
|
+
'.csv': 'csv',
|
|
39
|
+
'.json': 'json',
|
|
40
|
+
'.md': 'markdown',
|
|
41
|
+
'.markdown': 'markdown'
|
|
42
|
+
};
|
|
43
|
+
const format = extensionMap[ext];
|
|
44
|
+
if (format) {
|
|
45
|
+
return {
|
|
46
|
+
format,
|
|
47
|
+
confidence: 0.9,
|
|
48
|
+
evidence: [`File extension: ${ext}`]
|
|
49
|
+
};
|
|
50
|
+
}
|
|
51
|
+
return null;
|
|
52
|
+
}
|
|
53
|
+
/**
|
|
54
|
+
* Detect format from string content
|
|
55
|
+
*/
|
|
56
|
+
detectFromString(content) {
|
|
57
|
+
const trimmed = content.trim();
|
|
58
|
+
// JSON detection
|
|
59
|
+
if (this.looksLikeJSON(trimmed)) {
|
|
60
|
+
return {
|
|
61
|
+
format: 'json',
|
|
62
|
+
confidence: 0.95,
|
|
63
|
+
evidence: ['Content starts with { or [', 'Valid JSON structure']
|
|
64
|
+
};
|
|
65
|
+
}
|
|
66
|
+
// Markdown detection
|
|
67
|
+
if (this.looksLikeMarkdown(trimmed)) {
|
|
68
|
+
return {
|
|
69
|
+
format: 'markdown',
|
|
70
|
+
confidence: 0.85,
|
|
71
|
+
evidence: ['Contains markdown heading markers (#)', 'Text-based content']
|
|
72
|
+
};
|
|
73
|
+
}
|
|
74
|
+
// CSV detection
|
|
75
|
+
if (this.looksLikeCSV(trimmed)) {
|
|
76
|
+
return {
|
|
77
|
+
format: 'csv',
|
|
78
|
+
confidence: 0.8,
|
|
79
|
+
evidence: ['Contains delimiter-separated values', 'Consistent column structure']
|
|
80
|
+
};
|
|
81
|
+
}
|
|
82
|
+
return null;
|
|
83
|
+
}
|
|
84
|
+
/**
|
|
85
|
+
* Detect format from object
|
|
86
|
+
*/
|
|
87
|
+
detectFromObject(obj) {
|
|
88
|
+
if (typeof obj === 'object' && obj !== null) {
|
|
89
|
+
return {
|
|
90
|
+
format: 'json',
|
|
91
|
+
confidence: 1.0,
|
|
92
|
+
evidence: ['JavaScript object']
|
|
93
|
+
};
|
|
94
|
+
}
|
|
95
|
+
return null;
|
|
96
|
+
}
|
|
97
|
+
/**
|
|
98
|
+
* Detect by magic bytes
|
|
99
|
+
*/
|
|
100
|
+
detectByMagicBytes(buffer) {
|
|
101
|
+
if (buffer.length < 4)
|
|
102
|
+
return null;
|
|
103
|
+
// PDF: %PDF (25 50 44 46)
|
|
104
|
+
if (buffer[0] === 0x25 && buffer[1] === 0x50 && buffer[2] === 0x44 && buffer[3] === 0x46) {
|
|
105
|
+
return {
|
|
106
|
+
format: 'pdf',
|
|
107
|
+
confidence: 1.0,
|
|
108
|
+
evidence: ['PDF magic bytes: %PDF']
|
|
109
|
+
};
|
|
110
|
+
}
|
|
111
|
+
// Excel (ZIP-based): PK (50 4B)
|
|
112
|
+
if (buffer[0] === 0x50 && buffer[1] === 0x4B) {
|
|
113
|
+
// Check for [Content_Types].xml which is specific to Office Open XML
|
|
114
|
+
const content = buffer.toString('utf8', 0, Math.min(1000, buffer.length));
|
|
115
|
+
if (content.includes('[Content_Types].xml') || content.includes('xl/')) {
|
|
116
|
+
return {
|
|
117
|
+
format: 'excel',
|
|
118
|
+
confidence: 1.0,
|
|
119
|
+
evidence: ['ZIP magic bytes: PK', 'Contains Office Open XML structure']
|
|
120
|
+
};
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
return null;
|
|
124
|
+
}
|
|
125
|
+
/**
|
|
126
|
+
* Detect by content analysis
|
|
127
|
+
*/
|
|
128
|
+
detectByContent(buffer) {
|
|
129
|
+
// Try to decode as UTF-8
|
|
130
|
+
let content;
|
|
131
|
+
try {
|
|
132
|
+
content = buffer.toString('utf8').trim();
|
|
133
|
+
}
|
|
134
|
+
catch {
|
|
135
|
+
return null;
|
|
136
|
+
}
|
|
137
|
+
// Check if it's text-based content
|
|
138
|
+
if (!this.isTextContent(content)) {
|
|
139
|
+
return null;
|
|
140
|
+
}
|
|
141
|
+
// JSON detection
|
|
142
|
+
if (this.looksLikeJSON(content)) {
|
|
143
|
+
return {
|
|
144
|
+
format: 'json',
|
|
145
|
+
confidence: 0.95,
|
|
146
|
+
evidence: ['Content starts with { or [', 'Valid JSON structure']
|
|
147
|
+
};
|
|
148
|
+
}
|
|
149
|
+
// Markdown detection
|
|
150
|
+
if (this.looksLikeMarkdown(content)) {
|
|
151
|
+
return {
|
|
152
|
+
format: 'markdown',
|
|
153
|
+
confidence: 0.85,
|
|
154
|
+
evidence: ['Contains markdown heading markers (#)', 'Text-based content']
|
|
155
|
+
};
|
|
156
|
+
}
|
|
157
|
+
// CSV detection
|
|
158
|
+
if (this.looksLikeCSV(content)) {
|
|
159
|
+
return {
|
|
160
|
+
format: 'csv',
|
|
161
|
+
confidence: 0.8,
|
|
162
|
+
evidence: ['Contains delimiter-separated values', 'Consistent column structure']
|
|
163
|
+
};
|
|
164
|
+
}
|
|
165
|
+
return null;
|
|
166
|
+
}
|
|
167
|
+
/**
|
|
168
|
+
* Check if content looks like JSON
|
|
169
|
+
*/
|
|
170
|
+
looksLikeJSON(content) {
|
|
171
|
+
const trimmed = content.trim();
|
|
172
|
+
if (!trimmed.startsWith('{') && !trimmed.startsWith('[')) {
|
|
173
|
+
return false;
|
|
174
|
+
}
|
|
175
|
+
try {
|
|
176
|
+
JSON.parse(trimmed);
|
|
177
|
+
return true;
|
|
178
|
+
}
|
|
179
|
+
catch {
|
|
180
|
+
return false;
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
/**
|
|
184
|
+
* Check if content looks like Markdown
|
|
185
|
+
*/
|
|
186
|
+
looksLikeMarkdown(content) {
|
|
187
|
+
const lines = content.split('\n').slice(0, 50); // Check first 50 lines
|
|
188
|
+
// Count markdown indicators
|
|
189
|
+
let indicators = 0;
|
|
190
|
+
for (const line of lines) {
|
|
191
|
+
// Headings
|
|
192
|
+
if (/^#{1,6}\s+.+/.test(line))
|
|
193
|
+
indicators += 2;
|
|
194
|
+
// Lists
|
|
195
|
+
if (/^[\*\-\+]\s+.+/.test(line))
|
|
196
|
+
indicators++;
|
|
197
|
+
if (/^\d+\.\s+.+/.test(line))
|
|
198
|
+
indicators++;
|
|
199
|
+
// Links
|
|
200
|
+
if (/\[.+\]\(.+\)/.test(line))
|
|
201
|
+
indicators++;
|
|
202
|
+
// Code blocks
|
|
203
|
+
if (/^```/.test(line))
|
|
204
|
+
indicators += 2;
|
|
205
|
+
// Bold/Italic
|
|
206
|
+
if (/\*\*.+\*\*/.test(line) || /\*.+\*/.test(line))
|
|
207
|
+
indicators++;
|
|
208
|
+
}
|
|
209
|
+
// If we have at least 3 markdown indicators, it's likely markdown
|
|
210
|
+
return indicators >= 3;
|
|
211
|
+
}
|
|
212
|
+
/**
|
|
213
|
+
* Check if content looks like CSV
|
|
214
|
+
*/
|
|
215
|
+
looksLikeCSV(content) {
|
|
216
|
+
const lines = content.split('\n').filter(l => l.trim()).slice(0, 20);
|
|
217
|
+
if (lines.length < 2)
|
|
218
|
+
return false;
|
|
219
|
+
// Try common delimiters
|
|
220
|
+
const delimiters = [',', ';', '\t', '|'];
|
|
221
|
+
for (const delimiter of delimiters) {
|
|
222
|
+
const columnCounts = lines.map(line => {
|
|
223
|
+
// Simple split (doesn't handle quoted delimiters, but good enough for detection)
|
|
224
|
+
return line.split(delimiter).length;
|
|
225
|
+
});
|
|
226
|
+
// Check if all rows have the same number of columns (within 1)
|
|
227
|
+
const firstCount = columnCounts[0];
|
|
228
|
+
const consistent = columnCounts.filter(c => Math.abs(c - firstCount) <= 1).length;
|
|
229
|
+
// If >80% of rows have consistent column counts, it's likely CSV
|
|
230
|
+
if (consistent / columnCounts.length > 0.8 && firstCount > 1) {
|
|
231
|
+
return true;
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
return false;
|
|
235
|
+
}
|
|
236
|
+
/**
|
|
237
|
+
* Check if content is text-based (not binary)
|
|
238
|
+
*/
|
|
239
|
+
isTextContent(content) {
|
|
240
|
+
// Check for null bytes (common in binary files)
|
|
241
|
+
if (content.includes('\0'))
|
|
242
|
+
return false;
|
|
243
|
+
// Check if mostly printable characters
|
|
244
|
+
const printable = content.split('').filter(c => {
|
|
245
|
+
const code = c.charCodeAt(0);
|
|
246
|
+
return (code >= 32 && code <= 126) || code === 9 || code === 10 || code === 13;
|
|
247
|
+
}).length;
|
|
248
|
+
const ratio = printable / content.length;
|
|
249
|
+
return ratio > 0.9;
|
|
250
|
+
}
|
|
251
|
+
/**
|
|
252
|
+
* Get file extension from path
|
|
253
|
+
*/
|
|
254
|
+
getExtension(path) {
|
|
255
|
+
const lastDot = path.lastIndexOf('.');
|
|
256
|
+
const lastSlash = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
|
|
257
|
+
if (lastDot > lastSlash && lastDot !== -1) {
|
|
258
|
+
return path.substring(lastDot);
|
|
259
|
+
}
|
|
260
|
+
return '';
|
|
261
|
+
}
|
|
262
|
+
}
|
|
263
|
+
//# sourceMappingURL=FormatDetector.js.map
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Import Coordinator
|
|
3
|
+
*
|
|
4
|
+
* Unified import orchestrator that:
|
|
5
|
+
* - Auto-detects file formats
|
|
6
|
+
* - Routes to appropriate handlers
|
|
7
|
+
* - Coordinates dual storage (VFS + Graph)
|
|
8
|
+
* - Provides simple, unified API
|
|
9
|
+
*
|
|
10
|
+
* NO MOCKS - Production-ready implementation
|
|
11
|
+
*/
|
|
12
|
+
import { Brainy } from '../brainy.js';
|
|
13
|
+
import { SupportedFormat } from './FormatDetector.js';
|
|
14
|
+
import { ImportHistory } from './ImportHistory.js';
|
|
15
|
+
import { NounType, VerbType } from '../types/graphTypes.js';
|
|
16
|
+
/**
 * A single input to the import pipeline, tagged with how its
 * `data` payload should be interpreted.
 */
export interface ImportSource {
    /** Source type: how `data` is interpreted (raw buffer, file path, raw string, or in-memory object). */
    type: 'buffer' | 'path' | 'string' | 'object';
    /** Source data */
    data: Buffer | string | object;
    /** Optional filename hint — presumably used to aid format detection; confirm against ImportCoordinator. */
    filename?: string;
}
|
|
24
|
+
/**
 * Options controlling a single import run. All fields are optional.
 */
export interface ImportOptions {
    /** Force specific format (skip auto-detection) */
    format?: SupportedFormat;
    /** VFS root path for imported files */
    vfsPath?: string;
    /** Grouping strategy for VFS */
    groupBy?: 'type' | 'sheet' | 'flat' | 'custom';
    /** Custom grouping function — presumably consulted when groupBy is 'custom'; confirm in ImportCoordinator. */
    customGrouping?: (entity: any) => string;
    /** Create entities in knowledge graph */
    createEntities?: boolean;
    /** Create relationships in knowledge graph */
    createRelationships?: boolean;
    /** Preserve source file in VFS */
    preserveSource?: boolean;
    /** Enable neural entity extraction */
    enableNeuralExtraction?: boolean;
    /** Enable relationship inference */
    enableRelationshipInference?: boolean;
    /** Enable concept extraction */
    enableConceptExtraction?: boolean;
    /** Confidence threshold for entities */
    confidenceThreshold?: number;
    /** Enable entity deduplication across imports */
    enableDeduplication?: boolean;
    /** Similarity threshold for deduplication (0-1) */
    deduplicationThreshold?: number;
    /** Enable import history tracking */
    enableHistory?: boolean;
    /** Chunk size for streaming large imports (0 = no streaming) */
    chunkSize?: number;
    /** Progress callback, invoked as the import advances through its stages. */
    onProgress?: (progress: ImportProgress) => void;
}
|
|
58
|
+
/**
 * Progress snapshot passed to ImportOptions.onProgress.
 */
export interface ImportProgress {
    /** Current pipeline stage. */
    stage: 'detecting' | 'extracting' | 'storing-vfs' | 'storing-graph' | 'complete';
    /** Human-readable status message. */
    message: string;
    /** Items processed so far, when known. */
    processed?: number;
    /** Total items to process, when known. */
    total?: number;
    /** Entity count reported so far, when provided. */
    entities?: number;
    /** Relationship count reported so far, when provided. */
    relationships?: number;
}
|
|
66
|
+
/**
 * The complete outcome of a single import run: detection info,
 * VFS artifacts, graph entities/relationships, and statistics.
 */
export interface ImportResult {
    /** Import ID for history tracking */
    importId: string;
    /** Detected format */
    format: SupportedFormat;
    /** Format detection confidence */
    formatConfidence: number;
    /** VFS paths created */
    vfs: {
        rootPath: string;
        directories: string[];
        files: Array<{
            path: string;
            entityId?: string;
            type: 'entity' | 'metadata' | 'source' | 'relationships';
        }>;
    };
    /** Knowledge graph entities created */
    entities: Array<{
        id: string;
        name: string;
        type: NounType;
        vfsPath?: string;
    }>;
    /** Knowledge graph relationships created */
    relationships: Array<{
        id: string;
        from: string;
        to: string;
        type: VerbType;
    }>;
    /** Import statistics */
    stats: {
        entitiesExtracted: number;
        relationshipsInferred: number;
        vfsFilesCreated: number;
        graphNodesCreated: number;
        graphEdgesCreated: number;
        entitiesMerged: number;
        entitiesNew: number;
        // NOTE(review): units not shown in this declaration — presumably milliseconds; confirm in ImportCoordinator.js
        processingTime: number;
    };
}
|
|
109
|
+
/**
 * ImportCoordinator - Main entry point for all imports.
 *
 * Wraps format detection, format-specific extraction, deduplication,
 * history tracking, and storage behind a single import() call.
 */
export declare class ImportCoordinator {
    /** Brainy instance supplied to the constructor; used for storage. */
    private brain;
    /** Format auto-detection (FormatDetector). */
    private detector;
    /** Cross-import entity deduplication (EntityDeduplicator). */
    private deduplicator;
    /** Import history tracking (ImportHistory). */
    private history;
    // Format-specific importers, one per supported format:
    private excelImporter;
    private pdfImporter;
    private csvImporter;
    private jsonImporter;
    private markdownImporter;
    /** VFS structure generation for imported entities. */
    private vfsGenerator;
    constructor(brain: Brainy);
    /**
     * Initialize all importers
     */
    init(): Promise<void>;
    /**
     * Get import history
     */
    getHistory(): ImportHistory;
    /**
     * Import from any source with auto-detection.
     *
     * @param source Buffer, file path or raw string, or plain object.
     * @param options Optional import configuration; see ImportOptions.
     * @returns The full ImportResult (VFS paths, graph entities and
     *          relationships, and statistics).
     */
    import(source: Buffer | string | object, options?: ImportOptions): Promise<ImportResult>;
    /**
     * Normalize source to ImportSource
     */
    private normalizeSource;
    /**
     * Check if string is a file path
     */
    private isFilePath;
    /**
     * Detect format from source
     */
    private detectFormat;
    /**
     * Extract entities using format-specific importer
     */
    private extract;
    /**
     * Create entities and relationships in knowledge graph
     */
    private createGraphEntities;
    /**
     * Normalize extraction result to unified format (Excel-like structure)
     */
    private normalizeExtractionResult;
}
|