@soulcraft/brainy 3.27.0 → 3.28.0

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
@@ -0,0 +1,263 @@
+ /**
+  * Format Detector
+  *
+  * Unified format detection for all import types using:
+  * - Magic byte signatures (PDF, Excel, images)
+  * - File extensions
+  * - Content analysis (JSON, Markdown, CSV)
+  *
+  * NO MOCKS - Production-ready implementation
+  */
+ /**
+  * FormatDetector - Detect file format from various inputs
+  */
+ export class FormatDetector {
+     /**
+      * Detect format from buffer
+      */
+     detectFromBuffer(buffer) {
+         // Check magic bytes first (most reliable)
+         const magicResult = this.detectByMagicBytes(buffer);
+         if (magicResult)
+             return magicResult;
+         // Try content analysis
+         const contentResult = this.detectByContent(buffer);
+         if (contentResult)
+             return contentResult;
+         return null;
+     }
+     /**
+      * Detect format from file path
+      */
+     detectFromPath(path) {
+         const ext = this.getExtension(path).toLowerCase();
+         const extensionMap = {
+             '.xlsx': 'excel',
+             '.xls': 'excel',
+             '.pdf': 'pdf',
+             '.csv': 'csv',
+             '.json': 'json',
+             '.md': 'markdown',
+             '.markdown': 'markdown'
+         };
+         const format = extensionMap[ext];
+         if (format) {
+             return {
+                 format,
+                 confidence: 0.9,
+                 evidence: [`File extension: ${ext}`]
+             };
+         }
+         return null;
+     }
+     /**
+      * Detect format from string content
+      */
+     detectFromString(content) {
+         const trimmed = content.trim();
+         // JSON detection
+         if (this.looksLikeJSON(trimmed)) {
+             return {
+                 format: 'json',
+                 confidence: 0.95,
+                 evidence: ['Content starts with { or [', 'Valid JSON structure']
+             };
+         }
+         // Markdown detection
+         if (this.looksLikeMarkdown(trimmed)) {
+             return {
+                 format: 'markdown',
+                 confidence: 0.85,
+                 evidence: ['Contains markdown heading markers (#)', 'Text-based content']
+             };
+         }
+         // CSV detection
+         if (this.looksLikeCSV(trimmed)) {
+             return {
+                 format: 'csv',
+                 confidence: 0.8,
+                 evidence: ['Contains delimiter-separated values', 'Consistent column structure']
+             };
+         }
+         return null;
+     }
+     /**
+      * Detect format from object
+      */
+     detectFromObject(obj) {
+         if (typeof obj === 'object' && obj !== null) {
+             return {
+                 format: 'json',
+                 confidence: 1.0,
+                 evidence: ['JavaScript object']
+             };
+         }
+         return null;
+     }
+     /**
+      * Detect by magic bytes
+      */
+     detectByMagicBytes(buffer) {
+         if (buffer.length < 4)
+             return null;
+         // PDF: %PDF (25 50 44 46)
+         if (buffer[0] === 0x25 && buffer[1] === 0x50 && buffer[2] === 0x44 && buffer[3] === 0x46) {
+             return {
+                 format: 'pdf',
+                 confidence: 1.0,
+                 evidence: ['PDF magic bytes: %PDF']
+             };
+         }
+         // Excel (ZIP-based): PK (50 4B)
+         if (buffer[0] === 0x50 && buffer[1] === 0x4B) {
+             // Check for [Content_Types].xml which is specific to Office Open XML
+             const content = buffer.toString('utf8', 0, Math.min(1000, buffer.length));
+             if (content.includes('[Content_Types].xml') || content.includes('xl/')) {
+                 return {
+                     format: 'excel',
+                     confidence: 1.0,
+                     evidence: ['ZIP magic bytes: PK', 'Contains Office Open XML structure']
+                 };
+             }
+         }
+         return null;
+     }
+     /**
+      * Detect by content analysis
+      */
+     detectByContent(buffer) {
+         // Try to decode as UTF-8
+         let content;
+         try {
+             content = buffer.toString('utf8').trim();
+         }
+         catch {
+             return null;
+         }
+         // Check if it's text-based content
+         if (!this.isTextContent(content)) {
+             return null;
+         }
+         // JSON detection
+         if (this.looksLikeJSON(content)) {
+             return {
+                 format: 'json',
+                 confidence: 0.95,
+                 evidence: ['Content starts with { or [', 'Valid JSON structure']
+             };
+         }
+         // Markdown detection
+         if (this.looksLikeMarkdown(content)) {
+             return {
+                 format: 'markdown',
+                 confidence: 0.85,
+                 evidence: ['Contains markdown heading markers (#)', 'Text-based content']
+             };
+         }
+         // CSV detection
+         if (this.looksLikeCSV(content)) {
+             return {
+                 format: 'csv',
+                 confidence: 0.8,
+                 evidence: ['Contains delimiter-separated values', 'Consistent column structure']
+             };
+         }
+         return null;
+     }
+     /**
+      * Check if content looks like JSON
+      */
+     looksLikeJSON(content) {
+         const trimmed = content.trim();
+         if (!trimmed.startsWith('{') && !trimmed.startsWith('[')) {
+             return false;
+         }
+         try {
+             JSON.parse(trimmed);
+             return true;
+         }
+         catch {
+             return false;
+         }
+     }
+     /**
+      * Check if content looks like Markdown
+      */
+     looksLikeMarkdown(content) {
+         const lines = content.split('\n').slice(0, 50); // Check first 50 lines
+         // Count markdown indicators
+         let indicators = 0;
+         for (const line of lines) {
+             // Headings
+             if (/^#{1,6}\s+.+/.test(line))
+                 indicators += 2;
+             // Lists
+             if (/^[\*\-\+]\s+.+/.test(line))
+                 indicators++;
+             if (/^\d+\.\s+.+/.test(line))
+                 indicators++;
+             // Links
+             if (/\[.+\]\(.+\)/.test(line))
+                 indicators++;
+             // Code blocks
+             if (/^```/.test(line))
+                 indicators += 2;
+             // Bold/Italic
+             if (/\*\*.+\*\*/.test(line) || /\*.+\*/.test(line))
+                 indicators++;
+         }
+         // If we have at least 3 markdown indicators, it's likely markdown
+         return indicators >= 3;
+     }
+     /**
+      * Check if content looks like CSV
+      */
+     looksLikeCSV(content) {
+         const lines = content.split('\n').filter(l => l.trim()).slice(0, 20);
+         if (lines.length < 2)
+             return false;
+         // Try common delimiters
+         const delimiters = [',', ';', '\t', '|'];
+         for (const delimiter of delimiters) {
+             const columnCounts = lines.map(line => {
+                 // Simple split (doesn't handle quoted delimiters, but good enough for detection)
+                 return line.split(delimiter).length;
+             });
+             // Check if all rows have the same number of columns (within 1)
+             const firstCount = columnCounts[0];
+             const consistent = columnCounts.filter(c => Math.abs(c - firstCount) <= 1).length;
+             // If >80% of rows have consistent column counts, it's likely CSV
+             if (consistent / columnCounts.length > 0.8 && firstCount > 1) {
+                 return true;
+             }
+         }
+         return false;
+     }
+     /**
+      * Check if content is text-based (not binary)
+      */
+     isTextContent(content) {
+         // Check for null bytes (common in binary files)
+         if (content.includes('\0'))
+             return false;
+         // Check if mostly printable characters
+         const printable = content.split('').filter(c => {
+             const code = c.charCodeAt(0);
+             return (code >= 32 && code <= 126) || code === 9 || code === 10 || code === 13;
+         }).length;
+         const ratio = printable / content.length;
+         return ratio > 0.9;
+     }
+     /**
+      * Get file extension from path
+      */
+     getExtension(path) {
+         const lastDot = path.lastIndexOf('.');
+         const lastSlash = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
+         if (lastDot > lastSlash && lastDot !== -1) {
+             return path.substring(lastDot);
+         }
+         return '';
+     }
+ }
+ //# sourceMappingURL=FormatDetector.js.map
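
For orientation, a minimal sketch of how the FormatDetector added above might be exercised. The module path and the sample inputs are illustrative, not taken from the package; only the method names and return shapes shown in the diff are assumed.

    // Sketch only: the exact module path inside the package may differ.
    import { FormatDetector } from './FormatDetector.js';

    const detector = new FormatDetector();

    // Magic-byte detection on a raw buffer (hypothetical PDF header).
    detector.detectFromBuffer(Buffer.from('%PDF-1.7\n...'));
    // -> { format: 'pdf', confidence: 1, evidence: ['PDF magic bytes: %PDF'] }

    // Extension-based detection from a hypothetical path.
    detector.detectFromPath('/data/report.xlsx');
    // -> { format: 'excel', confidence: 0.9, evidence: ['File extension: .xlsx'] }

    // Content analysis on a raw string.
    detector.detectFromString('{"name": "Ada"}');
    // -> { format: 'json', confidence: 0.95, evidence: ['Content starts with { or [', ...] }
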
@@ -0,0 +1,160 @@
+ /**
+  * Import Coordinator
+  *
+  * Unified import orchestrator that:
+  * - Auto-detects file formats
+  * - Routes to appropriate handlers
+  * - Coordinates dual storage (VFS + Graph)
+  * - Provides simple, unified API
+  *
+  * NO MOCKS - Production-ready implementation
+  */
+ import { Brainy } from '../brainy.js';
+ import { SupportedFormat } from './FormatDetector.js';
+ import { ImportHistory } from './ImportHistory.js';
+ import { NounType, VerbType } from '../types/graphTypes.js';
+ export interface ImportSource {
+     /** Source type */
+     type: 'buffer' | 'path' | 'string' | 'object';
+     /** Source data */
+     data: Buffer | string | object;
+     /** Optional filename hint */
+     filename?: string;
+ }
+ export interface ImportOptions {
+     /** Force specific format (skip auto-detection) */
+     format?: SupportedFormat;
+     /** VFS root path for imported files */
+     vfsPath?: string;
+     /** Grouping strategy for VFS */
+     groupBy?: 'type' | 'sheet' | 'flat' | 'custom';
+     /** Custom grouping function */
+     customGrouping?: (entity: any) => string;
+     /** Create entities in knowledge graph */
+     createEntities?: boolean;
+     /** Create relationships in knowledge graph */
+     createRelationships?: boolean;
+     /** Preserve source file in VFS */
+     preserveSource?: boolean;
+     /** Enable neural entity extraction */
+     enableNeuralExtraction?: boolean;
+     /** Enable relationship inference */
+     enableRelationshipInference?: boolean;
+     /** Enable concept extraction */
+     enableConceptExtraction?: boolean;
+     /** Confidence threshold for entities */
+     confidenceThreshold?: number;
+     /** Enable entity deduplication across imports */
+     enableDeduplication?: boolean;
+     /** Similarity threshold for deduplication (0-1) */
+     deduplicationThreshold?: number;
+     /** Enable import history tracking */
+     enableHistory?: boolean;
+     /** Chunk size for streaming large imports (0 = no streaming) */
+     chunkSize?: number;
+     /** Progress callback */
+     onProgress?: (progress: ImportProgress) => void;
+ }
+ export interface ImportProgress {
+     stage: 'detecting' | 'extracting' | 'storing-vfs' | 'storing-graph' | 'complete';
+     message: string;
+     processed?: number;
+     total?: number;
+     entities?: number;
+     relationships?: number;
+ }
+ export interface ImportResult {
+     /** Import ID for history tracking */
+     importId: string;
+     /** Detected format */
+     format: SupportedFormat;
+     /** Format detection confidence */
+     formatConfidence: number;
+     /** VFS paths created */
+     vfs: {
+         rootPath: string;
+         directories: string[];
+         files: Array<{
+             path: string;
+             entityId?: string;
+             type: 'entity' | 'metadata' | 'source' | 'relationships';
+         }>;
+     };
+     /** Knowledge graph entities created */
+     entities: Array<{
+         id: string;
+         name: string;
+         type: NounType;
+         vfsPath?: string;
+     }>;
+     /** Knowledge graph relationships created */
+     relationships: Array<{
+         id: string;
+         from: string;
+         to: string;
+         type: VerbType;
+     }>;
+     /** Import statistics */
+     stats: {
+         entitiesExtracted: number;
+         relationshipsInferred: number;
+         vfsFilesCreated: number;
+         graphNodesCreated: number;
+         graphEdgesCreated: number;
+         entitiesMerged: number;
+         entitiesNew: number;
+         processingTime: number;
+     };
+ }
+ /**
+  * ImportCoordinator - Main entry point for all imports
+  */
+ export declare class ImportCoordinator {
+     private brain;
+     private detector;
+     private deduplicator;
+     private history;
+     private excelImporter;
+     private pdfImporter;
+     private csvImporter;
+     private jsonImporter;
+     private markdownImporter;
+     private vfsGenerator;
+     constructor(brain: Brainy);
+     /**
+      * Initialize all importers
+      */
+     init(): Promise<void>;
+     /**
+      * Get import history
+      */
+     getHistory(): ImportHistory;
+     /**
+      * Import from any source with auto-detection
+      */
+     import(source: Buffer | string | object, options?: ImportOptions): Promise<ImportResult>;
+     /**
+      * Normalize source to ImportSource
+      */
+     private normalizeSource;
+     /**
+      * Check if string is a file path
+      */
+     private isFilePath;
+     /**
+      * Detect format from source
+      */
+     private detectFormat;
+     /**
+      * Extract entities using format-specific importer
+      */
+     private extract;
+     /**
+      * Create entities and relationships in knowledge graph
+      */
+     private createGraphEntities;
+     /**
+      * Normalize extraction result to unified format (Excel-like structure)
+      */
+     private normalizeExtractionResult;
+ }
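
A hedged sketch of driving the new ImportCoordinator end to end. The import paths, the sample file path, and the option values are placeholders; only the constructor, init(), and import() signatures and the ImportOptions fields declared above are assumed.

    // Sketch only: module paths and the Brainy instance setup are illustrative.
    import { Brainy } from '../brainy.js';
    import { ImportCoordinator } from './ImportCoordinator.js';

    async function importSpreadsheet(brain: Brainy) {
      const coordinator = new ImportCoordinator(brain);
      await coordinator.init();

      // Format is auto-detected (Excel here, via the .xlsx extension / PK magic bytes);
      // extracted entities land in the VFS under vfsPath and are mirrored into the graph.
      const result = await coordinator.import('/data/contacts.xlsx', {
        vfsPath: '/imports/contacts',
        groupBy: 'sheet',
        createEntities: true,
        createRelationships: true,
        onProgress: (p) => console.log(`[${p.stage}] ${p.message}`)
      });

      console.log(result.stats.entitiesExtracted, result.stats.graphNodesCreated);
      return result;
    }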