@soulcraft/brainy 3.27.1 → 3.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,11 @@
2
2
 
3
3
  All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines.
4
4
 
5
+ ### [3.28.0](https://github.com/soulcraftlabs/brainy/compare/v3.27.1...v3.28.0) (2025-10-08)
6
+
7
+ - feat: add unified import system with auto-detection and dual storage (a06e877)
8
+
9
+
5
10
  ### [3.27.1](https://github.com/soulcraftlabs/brainy/compare/v3.27.0...v3.27.1) (2025-10-08)
6
11
 
7
12
  - docs: clarify GCS storage type and config object pairing (dcbd0fd)
package/dist/brainy.d.ts CHANGED
@@ -640,6 +640,56 @@ export declare class Brainy<T = any> implements BrainyInterface<T> {
640
640
  confidence?: number;
641
641
  limit?: number;
642
642
  }): Promise<string[]>;
643
+ /**
644
+ * Import files with auto-detection and dual storage (VFS + Knowledge Graph)
645
+ *
646
+ * Unified import system that:
647
+ * - Auto-detects format (Excel, PDF, CSV, JSON, Markdown)
648
+ * - Extracts entities and relationships
649
+ * - Stores in both VFS (organized files) and Knowledge Graph (connected entities)
650
+ * - Links VFS files to graph entities
651
+ *
+ * @param source - Input to import, typed `Buffer | string | object`. The examples below pass a file path string, a buffer, and a JSON object.
+ * @param options - Optional settings. `format` names the input format explicitly (presumably bypassing auto-detection; TODO confirm) and `onProgress` is called with a `stage` and a `message`.
+ * @returns Promise resolving to the `ImportResult` type exported by `./import/ImportCoordinator.js`.
+ *
652
+ * @example
653
+ * // Import from file path
654
+ * const result = await brain.import('/path/to/file.xlsx')
655
+ *
656
+ * @example
657
+ * // Import from buffer
658
+ * const result = await brain.import(buffer, { format: 'pdf' })
659
+ *
660
+ * @example
661
+ * // Import JSON object
662
+ * const result = await brain.import({ entities: [...] })
663
+ *
664
+ * @example
665
+ * // Custom VFS path and grouping
666
+ * const result = await brain.import(buffer, {
667
+ * vfsPath: '/my-imports/data',
668
+ * groupBy: 'type',
669
+ * onProgress: (progress) => console.log(progress.message)
670
+ * })
671
+ */
672
+ import(source: Buffer | string | object, options?: {
673
+ format?: 'excel' | 'pdf' | 'csv' | 'json' | 'markdown'; // same literals as the formats listed in the summary above
674
+ vfsPath?: string;
675
+ groupBy?: 'type' | 'sheet' | 'flat' | 'custom';
676
+ customGrouping?: (entity: any) => string; // NOTE(review): presumably only consulted when groupBy is 'custom'; confirm
677
+ createEntities?: boolean;
678
+ createRelationships?: boolean;
679
+ preserveSource?: boolean;
680
+ enableNeuralExtraction?: boolean;
681
+ enableRelationshipInference?: boolean;
682
+ enableConceptExtraction?: boolean;
683
+ confidenceThreshold?: number; // NOTE(review): presumably a 0-1 cut-off for extracted entities; confirm
684
+ onProgress?: (progress: {
685
+ stage: 'detecting' | 'extracting' | 'storing-vfs' | 'storing-graph' | 'complete';
686
+ message: string;
687
+ processed?: number;
688
+ total?: number;
689
+ entities?: number;
690
+ relationships?: number;
691
+ }) => void;
692
+ }): Promise<import("./import/ImportCoordinator.js").ImportResult>;
643
693
  /**
644
694
  * Virtual File System API - Knowledge Operating System
645
695
  */
package/dist/brainy.js CHANGED
@@ -1419,6 +1419,42 @@ export class Brainy {
1419
1419
  // Apply limit if specified
1420
1420
  return options?.limit ? concepts.slice(0, options.limit) : concepts;
1421
1421
  }
1422
+ /**
1423
+ * Import files with auto-detection and dual storage (VFS + Knowledge Graph)
1424
+ *
1425
+ * Unified import system that:
1426
+ * - Auto-detects format (Excel, PDF, CSV, JSON, Markdown)
1427
+ * - Extracts entities and relationships
1428
+ * - Stores in both VFS (organized files) and Knowledge Graph (connected entities)
1429
+ * - Links VFS files to graph entities
1430
+ *
1431
+ * @example
1432
+ * // Import from file path
1433
+ * const result = await brain.import('/path/to/file.xlsx')
1434
+ *
1435
+ * @example
1436
+ * // Import from buffer
1437
+ * const result = await brain.import(buffer, { format: 'pdf' })
1438
+ *
1439
+ * @example
1440
+ * // Import JSON object
1441
+ * const result = await brain.import({ entities: [...] })
1442
+ *
1443
+ * @example
1444
+ * // Custom VFS path and grouping
1445
+ * const result = await brain.import(buffer, {
1446
+ * vfsPath: '/my-imports/data',
1447
+ * groupBy: 'type',
1448
+ * onProgress: (progress) => console.log(progress.message)
1449
+ * })
1450
+ */
1451
+ async import(source, options) {
1452
+ // Lazy load ImportCoordinator
1453
+ const { ImportCoordinator } = await import('./import/ImportCoordinator.js');
1454
+ const coordinator = new ImportCoordinator(this);
1455
+ await coordinator.init();
1456
+ return await coordinator.import(source, options);
1457
+ }
1422
1458
  /**
1423
1459
  * Virtual File System API - Knowledge Operating System
1424
1460
  */
@@ -0,0 +1,84 @@
1
+ /**
2
+ * Entity Deduplicator
3
+ *
4
+ * Finds and merges duplicate entities across imports using:
5
+ * - Embedding-based similarity matching
6
+ * - Type-aware comparison
7
+ * - Confidence-weighted merging
8
+ * - Provenance tracking
9
+ *
10
+ * NO MOCKS - Production-ready implementation
11
+ */
12
+ import { Brainy } from '../brainy.js';
13
+ import { NounType } from '../types/graphTypes.js';
14
+ export interface EntityCandidate { // Entity proposed by an import; checked against existing entities before being stored
15
+ id?: string; // Assigned by createOrMerge() when the candidate is stored as a new entity
16
+ name: string; // Compared (after normalization) with existing entity names; stored as metadata.name
17
+ type: NounType; // Used as the search filter and type check when strictTypeMatching is enabled
18
+ description: string; // Combined with name to form the duplicate-search query
19
+ confidence: number; // Candidates below the minConfidence option are never matched against existing entities
20
+ metadata: Record<string, any>; // Copied onto newly created entities; metadata.vfsPath is tracked in vfsPaths
21
+ }
22
+ export interface DuplicateMatch { // Result of a successful duplicate lookup
23
+ existingId: string; // Id of the matching search result
24
+ existingName: string; // The matching entity's metadata name, or its id when it has no name
25
+ similarity: number; // 1.0 for an exact normalized-name match, otherwise the search score
26
+ shouldMerge: boolean; // true for every match findDuplicates() currently produces
27
+ reason: string; // 'Exact name match' or 'High similarity (NN.N%)'
28
+ }
29
+ export interface EntityDeduplicationOptions {
30
+ /** Similarity threshold for considering entities as duplicates (0-1); search results scoring below it are skipped. Defaults to 0.85 */
31
+ similarityThreshold?: number;
32
+ /** Only match entities of the same type. Defaults to true; only an explicit false disables it */
33
+ strictTypeMatching?: boolean;
34
+ /** Enable fuzzy name matching; when false only exact normalized-name matches count. Defaults to true */
35
+ enableFuzzyMatching?: boolean;
36
+ /** Minimum candidate confidence to consider for merging; lower-confidence candidates are never matched. Defaults to 0.6 */
37
+ minConfidence?: number;
38
+ }
39
+ export interface MergeResult { // Outcome of mergeEntity() / createOrMerge()
40
+ mergedEntityId: string; // Id of the existing entity merged into, or of the newly created entity
41
+ wasMerged: boolean; // true when merged into an existing entity, false when a new entity was created
42
+ mergedWith?: string; // Existing entity's metadata name (or its id); absent when a new entity was created
43
+ confidence: number; // Merged confidence after a merge, otherwise the candidate's own confidence
44
+ provenance: string[]; // Import sources recorded on the entity (its metadata.imports)
45
+ }
46
+ /**
47
+ * EntityDeduplicator - Prevents duplicate entities across imports by searching the brain for similar entities and merging into them
48
+ */
49
+ export declare class EntityDeduplicator {
50
+ private brain;
51
+ constructor(brain: Brainy);
52
+ /**
53
+ * Find an existing entity that duplicates the candidate; resolves to the first qualifying match, or null when none qualifies or the search fails
54
+ */
55
+ findDuplicates(candidate: EntityCandidate, options?: EntityDeduplicationOptions): Promise<DuplicateMatch | null>;
56
+ /**
57
+ * Merge the candidate into the entity with id existingId and append importSource to its provenance; rejects with "Failed to merge entity: ..." when the entity is missing or the update fails
58
+ */
59
+ mergeEntity(existingId: string, candidate: EntityCandidate, importSource: string): Promise<MergeResult>;
60
+ /**
61
+ * Merge the candidate into a detected duplicate, or add it as a new entity (setting candidate.id) when no duplicate is found
62
+ */
63
+ createOrMerge(candidate: EntityCandidate, importSource: string, options?: EntityDeduplicationOptions): Promise<MergeResult>;
64
+ /**
65
+ * Normalize string for comparison
66
+ */
67
+ private normalizeString;
68
+ /**
69
+ * Check if two names are similar (fuzzy matching: normalized edit-distance similarity of at least 0.85)
70
+ */
71
+ private areSimilarNames;
72
+ /**
73
+ * Calculate Levenshtein distance between two strings
74
+ */
75
+ private levenshteinDistance;
76
+ /**
77
+ * Merge confidence scores (weighted average favoring higher confidence: 0.6 on the higher score, 0.4 on the lower)
78
+ */
79
+ private mergeConfidence;
80
+ /**
81
+ * Merge metadata fields: unions the concepts, tags and categories lists and keeps the longer description
82
+ */
83
+ private mergeMetadataFields;
84
+ }
@@ -0,0 +1,255 @@
1
+ /**
2
+ * Entity Deduplicator
3
+ *
4
+ * Finds and merges duplicate entities across imports using:
5
+ * - Embedding-based similarity matching
6
+ * - Type-aware comparison
7
+ * - Confidence-weighted merging
8
+ * - Provenance tracking
9
+ *
10
+ * NO MOCKS - Production-ready implementation
11
+ */
12
/**
 * EntityDeduplicator - Prevents duplicate entities across imports
 *
 * Looks up likely duplicates of a candidate through the brain's search,
 * confirms them by comparing names, and then either merges the candidate
 * into the existing entity or stores it as a new one.
 */
export class EntityDeduplicator {
    /**
     * @param brain Brainy instance; its `find`, `get`, `add` and `update`
     *   methods are used by this class.
     */
    constructor(brain) {
        this.brain = brain;
    }
    /**
     * Find duplicate entities in the knowledge graph
     *
     * @param candidate Entity to check (`name`, `type`, `description`, `confidence`).
     * @param options Matching options. Defaults: `similarityThreshold` 0.85,
     *   `strictTypeMatching` true, `enableFuzzyMatching` true,
     *   `minConfidence` 0.6. An explicit `0` is honored for both numeric options.
     * @returns The first qualifying match, or `null` when the candidate is
     *   below `minConfidence`, nothing qualifies, or the search fails.
     */
    async findDuplicates(candidate, options = {}) {
        const requested = options ?? {};
        const opts = {
            // `??` rather than `||` so a caller can legitimately pass 0.
            similarityThreshold: requested.similarityThreshold ?? 0.85,
            strictTypeMatching: requested.strictTypeMatching !== false,
            enableFuzzyMatching: requested.enableFuzzyMatching !== false,
            minConfidence: requested.minConfidence ?? 0.6
        };
        // Skip low-confidence candidates
        if (candidate.confidence < opts.minConfidence) {
            return null;
        }
        // Search by name and description; a missing description must not
        // leak the literal text "undefined" into the query.
        const searchText = [candidate.name, candidate.description]
            .filter(Boolean)
            .join(' ')
            .trim();
        const candidateKey = this.normalizeString(candidate.name);
        try {
            const results = await this.brain.find({
                query: searchText,
                limit: 5,
                where: opts.strictTypeMatching ? { type: candidate.type } : undefined
            });
            for (const result of results) {
                const similarity = result.score || 0;
                // Everything below the threshold is ignored, including exact names.
                if (similarity < opts.similarityThreshold) {
                    continue;
                }
                const metadata = result.entity?.metadata;
                const existingName = metadata?.name || result.id;
                const existingType = metadata?.type || metadata?.nounType || result.entity?.type;
                if (opts.strictTypeMatching && existingType !== candidate.type) {
                    continue;
                }
                // Exact name match after normalization. Names that normalize to
                // the empty string carry no information and never count as equal.
                if (candidateKey !== '' && candidateKey === this.normalizeString(existingName)) {
                    return {
                        existingId: result.id,
                        existingName,
                        similarity: 1.0,
                        shouldMerge: true,
                        reason: 'Exact name match'
                    };
                }
                // The score already cleared the threshold above; fuzzy name
                // agreement is the remaining requirement.
                if (opts.enableFuzzyMatching && this.areSimilarNames(candidate.name, existingName)) {
                    return {
                        existingId: result.id,
                        existingName,
                        similarity,
                        shouldMerge: true,
                        reason: `High similarity (${(similarity * 100).toFixed(1)}%)`
                    };
                }
            }
        }
        catch (error) {
            // Deliberate best-effort: if the search fails, assume no duplicates
            // so the import can still create the entity.
            return null;
        }
        return null;
    }
    /**
     * Merge entity data with existing entity
     *
     * @param existingId Id of the entity to merge into.
     * @param candidate Incoming entity data.
     * @param importSource Identifier appended to the entity's `imports` list.
     * @returns Merge summary with the merged confidence and provenance.
     * @throws Error prefixed with "Failed to merge entity:" when the entity
     *   does not exist or the lookup/update fails.
     */
    async mergeEntity(existingId, candidate, importSource) {
        try {
            const existing = await this.brain.get(existingId);
            if (!existing) {
                throw new Error(`Entity ${existingId} not found`);
            }
            // Entities stored without metadata are merged as if it were empty.
            const existingMeta = existing.metadata ?? {};
            const incomingMeta = candidate.metadata ?? {};
            const previousImports = Array.isArray(existingMeta.imports) ? existingMeta.imports : [];
            const knownPaths = Array.isArray(existingMeta.vfsPaths)
                ? existingMeta.vfsPaths
                : [existingMeta.vfsPaths, existingMeta.vfsPath];
            // De-duplicate so re-importing the same file does not grow the list.
            const vfsPaths = [...new Set([...knownPaths, incomingMeta.vfsPath].filter(Boolean))];
            const mergedMetadata = {
                ...existingMeta,
                // Track provenance
                imports: [...previousImports, importSource],
                vfsPaths,
                // `??` keeps a stored confidence of 0; 0.5 is only the fallback
                // for entities that have no confidence recorded.
                confidence: this.mergeConfidence(existingMeta.confidence ?? 0.5, candidate.confidence),
                // concepts / tags / categories / description
                ...this.mergeMetadataFields(existingMeta, incomingMeta),
                // Track last update
                lastUpdated: Date.now(),
                mergeCount: (existingMeta.mergeCount || 0) + 1
            };
            await this.brain.update({
                id: existingId,
                metadata: mergedMetadata,
                merge: true
            });
            return {
                mergedEntityId: existingId,
                wasMerged: true,
                mergedWith: existingMeta.name || existingId,
                confidence: mergedMetadata.confidence,
                provenance: mergedMetadata.imports
            };
        }
        catch (error) {
            throw new Error(`Failed to merge entity: ${error instanceof Error ? error.message : String(error)}`);
        }
    }
    /**
     * Create or merge entity with deduplication
     *
     * @param candidate Entity to store; `candidate.id` is set when a new
     *   entity is created.
     * @param importSource Identifier recorded in the entity's `imports` list.
     * @param options Passed through to `findDuplicates`.
     * @returns Merge summary; `wasMerged` tells whether an existing entity was reused.
     */
    async createOrMerge(candidate, importSource, options = {}) {
        const duplicate = await this.findDuplicates(candidate, options);
        if (duplicate && duplicate.shouldMerge) {
            return await this.mergeEntity(duplicate.existingId, candidate, importSource);
        }
        // No duplicate found, create new entity
        const entityId = await this.brain.add({
            data: candidate.description || candidate.name,
            type: candidate.type,
            metadata: {
                ...candidate.metadata,
                name: candidate.name,
                confidence: candidate.confidence,
                imports: [importSource],
                vfsPaths: [candidate.metadata?.vfsPath].filter(Boolean),
                createdAt: Date.now(),
                mergeCount: 0
            }
        });
        // Update candidate with new ID
        candidate.id = entityId;
        return {
            mergedEntityId: entityId,
            wasMerged: false,
            confidence: candidate.confidence,
            provenance: [importSource]
        };
    }
    /**
     * Normalize string for comparison
     *
     * Lowercases and keeps only letters and digits of any script, so that
     * non-Latin names are preserved instead of collapsing to the empty
     * string. NFKC normalization makes composed and decomposed forms of the
     * same character compare equal.
     *
     * @param str Value to normalize; null/undefined become the empty string.
     * @returns The normalized comparison key.
     */
    normalizeString(str) {
        return String(str ?? '')
            .normalize('NFKC')
            .toLowerCase()
            .replace(/[^\p{L}\p{N}]/gu, '');
    }
    /**
     * Check if two names are similar (fuzzy matching)
     *
     * @returns true when the normalized names are equal, or differ in length
     *   by at most 3 characters and have an edit-distance similarity of at
     *   least 0.85. Names that normalize to the empty string never match.
     */
    areSimilarNames(name1, name2) {
        const n1 = this.normalizeString(name1);
        const n2 = this.normalizeString(name2);
        // Nothing left to compare: do not report a match.
        if (n1 === '' || n2 === '')
            return false;
        if (n1 === n2)
            return true;
        // Lengths are counted in code points, matching levenshteinDistance.
        const len1 = Array.from(n1).length;
        const len2 = Array.from(n2).length;
        if (Math.abs(len1 - len2) > 3)
            return false;
        const distance = this.levenshteinDistance(n1, n2);
        const similarity = 1 - (distance / Math.max(len1, len2));
        return similarity >= 0.85;
    }
    /**
     * Calculate Levenshtein distance between two strings
     *
     * Compares code points (not UTF-16 units) and keeps only two rows of the
     * dynamic-programming table.
     *
     * @returns Minimum number of insertions, deletions and substitutions.
     */
    levenshteinDistance(str1, str2) {
        const a = Array.from(str1);
        const b = Array.from(str2);
        if (a.length === 0)
            return b.length;
        if (b.length === 0)
            return a.length;
        // previous[j] = distance between the processed prefix of a and b[0..j)
        let previous = Array.from({ length: b.length + 1 }, (_, j) => j);
        for (let i = 1; i <= a.length; i++) {
            const current = [i];
            for (let j = 1; j <= b.length; j++) {
                current[j] = a[i - 1] === b[j - 1]
                    ? previous[j - 1]
                    : 1 + Math.min(previous[j], // deletion
                    current[j - 1], // insertion
                    previous[j - 1] // substitution
                    );
            }
            previous = current;
        }
        return previous[b.length];
    }
    /**
     * Merge confidence scores (weighted average favoring higher confidence)
     *
     * @returns 0.6 x the higher score + 0.4 x the lower score.
     */
    mergeConfidence(existing, incoming) {
        const weights = existing > incoming ? [0.6, 0.4] : [0.4, 0.6];
        return existing * weights[0] + incoming * weights[1];
    }
    /**
     * Merge metadata fields intelligently
     *
     * Unions `concepts`, `tags` and `categories` (a lone non-array value is
     * treated as a one-element list) and keeps the longer `description`;
     * the incoming description wins a tie.
     *
     * @param existing Metadata already stored; may be null/undefined.
     * @param incoming Metadata from the candidate; may be null/undefined.
     * @returns Object containing only the merged fields.
     */
    mergeMetadataFields(existing, incoming) {
        const current = existing ?? {};
        const update = incoming ?? {};
        const toList = (value) => (Array.isArray(value) ? value : value ? [value] : []);
        const merged = {};
        for (const field of ['concepts', 'tags', 'categories']) {
            if (current[field] || update[field]) {
                merged[field] = [...new Set([...toList(current[field]), ...toList(update[field])])];
            }
        }
        // Prefer longer descriptions
        if (current.description || update.description) {
            merged.description = (current.description || '').length > (update.description || '').length
                ? current.description
                : update.description;
        }
        return merged;
    }
}
255
+ //# sourceMappingURL=EntityDeduplicator.js.map
@@ -0,0 +1,65 @@
1
+ /**
2
+ * Format Detector
3
+ *
4
+ * Unified format detection for all import types using:
5
+ * - Magic byte signatures (PDF, Excel, images)
6
+ * - File extensions
7
+ * - Content analysis (JSON, Markdown, CSV)
8
+ *
9
+ * NO MOCKS - Production-ready implementation
10
+ */
11
+ export type SupportedFormat = 'excel' | 'pdf' | 'csv' | 'json' | 'markdown'; // Formats a DetectionResult can report
12
+ export interface DetectionResult {
13
+ format: SupportedFormat;
14
+ confidence: number; // NOTE(review): presumably a 0-1 score; confirm against the implementation
15
+ evidence: string[]; // NOTE(review): presumably the signals that led to this result; confirm against the implementation
16
+ }
17
+ /**
18
+ * FormatDetector - Detect file format from various inputs (Buffer, file path, string content, or object); every public method returns a DetectionResult or null
19
+ */
20
+ export declare class FormatDetector {
21
+ /**
22
+ * Detect format from buffer (null presumably means no supported format was recognized; TODO confirm)
23
+ */
24
+ detectFromBuffer(buffer: Buffer): DetectionResult | null;
25
+ /**
26
+ * Detect format from file path (null presumably means no supported format was recognized; TODO confirm)
27
+ */
28
+ detectFromPath(path: string): DetectionResult | null;
29
+ /**
30
+ * Detect format from string content (null presumably means no supported format was recognized; TODO confirm)
31
+ */
32
+ detectFromString(content: string): DetectionResult | null;
33
+ /**
34
+ * Detect format from object (null presumably means no supported format was recognized; TODO confirm)
35
+ */
36
+ detectFromObject(obj: any): DetectionResult | null;
37
+ /**
38
+ * Detect by magic bytes
39
+ */
40
+ private detectByMagicBytes;
41
+ /**
42
+ * Detect by content analysis
43
+ */
44
+ private detectByContent;
45
+ /**
46
+ * Check if content looks like JSON
47
+ */
48
+ private looksLikeJSON;
49
+ /**
50
+ * Check if content looks like Markdown
51
+ */
52
+ private looksLikeMarkdown;
53
+ /**
54
+ * Check if content looks like CSV
55
+ */
56
+ private looksLikeCSV;
57
+ /**
58
+ * Check if content is text-based (not binary)
59
+ */
60
+ private isTextContent;
61
+ /**
62
+ * Get file extension from path
63
+ */
64
+ private getExtension;
65
+ }