@soulcraft/brainy 3.27.1 → 3.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/dist/brainy.d.ts +50 -0
- package/dist/brainy.js +54 -2
- package/dist/config/storageAutoConfig.d.ts +2 -1
- package/dist/config/storageAutoConfig.js +5 -4
- package/dist/import/EntityDeduplicator.d.ts +84 -0
- package/dist/import/EntityDeduplicator.js +255 -0
- package/dist/import/FormatDetector.d.ts +65 -0
- package/dist/import/FormatDetector.js +263 -0
- package/dist/import/ImportCoordinator.d.ts +160 -0
- package/dist/import/ImportCoordinator.js +498 -0
- package/dist/import/ImportHistory.d.ts +92 -0
- package/dist/import/ImportHistory.js +183 -0
- package/dist/import/index.d.ts +16 -0
- package/dist/import/index.js +14 -0
- package/dist/importers/SmartCSVImporter.d.ts +136 -0
- package/dist/importers/SmartCSVImporter.js +308 -0
- package/dist/importers/SmartExcelImporter.d.ts +131 -0
- package/dist/importers/SmartExcelImporter.js +302 -0
- package/dist/importers/SmartImportOrchestrator.d.ts +125 -0
- package/dist/importers/SmartImportOrchestrator.js +531 -0
- package/dist/importers/SmartJSONImporter.d.ts +135 -0
- package/dist/importers/SmartJSONImporter.js +325 -0
- package/dist/importers/SmartMarkdownImporter.d.ts +159 -0
- package/dist/importers/SmartMarkdownImporter.js +369 -0
- package/dist/importers/SmartPDFImporter.d.ts +154 -0
- package/dist/importers/SmartPDFImporter.js +337 -0
- package/dist/importers/VFSStructureGenerator.d.ts +82 -0
- package/dist/importers/VFSStructureGenerator.js +260 -0
- package/dist/importers/index.d.ts +28 -0
- package/dist/importers/index.js +29 -0
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
@@ -2,6 +2,18 @@
 
 All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines.
 
+## [3.29.0](https://github.com/soulcraftlabs/brainy/compare/v3.28.0...v3.29.0) (2025-10-09)
+
+
+### 🐛 Bug Fixes
+
+* enable GCS native storage with Application Default Credentials ([1e77ecd](https://github.com/soulcraftlabs/brainy/commit/1e77ecd145d3dea46e04ca5ecc6692b41e569c1e))
+
+### [3.28.0](https://github.com/soulcraftlabs/brainy/compare/v3.27.1...v3.28.0) (2025-10-08)
+
+- feat: add unified import system with auto-detection and dual storage (a06e877)
+
+
 ### [3.27.1](https://github.com/soulcraftlabs/brainy/compare/v3.27.0...v3.27.1) (2025-10-08)
 
 - docs: clarify GCS storage type and config object pairing (dcbd0fd)
package/dist/brainy.d.ts
CHANGED
@@ -640,6 +640,56 @@ export declare class Brainy<T = any> implements BrainyInterface<T> {
         confidence?: number;
         limit?: number;
     }): Promise<string[]>;
+    /**
+     * Import files with auto-detection and dual storage (VFS + Knowledge Graph)
+     *
+     * Unified import system that:
+     * - Auto-detects format (Excel, PDF, CSV, JSON, Markdown)
+     * - Extracts entities and relationships
+     * - Stores in both VFS (organized files) and Knowledge Graph (connected entities)
+     * - Links VFS files to graph entities
+     *
+     * @example
+     * // Import from file path
+     * const result = await brain.import('/path/to/file.xlsx')
+     *
+     * @example
+     * // Import from buffer
+     * const result = await brain.import(buffer, { format: 'pdf' })
+     *
+     * @example
+     * // Import JSON object
+     * const result = await brain.import({ entities: [...] })
+     *
+     * @example
+     * // Custom VFS path and grouping
+     * const result = await brain.import(buffer, {
+     *   vfsPath: '/my-imports/data',
+     *   groupBy: 'type',
+     *   onProgress: (progress) => console.log(progress.message)
+     * })
+     */
+    import(source: Buffer | string | object, options?: {
+        format?: 'excel' | 'pdf' | 'csv' | 'json' | 'markdown';
+        vfsPath?: string;
+        groupBy?: 'type' | 'sheet' | 'flat' | 'custom';
+        customGrouping?: (entity: any) => string;
+        createEntities?: boolean;
+        createRelationships?: boolean;
+        preserveSource?: boolean;
+        enableNeuralExtraction?: boolean;
+        enableRelationshipInference?: boolean;
+        enableConceptExtraction?: boolean;
+        confidenceThreshold?: number;
+        onProgress?: (progress: {
+            stage: 'detecting' | 'extracting' | 'storing-vfs' | 'storing-graph' | 'complete';
+            message: string;
+            processed?: number;
+            total?: number;
+            entities?: number;
+            relationships?: number;
+        }) => void;
+    }): Promise<import("./import/ImportCoordinator.js").ImportResult>;
     /**
      * Virtual File System API - Knowledge Operating System
      */
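For context, a minimal usage sketch of the new declaration (the root import of Brainy and the handling of the returned ImportResult are assumptions; ImportResult is declared in ./import/ImportCoordinator.d.ts, which is not shown in this diff):

```ts
import { readFile } from 'node:fs/promises'
// Assumption: Brainy is re-exported from the package root.
import type { Brainy } from '@soulcraft/brainy'

async function importSpreadsheet(brain: Brainy, path: string) {
  const buffer = await readFile(path)

  // Options mirror the declaration added above.
  const result = await brain.import(buffer, {
    format: 'excel',
    vfsPath: '/imports/spreadsheets',
    groupBy: 'type',
    onProgress: (progress) => {
      // stage: 'detecting' | 'extracting' | 'storing-vfs' | 'storing-graph' | 'complete'
      console.log(`[${progress.stage}] ${progress.message}`)
    }
  })

  // The ImportResult fields are defined in ./import/ImportCoordinator.d.ts (not shown here).
  return result
}
```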
package/dist/brainy.js
CHANGED
@@ -1419,6 +1419,42 @@ export class Brainy {
         // Apply limit if specified
         return options?.limit ? concepts.slice(0, options.limit) : concepts;
     }
+    /**
+     * Import files with auto-detection and dual storage (VFS + Knowledge Graph)
+     *
+     * Unified import system that:
+     * - Auto-detects format (Excel, PDF, CSV, JSON, Markdown)
+     * - Extracts entities and relationships
+     * - Stores in both VFS (organized files) and Knowledge Graph (connected entities)
+     * - Links VFS files to graph entities
+     *
+     * @example
+     * // Import from file path
+     * const result = await brain.import('/path/to/file.xlsx')
+     *
+     * @example
+     * // Import from buffer
+     * const result = await brain.import(buffer, { format: 'pdf' })
+     *
+     * @example
+     * // Import JSON object
+     * const result = await brain.import({ entities: [...] })
+     *
+     * @example
+     * // Custom VFS path and grouping
+     * const result = await brain.import(buffer, {
+     *   vfsPath: '/my-imports/data',
+     *   groupBy: 'type',
+     *   onProgress: (progress) => console.log(progress.message)
+     * })
+     */
+    async import(source, options) {
+        // Lazy load ImportCoordinator
+        const { ImportCoordinator } = await import('./import/ImportCoordinator.js');
+        const coordinator = new ImportCoordinator(this);
+        await coordinator.init();
+        return await coordinator.import(source, options);
+    }
     /**
      * Virtual File System API - Knowledge Operating System
      */
@@ -2238,8 +2274,24 @@ export class Brainy {
      */
     normalizeConfig(config) {
         // Validate storage configuration
-        if (config?.storage?.type && !['auto', 'memory', 'filesystem', 'opfs', 'remote', 's3', 'r2', 'gcs'].includes(config.storage.type)) {
-            throw new Error(`Invalid storage type: ${config.storage.type}. Must be one of: auto, memory, filesystem, opfs, remote, s3, r2, gcs`);
+        if (config?.storage?.type && !['auto', 'memory', 'filesystem', 'opfs', 'remote', 's3', 'r2', 'gcs', 'gcs-native'].includes(config.storage.type)) {
+            throw new Error(`Invalid storage type: ${config.storage.type}. Must be one of: auto, memory, filesystem, opfs, remote, s3, r2, gcs, gcs-native`);
+        }
+        // Validate storage type/config pairing (catch common mismatches)
+        if (config?.storage) {
+            const storage = config.storage;
+            // Check for gcs/gcsNativeStorage mismatch
+            if (storage.type === 'gcs' && storage.gcsNativeStorage) {
+                throw new Error(`Storage type/config mismatch: type 'gcs' requires 'gcsStorage' config object (S3-compatible). ` +
+                    `You provided 'gcsNativeStorage' which requires type 'gcs-native'. ` +
+                    `Either change type to 'gcs-native' or use 'gcsStorage' instead of 'gcsNativeStorage'.`);
+            }
+            // Check for gcs-native/gcsStorage mismatch
+            if (storage.type === 'gcs-native' && storage.gcsStorage) {
+                throw new Error(`Storage type/config mismatch: type 'gcs-native' requires 'gcsNativeStorage' config object. ` +
+                    `You provided 'gcsStorage' which requires type 'gcs' (S3-compatible). ` +
+                    `Either change type to 'gcs' or use 'gcsNativeStorage' instead of 'gcsStorage'.`);
+            }
         }
         // Validate model configuration
         if (config?.model?.type && !['fast', 'accurate', 'custom'].includes(config.model.type)) {
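The new pairing check makes the 'gcs' vs 'gcs-native' distinction explicit. A minimal sketch of one configuration that passes and one that normalizeConfig now rejects (only the keys named in the code and error messages above are used; the full shape of each config object is not part of this diff):

```ts
// Accepted: 'gcs-native' paired with its gcsNativeStorage config object
// (credentials come from Application Default Credentials).
const nativeConfig = {
  storage: {
    type: 'gcs-native',
    gcsNativeStorage: { bucketName: 'brainy-data' }
  }
}

// Rejected by normalizeConfig: type 'gcs' combined with gcsNativeStorage now throws
// "Storage type/config mismatch: type 'gcs' requires 'gcsStorage' config object (S3-compatible). ..."
const mismatchedConfig = {
  storage: {
    type: 'gcs',
    gcsNativeStorage: { bucketName: 'brainy-data' }
  }
}
```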
package/dist/config/storageAutoConfig.d.ts
CHANGED
@@ -11,6 +11,7 @@ export declare enum StorageType {
     OPFS = "opfs",
     S3 = "s3",
     GCS = "gcs",
+    GCS_NATIVE = "gcs-native",
     R2 = "r2"
 }
 /**
@@ -22,7 +23,7 @@ export declare enum StoragePreset {
     DISK = "disk",
     CLOUD = "cloud"
 }
-export type StorageTypeString = 'memory' | 'filesystem' | 'opfs' | 's3' | 'gcs' | 'r2';
+export type StorageTypeString = 'memory' | 'filesystem' | 'opfs' | 's3' | 'gcs' | 'r2' | 'gcs-native';
 export type StoragePresetString = 'auto' | 'memory' | 'disk' | 'cloud';
 export interface StorageConfigResult {
     type: StorageType | StorageTypeString;
package/dist/config/storageAutoConfig.js
CHANGED
@@ -13,6 +13,7 @@ export var StorageType;
     StorageType["OPFS"] = "opfs";
     StorageType["S3"] = "s3";
     StorageType["GCS"] = "gcs";
+    StorageType["GCS_NATIVE"] = "gcs-native";
     StorageType["R2"] = "r2";
 })(StorageType || (StorageType = {}));
 /**
@@ -170,14 +171,14 @@ async function detectCloudStorage() {
             }
         };
     }
-    // Google Cloud Storage Detection
+    // Google Cloud Storage Detection (Native SDK with ADC)
     if (hasGCPConfig()) {
         return {
-            type: StorageType.
+            type: StorageType.GCS_NATIVE,
             config: {
-
+                gcsNativeStorage: {
                     bucketName: process.env.GCS_BUCKET || process.env.GOOGLE_STORAGE_BUCKET || 'brainy-data',
-                // Credentials will be picked up
+                    // Application Default Credentials will be picked up automatically
                 }
             }
         };
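For reference, a sketch of the StorageConfigResult that the GCS branch of detectCloudStorage() now produces (values shown are the defaults from the code above; how hasGCPConfig() decides that GCP configuration is present is not shown in this diff):

```ts
// Sketch of the auto-detected cloud configuration after this change.
// The bucket name falls back through GCS_BUCKET, then GOOGLE_STORAGE_BUCKET, then 'brainy-data'.
const detected = {
  type: 'gcs-native', // StorageType.GCS_NATIVE
  config: {
    gcsNativeStorage: {
      bucketName: process.env.GCS_BUCKET || process.env.GOOGLE_STORAGE_BUCKET || 'brainy-data'
      // credentials are resolved via Application Default Credentials; no keys appear in the config
    }
  }
}
```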
package/dist/import/EntityDeduplicator.d.ts
ADDED
@@ -0,0 +1,84 @@
+/**
+ * Entity Deduplicator
+ *
+ * Finds and merges duplicate entities across imports using:
+ * - Embedding-based similarity matching
+ * - Type-aware comparison
+ * - Confidence-weighted merging
+ * - Provenance tracking
+ *
+ * NO MOCKS - Production-ready implementation
+ */
+import { Brainy } from '../brainy.js';
+import { NounType } from '../types/graphTypes.js';
+export interface EntityCandidate {
+    id?: string;
+    name: string;
+    type: NounType;
+    description: string;
+    confidence: number;
+    metadata: Record<string, any>;
+}
+export interface DuplicateMatch {
+    existingId: string;
+    existingName: string;
+    similarity: number;
+    shouldMerge: boolean;
+    reason: string;
+}
+export interface EntityDeduplicationOptions {
+    /** Similarity threshold for considering entities as duplicates (0-1) */
+    similarityThreshold?: number;
+    /** Only match entities of the same type */
+    strictTypeMatching?: boolean;
+    /** Enable fuzzy name matching */
+    enableFuzzyMatching?: boolean;
+    /** Minimum confidence to consider for merging */
+    minConfidence?: number;
+}
+export interface MergeResult {
+    mergedEntityId: string;
+    wasMerged: boolean;
+    mergedWith?: string;
+    confidence: number;
+    provenance: string[];
+}
+/**
+ * EntityDeduplicator - Prevents duplicate entities across imports
+ */
+export declare class EntityDeduplicator {
+    private brain;
+    constructor(brain: Brainy);
+    /**
+     * Find duplicate entities in the knowledge graph
+     */
+    findDuplicates(candidate: EntityCandidate, options?: EntityDeduplicationOptions): Promise<DuplicateMatch | null>;
+    /**
+     * Merge entity data with existing entity
+     */
+    mergeEntity(existingId: string, candidate: EntityCandidate, importSource: string): Promise<MergeResult>;
+    /**
+     * Create or merge entity with deduplication
+     */
+    createOrMerge(candidate: EntityCandidate, importSource: string, options?: EntityDeduplicationOptions): Promise<MergeResult>;
+    /**
+     * Normalize string for comparison
+     */
+    private normalizeString;
+    /**
+     * Check if two names are similar (fuzzy matching)
+     */
+    private areSimilarNames;
+    /**
+     * Calculate Levenshtein distance between two strings
+     */
+    private levenshteinDistance;
+    /**
+     * Merge confidence scores (weighted average favoring higher confidence)
+     */
+    private mergeConfidence;
+    /**
+     * Merge metadata fields intelligently
+     */
+    private mergeMetadataFields;
+}
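A minimal usage sketch of the declared API (the import paths, the NounType member, and the initialized brain instance are assumptions):

```ts
// Import paths are illustrative (relative to the package's dist/ output).
import { EntityDeduplicator } from './dist/import/EntityDeduplicator.js'
import { NounType } from './dist/types/graphTypes.js'

async function dedupeExample(brain: any) {
  const dedup = new EntityDeduplicator(brain)

  const candidate = {
    name: 'Acme Corporation',
    type: NounType.Organization, // assumed member; actual NounType values are not in this diff
    description: 'Industrial supplier mentioned in the Q3 spreadsheet',
    confidence: 0.9,
    metadata: { vfsPath: '/imports/q3.xlsx' }
  }

  // Reuses an existing entity when similarity clears the 0.85 default threshold,
  // otherwise creates a new one and records the import source as provenance.
  const result = await dedup.createOrMerge(candidate, 'q3-spreadsheet-import', {
    similarityThreshold: 0.85,
    strictTypeMatching: true
  })

  console.log(result.wasMerged ? `merged into ${result.mergedEntityId}` : `created ${result.mergedEntityId}`)
}
```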
package/dist/import/EntityDeduplicator.js
ADDED
@@ -0,0 +1,255 @@
+/**
+ * Entity Deduplicator
+ *
+ * Finds and merges duplicate entities across imports using:
+ * - Embedding-based similarity matching
+ * - Type-aware comparison
+ * - Confidence-weighted merging
+ * - Provenance tracking
+ *
+ * NO MOCKS - Production-ready implementation
+ */
+/**
+ * EntityDeduplicator - Prevents duplicate entities across imports
+ */
+export class EntityDeduplicator {
+    constructor(brain) {
+        this.brain = brain;
+    }
+    /**
+     * Find duplicate entities in the knowledge graph
+     */
+    async findDuplicates(candidate, options = {}) {
+        const opts = {
+            similarityThreshold: options.similarityThreshold || 0.85,
+            strictTypeMatching: options.strictTypeMatching !== false,
+            enableFuzzyMatching: options.enableFuzzyMatching !== false,
+            minConfidence: options.minConfidence || 0.6
+        };
+        // Skip low-confidence candidates
+        if (candidate.confidence < opts.minConfidence) {
+            return null;
+        }
+        // Search for similar entities by name and description
+        const searchText = `${candidate.name} ${candidate.description}`.trim();
+        try {
+            const results = await this.brain.find({
+                query: searchText,
+                limit: 5,
+                where: opts.strictTypeMatching ? { type: candidate.type } : undefined
+            });
+            // Check each result for potential duplicates
+            for (const result of results) {
+                const similarity = result.score || 0;
+                const existingName = result.entity.metadata?.name || result.id;
+                const existingType = result.entity.metadata?.type || result.entity.metadata?.nounType || result.entity.type;
+                // Skip if below similarity threshold
+                if (similarity < opts.similarityThreshold) {
+                    continue;
+                }
+                // Type matching check
+                if (opts.strictTypeMatching && existingType !== candidate.type) {
+                    continue;
+                }
+                // Exact name match (case-insensitive)
+                if (this.normalizeString(candidate.name) === this.normalizeString(existingName)) {
+                    return {
+                        existingId: result.id,
+                        existingName,
+                        similarity: 1.0,
+                        shouldMerge: true,
+                        reason: 'Exact name match'
+                    };
+                }
+                // High similarity match
+                if (similarity >= opts.similarityThreshold) {
+                    // Additional validation for fuzzy matching
+                    if (opts.enableFuzzyMatching && this.areSimilarNames(candidate.name, existingName)) {
+                        return {
+                            existingId: result.id,
+                            existingName,
+                            similarity,
+                            shouldMerge: true,
+                            reason: `High similarity (${(similarity * 100).toFixed(1)}%)`
+                        };
+                    }
+                }
+            }
+        }
+        catch (error) {
+            // If search fails, assume no duplicates
+            return null;
+        }
+        return null;
+    }
+    /**
+     * Merge entity data with existing entity
+     */
+    async mergeEntity(existingId, candidate, importSource) {
+        try {
+            // Get existing entity
+            const existing = await this.brain.get(existingId);
+            if (!existing) {
+                throw new Error(`Entity ${existingId} not found`);
+            }
+            // Merge metadata
+            const mergedMetadata = {
+                ...existing.metadata,
+                // Track provenance
+                imports: [
+                    ...(existing.metadata?.imports || []),
+                    importSource
+                ],
+                // Merge VFS paths
+                vfsPaths: [
+                    ...(existing.metadata?.vfsPaths || [existing.metadata?.vfsPath]).filter(Boolean),
+                    candidate.metadata?.vfsPath
+                ].filter(Boolean),
+                // Update confidence (weighted average)
+                confidence: this.mergeConfidence(existing.metadata?.confidence || 0.5, candidate.confidence),
+                // Merge other metadata
+                ...this.mergeMetadataFields(existing.metadata, candidate.metadata),
+                // Track last update
+                lastUpdated: Date.now(),
+                mergeCount: (existing.metadata?.mergeCount || 0) + 1
+            };
+            // Update entity
+            await this.brain.update({
+                id: existingId,
+                metadata: mergedMetadata,
+                merge: true
+            });
+            return {
+                mergedEntityId: existingId,
+                wasMerged: true,
+                mergedWith: existing.metadata?.name || existingId,
+                confidence: mergedMetadata.confidence,
+                provenance: mergedMetadata.imports
+            };
+        }
+        catch (error) {
+            throw new Error(`Failed to merge entity: ${error instanceof Error ? error.message : String(error)}`);
+        }
+    }
+    /**
+     * Create or merge entity with deduplication
+     */
+    async createOrMerge(candidate, importSource, options = {}) {
+        // Check for duplicates
+        const duplicate = await this.findDuplicates(candidate, options);
+        if (duplicate && duplicate.shouldMerge) {
+            // Merge with existing entity
+            return await this.mergeEntity(duplicate.existingId, candidate, importSource);
+        }
+        // No duplicate found, create new entity
+        const entityId = await this.brain.add({
+            data: candidate.description || candidate.name,
+            type: candidate.type,
+            metadata: {
+                ...candidate.metadata,
+                name: candidate.name,
+                confidence: candidate.confidence,
+                imports: [importSource],
+                vfsPaths: [candidate.metadata?.vfsPath].filter(Boolean),
+                createdAt: Date.now(),
+                mergeCount: 0
+            }
+        });
+        // Update candidate with new ID
+        candidate.id = entityId;
+        return {
+            mergedEntityId: entityId,
+            wasMerged: false,
+            confidence: candidate.confidence,
+            provenance: [importSource]
+        };
+    }
+    /**
+     * Normalize string for comparison
+     */
+    normalizeString(str) {
+        return str
+            .toLowerCase()
+            .trim()
+            .replace(/[^a-z0-9]/g, '');
+    }
+    /**
+     * Check if two names are similar (fuzzy matching)
+     */
+    areSimilarNames(name1, name2) {
+        const n1 = this.normalizeString(name1);
+        const n2 = this.normalizeString(name2);
+        // Exact match
+        if (n1 === n2)
+            return true;
+        // Length difference check
+        const lengthDiff = Math.abs(n1.length - n2.length);
+        if (lengthDiff > 3)
+            return false;
+        // Levenshtein distance
+        const distance = this.levenshteinDistance(n1, n2);
+        const maxLength = Math.max(n1.length, n2.length);
+        const similarity = 1 - (distance / maxLength);
+        return similarity >= 0.85;
+    }
+    /**
+     * Calculate Levenshtein distance between two strings
+     */
+    levenshteinDistance(str1, str2) {
+        const m = str1.length;
+        const n = str2.length;
+        const dp = Array(m + 1).fill(null).map(() => Array(n + 1).fill(0));
+        for (let i = 0; i <= m; i++)
+            dp[i][0] = i;
+        for (let j = 0; j <= n; j++)
+            dp[0][j] = j;
+        for (let i = 1; i <= m; i++) {
+            for (let j = 1; j <= n; j++) {
+                if (str1[i - 1] === str2[j - 1]) {
+                    dp[i][j] = dp[i - 1][j - 1];
+                }
+                else {
+                    dp[i][j] = Math.min(dp[i - 1][j] + 1, // deletion
+                    dp[i][j - 1] + 1, // insertion
+                    dp[i - 1][j - 1] + 1 // substitution
+                    );
+                }
+            }
+        }
+        return dp[m][n];
+    }
+    /**
+     * Merge confidence scores (weighted average favoring higher confidence)
+     */
+    mergeConfidence(existing, incoming) {
+        // Weight higher confidence more heavily
+        const weights = existing > incoming ? [0.6, 0.4] : [0.4, 0.6];
+        return existing * weights[0] + incoming * weights[1];
+    }
+    /**
+     * Merge metadata fields intelligently
+     */
+    mergeMetadataFields(existing, incoming) {
+        const merged = {};
+        // Merge arrays
+        const arrayFields = ['concepts', 'tags', 'categories'];
+        for (const field of arrayFields) {
+            if (existing[field] || incoming[field]) {
+                const combined = [
+                    ...(existing[field] || []),
+                    ...(incoming[field] || [])
+                ];
+                // Deduplicate
+                merged[field] = [...new Set(combined)];
+            }
+        }
+        // Prefer longer descriptions
+        if (existing.description || incoming.description) {
+            merged.description = (existing.description || '').length > (incoming.description || '').length
+                ? existing.description
+                : incoming.description;
+        }
+        return merged;
+    }
+}
+//# sourceMappingURL=EntityDeduplicator.js.map
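As a worked example of the fuzzy-matching math above (a standalone copy of the normalization, for illustration only):

```ts
// Standalone copy of the normalization used by areSimilarNames, for illustration.
const normalize = (s: string) => s.toLowerCase().trim().replace(/[^a-z0-9]/g, '')

// 'Acme Corp.' and 'acme corp' both normalize to 'acmecorp', so findDuplicates
// treats them as an exact name match (similarity reported as 1.0).
console.log(normalize('Acme Corp.') === normalize('acme corp')) // true

// 'acmecorp' vs 'acmecorps': Levenshtein distance 1, max length 9,
// similarity = 1 - 1/9 ≈ 0.889, which clears the 0.85 fuzzy-match threshold.
```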
package/dist/import/FormatDetector.d.ts
ADDED
@@ -0,0 +1,65 @@
+/**
+ * Format Detector
+ *
+ * Unified format detection for all import types using:
+ * - Magic byte signatures (PDF, Excel, images)
+ * - File extensions
+ * - Content analysis (JSON, Markdown, CSV)
+ *
+ * NO MOCKS - Production-ready implementation
+ */
+export type SupportedFormat = 'excel' | 'pdf' | 'csv' | 'json' | 'markdown';
+export interface DetectionResult {
+    format: SupportedFormat;
+    confidence: number;
+    evidence: string[];
+}
+/**
+ * FormatDetector - Detect file format from various inputs
+ */
+export declare class FormatDetector {
+    /**
+     * Detect format from buffer
+     */
+    detectFromBuffer(buffer: Buffer): DetectionResult | null;
+    /**
+     * Detect format from file path
+     */
+    detectFromPath(path: string): DetectionResult | null;
+    /**
+     * Detect format from string content
+     */
+    detectFromString(content: string): DetectionResult | null;
+    /**
+     * Detect format from object
+     */
+    detectFromObject(obj: any): DetectionResult | null;
+    /**
+     * Detect by magic bytes
+     */
+    private detectByMagicBytes;
+    /**
+     * Detect by content analysis
+     */
+    private detectByContent;
+    /**
+     * Check if content looks like JSON
+     */
+    private looksLikeJSON;
+    /**
+     * Check if content looks like Markdown
+     */
+    private looksLikeMarkdown;
+    /**
+     * Check if content looks like CSV
+     */
+    private looksLikeCSV;
+    /**
+     * Check if content is text-based (not binary)
+     */
+    private isTextContent;
+    /**
+     * Get file extension from path
+     */
+    private getExtension;
+}