@soulcraft/brainy 3.43.3 → 3.44.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/README.md +36 -4
- package/dist/graph/graphAdjacencyIndex.d.ts +23 -22
- package/dist/graph/graphAdjacencyIndex.js +106 -121
- package/dist/graph/lsm/BloomFilter.d.ts +188 -0
- package/dist/graph/lsm/BloomFilter.js +278 -0
- package/dist/graph/lsm/LSMTree.d.ts +168 -0
- package/dist/graph/lsm/LSMTree.js +443 -0
- package/dist/graph/lsm/SSTable.d.ts +228 -0
- package/dist/graph/lsm/SSTable.js +290 -0
- package/dist/storage/storageFactory.js +9 -6
- package/package.json +2 -1
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SSTable - Sorted String Table for LSM-Tree
|
|
3
|
+
*
|
|
4
|
+
* Production-grade sorted file format for storing graph relationships:
|
|
5
|
+
* - Binary format using MessagePack (50-70% smaller than JSON)
|
|
6
|
+
* - Sorted by sourceId for O(log n) binary search
|
|
7
|
+
* - Bloom filter for fast negative lookups (90% disk I/O reduction)
|
|
8
|
+
* - Zone maps (min/max keys) for file skipping
|
|
9
|
+
* - Immutable after creation (LSM-tree property)
|
|
10
|
+
*
|
|
11
|
+
* File Structure:
|
|
12
|
+
* - Header: version, metadata, bloom filter, zone map
|
|
13
|
+
* - Data: sorted array of [sourceId, targetIds[]]
|
|
14
|
+
* - Footer: checksum, stats
|
|
15
|
+
*/
|
|
16
|
+
import { createHash } from 'node:crypto';
import { encode, decode } from '@msgpack/msgpack';
import { BloomFilter } from './BloomFilter.js';
|
|
18
|
+
/**
|
|
19
|
+
* SSTable - Immutable sorted file for LSM-tree
|
|
20
|
+
*
|
|
21
|
+
* Key Properties:
|
|
22
|
+
* - Immutable: Never modified after creation
|
|
23
|
+
* - Sorted: Entries sorted by sourceId for binary search
|
|
24
|
+
* - Filtered: Bloom filter for fast negative lookups
|
|
25
|
+
* - Zoned: Min/max keys for file skipping
|
|
26
|
+
* - Compact: MessagePack binary format
|
|
27
|
+
*
|
|
28
|
+
* Typical Usage:
|
|
29
|
+
* 1. Create from MemTable entries
|
|
30
|
+
* 2. Serialize and store via StorageAdapter
|
|
31
|
+
* 3. Load from storage when needed
|
|
32
|
+
* 4. Query with binary search
|
|
33
|
+
* 5. Eventually merge via compaction
|
|
34
|
+
*/
|
|
35
|
+
export class SSTable {
    /**
     * Create a new SSTable from entries.
     *
     * The input array is copied before sorting, so the caller's array (or the
     * entry array of another SSTable, as in a single-table merge) is never
     * reordered or shared with this instance.
     *
     * @param entries Unsorted entries of shape { sourceId, targets, count }
     * @param level Compaction level
     * @param id Unique ID for this SSTable (generated when omitted or empty)
     */
    constructor(entries, level = 0, id) {
        // Sort a private copy; ordering must match isInRange() and get().
        this.entries = [...entries].sort((a, b) => SSTable.compareSourceIds(a.sourceId, b.sourceId));
        const entryCount = this.entries.length;
        // Calculate statistics
        const relationshipCount = this.entries.reduce((sum, entry) => sum + entry.count, 0);
        // Create bloom filter for all sourceIds (1% false positive rate)
        this.bloomFilter = BloomFilter.createOptimal(entryCount, 0.01);
        for (const entry of this.entries) {
            this.bloomFilter.add(entry.sourceId);
        }
        // Build metadata; min/max form the zone map used by isInRange()
        this.metadata = {
            version: SSTable.VERSION,
            id: id || this.generateId(),
            level,
            createdAt: Date.now(),
            entryCount,
            relationshipCount,
            minSourceId: entryCount > 0 ? this.entries[0].sourceId : '',
            maxSourceId: entryCount > 0 ? this.entries[entryCount - 1].sourceId : '',
            sizeBytes: 0, // Will be set during serialization
            compressed: false
        };
    }
    /**
     * Total order used for sorting, binary search and the zone map.
     *
     * Uses the same UTF-16 code-unit ordering as the `<` / `>` operators.
     * `localeCompare` must NOT be used here: it orders e.g. 'a' before 'B'
     * while `'a' <= 'B'` is false, which would make isInRange() reject keys
     * that are actually stored in the table.
     *
     * @param a First source ID
     * @param b Second source ID
     * @returns Negative, zero or positive like a sort comparator
     */
    static compareSourceIds(a, b) {
        if (a < b) {
            return -1;
        }
        return a > b ? 1 : 0;
    }
    /**
     * Generate a unique ID for this SSTable
     */
    generateId() {
        return `sstable-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`;
    }
    /**
     * Check if a sourceId might be in this SSTable (using bloom filter)
     * @param sourceId The source ID to check
     * @returns true if might be present (false positives possible), false if definitely not present
     */
    mightContain(sourceId) {
        return this.bloomFilter.contains(sourceId);
    }
    /**
     * Check if a sourceId is in the valid range for this SSTable (zone map)
     * @param sourceId The source ID to check
     * @returns true if in range, false otherwise (always false for an empty table)
     */
    isInRange(sourceId) {
        if (this.metadata.entryCount === 0) {
            return false;
        }
        return (SSTable.compareSourceIds(sourceId, this.metadata.minSourceId) >= 0 &&
            SSTable.compareSourceIds(sourceId, this.metadata.maxSourceId) <= 0);
    }
    /**
     * Get targets for a sourceId using binary search
     * @param sourceId The source ID to query
     * @returns Array of target IDs, or null if not found
     */
    get(sourceId) {
        // Cheap rejections first: zone map, then bloom filter
        if (!this.isInRange(sourceId) || !this.mightContain(sourceId)) {
            return null;
        }
        // Binary search in sorted entries
        let left = 0;
        let right = this.entries.length - 1;
        while (left <= right) {
            const mid = (left + right) >>> 1;
            const entry = this.entries[mid];
            const cmp = SSTable.compareSourceIds(entry.sourceId, sourceId);
            if (cmp === 0) {
                return entry.targets;
            }
            if (cmp < 0) {
                left = mid + 1;
            }
            else {
                right = mid - 1;
            }
        }
        // Not found (bloom filter false positive)
        return null;
    }
    /**
     * Get all entries in this SSTable (sorted by sourceId).
     * Used for compaction and merging; callers must treat the array as read-only.
     */
    getEntries() {
        return this.entries;
    }
    /**
     * Get number of entries
     */
    size() {
        return this.entries.length;
    }
    /**
     * Serialize SSTable to binary format using MessagePack
     *
     * Side effect: records the encoded size in `metadata.sizeBytes`. The
     * metadata embedded in the payload was captured before encoding, so
     * deserialize() re-derives the size from the payload itself.
     *
     * @returns Uint8Array of serialized data
     */
    serialize() {
        const serialized = encode({
            metadata: this.metadata,
            entries: this.entries,
            bloomFilter: this.bloomFilter.serialize(),
            checksum: this.calculateChecksum(this.entries)
        });
        this.metadata.sizeBytes = serialized.length;
        return serialized;
    }
    /**
     * Calculate checksum for data integrity
     *
     * SHA-256 over every sourceId followed by its targets, in entry order.
     * The hashed byte stream is part of the stored format, so it must not
     * change (no separators are added) or existing files fail verification.
     *
     * @param entries Entries to hash, in the order they are stored
     * @returns Hex-encoded SHA-256 digest
     */
    calculateChecksum(entries) {
        // `createHash` comes from the module-level import: `require` does not
        // exist in ES modules, so a function-scope require('crypto') throws.
        const hash = createHash('sha256');
        for (const entry of entries) {
            hash.update(entry.sourceId);
            for (const target of entry.targets) {
                hash.update(target);
            }
        }
        return hash.digest('hex');
    }
    /**
     * Deserialize SSTable from binary format
     * @param data Serialized SSTable data
     * @returns SSTable instance
     * @throws Error if the payload is structurally invalid or its checksum does not match
     */
    static deserialize(data) {
        const decoded = decode(data);
        if (decoded === null ||
            typeof decoded !== 'object' ||
            !Array.isArray(decoded.entries) ||
            decoded.metadata === null ||
            typeof decoded.metadata !== 'object') {
            throw new Error('Invalid SSTable data: missing entries or metadata');
        }
        const sstable = new SSTable(decoded.entries, decoded.metadata.level, decoded.metadata.id);
        // Verify checksum over the entries in their stored order (the order
        // the writer hashed), not the freshly re-sorted in-memory order.
        const calculatedChecksum = sstable.calculateChecksum(decoded.entries);
        if (calculatedChecksum !== decoded.checksum) {
            throw new Error(`SSTable checksum mismatch: expected ${decoded.checksum}, got ${calculatedChecksum}`);
        }
        // Restore metadata, but keep the zone map derived from the entries
        // as sorted by this version so range checks agree with get().
        const { entryCount, minSourceId, maxSourceId } = sstable.metadata;
        Object.assign(sstable.metadata, decoded.metadata, { entryCount, minSourceId, maxSourceId });
        // The embedded sizeBytes predates encoding; use the real payload size.
        if (typeof data.byteLength === 'number') {
            sstable.metadata.sizeBytes = data.byteLength;
        }
        else if (typeof data.length === 'number') {
            sstable.metadata.sizeBytes = data.length;
        }
        // Restore the stored bloom filter; if the payload carries none, the
        // filter rebuilt by the constructor is kept.
        if (decoded.bloomFilter !== undefined && decoded.bloomFilter !== null) {
            sstable.bloomFilter = BloomFilter.deserialize(decoded.bloomFilter);
        }
        return sstable;
    }
    /**
     * Merge multiple SSTables into a single sorted SSTable
     * Used during compaction to combine multiple files
     *
     * Targets of the same sourceId are unioned (duplicates removed).
     *
     * @param sstables Array of SSTables to merge
     * @param targetLevel Target compaction level
     * @returns New merged SSTable
     * @throws Error if `sstables` is empty
     */
    static merge(sstables, targetLevel) {
        if (sstables.length === 0) {
            throw new Error('Cannot merge zero SSTables');
        }
        if (sstables.length === 1) {
            // Nothing to merge, just update level (constructor copies the array)
            return new SSTable(sstables[0].getEntries(), targetLevel);
        }
        // Collect all targets per sourceId across all SSTables
        const allEntries = new Map();
        for (const sstable of sstables) {
            for (const entry of sstable.getEntries()) {
                let targets = allEntries.get(entry.sourceId);
                if (targets === undefined) {
                    targets = new Set();
                    allEntries.set(entry.sourceId, targets);
                }
                for (const target of entry.targets) {
                    targets.add(target);
                }
            }
        }
        // Convert back to SSTableEntry format
        const mergedEntries = [];
        allEntries.forEach((targets, sourceId) => {
            mergedEntries.push({
                sourceId,
                targets: Array.from(targets),
                count: targets.size
            });
        });
        // Create new merged SSTable (will be sorted in constructor)
        return new SSTable(mergedEntries, targetLevel);
    }
    /**
     * Get statistics about this SSTable
     */
    getStats() {
        return {
            id: this.metadata.id,
            level: this.metadata.level,
            entries: this.metadata.entryCount,
            relationships: this.metadata.relationshipCount,
            sizeBytes: this.metadata.sizeBytes,
            minSourceId: this.metadata.minSourceId,
            maxSourceId: this.metadata.maxSourceId,
            bloomFilterStats: this.bloomFilter.getStats()
        };
    }
    /**
     * Create an SSTable from a Map of sourceId → targets
     * Convenience method for creating from MemTable
     *
     * @param sourceMap Map of sourceId to Set of targetIds
     * @param level Compaction level
     * @returns New SSTable
     */
    static fromMap(sourceMap, level = 0) {
        const entries = [];
        sourceMap.forEach((targets, sourceId) => {
            entries.push({
                sourceId,
                targets: Array.from(targets),
                count: targets.size
            });
        });
        return new SSTable(entries, level);
    }
    /**
     * Estimate memory usage of this SSTable when loaded
     * @returns Estimated bytes (rough heuristic, not a measurement)
     */
    estimateMemoryUsage() {
        // Rough estimate for metadata object
        let bytes = 500;
        for (const entry of this.entries) {
            bytes += entry.sourceId.length * 2; // UTF-16 encoding
            bytes += entry.targets.length * 40; // ~40 bytes per UUID string
            bytes += 50; // Entry object overhead
        }
        // Bloom filter
        bytes += this.bloomFilter.getStats().memoryBytes;
        return bytes;
    }
}
|
|
286
|
+
/**
|
|
287
|
+
* Current format version
|
|
288
|
+
*/
|
|
289
|
+
SSTable.VERSION = 1;
|
|
290
|
+
//# sourceMappingURL=SSTable.js.map
|
|
@@ -38,10 +38,11 @@ export async function createStorage(options = {}) {
|
|
|
38
38
|
console.warn('FileSystemStorage is not available in browser environments, falling back to memory storage');
|
|
39
39
|
return new MemoryStorage();
|
|
40
40
|
}
|
|
41
|
-
|
|
41
|
+
const fsPath = getFileSystemPath(options);
|
|
42
|
+
console.log(`Using file system storage (forced): ${fsPath}`);
|
|
42
43
|
try {
|
|
43
44
|
const { FileSystemStorage } = await import('./adapters/fileSystemStorage.js');
|
|
44
|
-
return new FileSystemStorage(
|
|
45
|
+
return new FileSystemStorage(fsPath);
|
|
45
46
|
}
|
|
46
47
|
catch (error) {
|
|
47
48
|
console.warn('Failed to load FileSystemStorage, falling back to memory storage:', error);
|
|
@@ -77,10 +78,11 @@ export async function createStorage(options = {}) {
|
|
|
77
78
|
console.warn('FileSystemStorage is not available in browser environments, falling back to memory storage');
|
|
78
79
|
return new MemoryStorage();
|
|
79
80
|
}
|
|
80
|
-
|
|
81
|
+
const fsPath = getFileSystemPath(options);
|
|
82
|
+
console.log(`Using file system storage: ${fsPath}`);
|
|
81
83
|
try {
|
|
82
84
|
const { FileSystemStorage } = await import('./adapters/fileSystemStorage.js');
|
|
83
|
-
return new FileSystemStorage(
|
|
85
|
+
return new FileSystemStorage(fsPath);
|
|
84
86
|
}
|
|
85
87
|
catch (error) {
|
|
86
88
|
console.warn('Failed to load FileSystemStorage, falling back to memory storage:', error);
|
|
@@ -230,10 +232,11 @@ export async function createStorage(options = {}) {
|
|
|
230
232
|
if (typeof process !== 'undefined' &&
|
|
231
233
|
process.versions &&
|
|
232
234
|
process.versions.node) {
|
|
233
|
-
|
|
235
|
+
const fsPath = getFileSystemPath(options);
|
|
236
|
+
console.log(`Using file system storage (auto-detected): ${fsPath}`);
|
|
234
237
|
try {
|
|
235
238
|
const { FileSystemStorage } = await import('./adapters/fileSystemStorage.js');
|
|
236
|
-
return new FileSystemStorage(
|
|
239
|
+
return new FileSystemStorage(fsPath);
|
|
237
240
|
}
|
|
238
241
|
catch (fsError) {
|
|
239
242
|
console.warn('Failed to load FileSystemStorage, falling back to memory storage:', fsError);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@soulcraft/brainy",
|
|
3
|
-
"version": "3.43.3",
|
|
3
|
+
"version": "3.44.0",
|
|
4
4
|
"description": "Universal Knowledge Protocol™ - World's first Triple Intelligence database unifying vector, graph, and document search in one API. 31 nouns × 40 verbs for infinite expressiveness.",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"module": "dist/index.js",
|
|
@@ -162,6 +162,7 @@
|
|
|
162
162
|
"@aws-sdk/client-s3": "^3.540.0",
|
|
163
163
|
"@google-cloud/storage": "^7.14.0",
|
|
164
164
|
"@huggingface/transformers": "^3.7.2",
|
|
165
|
+
"@msgpack/msgpack": "^3.1.2",
|
|
165
166
|
"boxen": "^8.0.1",
|
|
166
167
|
"chalk": "^5.3.0",
|
|
167
168
|
"chardet": "^2.0.0",
|