@soulcraft/brainy 3.25.2 → 3.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/dist/storage/adapters/fileSystemStorage.js +7 -2
- package/dist/storage/adapters/opfsStorage.js +174 -85
- package/dist/storage/adapters/s3CompatibleStorage.d.ts +43 -5
- package/dist/storage/adapters/s3CompatibleStorage.js +191 -86
- package/dist/storage/sharding.d.ts +103 -0
- package/dist/storage/sharding.js +137 -0
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
@@ -2,6 +2,18 @@
 
 All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines.
 
+## [3.26.0](https://github.com/soulcraftlabs/brainy/compare/v3.25.2...v3.26.0) (2025-10-08)
+
+
+### ⚠ BREAKING CHANGES
+
+* Requires data migration for existing S3/GCS/R2/OpFS deployments.
+See .strategy/UNIFIED-UUID-SHARDING.md for migration guidance.
+
+### 🐛 Bug Fixes
+
+* implement unified UUID-based sharding for metadata across all storage adapters ([2f33571](https://github.com/soulcraftlabs/brainy/commit/2f3357132d06c70cd74532d22cbfbf6abb92903a))
+
 ### [3.25.2](https://github.com/soulcraftlabs/brainy/compare/v3.25.1...v3.25.2) (2025-10-08)
 
 
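The breaking change above comes down to a new key layout: every record now lives under a two-character shard prefix derived from its UUID. The sketch below is not part of the package; `shardIdOf` and `newNounKey` are illustrative names, using the path prefixes that appear later in this diff, to show where an existing unsharded object ends up.

```typescript
// Sketch only: illustrates the 3.25.x -> 3.26.0 key layout change.
// shardIdOf/newNounKey are illustrative names, not exports of the package.

// UUID-prefix sharding rule used throughout 3.26.0: first 2 hex characters.
function shardIdOf(uuid: string): string {
  return uuid.toLowerCase().replace(/-/g, '').substring(0, 2)
}

// 3.25.x layout (unsharded): <nounPrefix><uuid>.json
// 3.26.0 layout (sharded):   <nounPrefix><shard>/<uuid>.json
function newNounKey(nounPrefix: string, id: string): string {
  return `${nounPrefix}${shardIdOf(id)}/${id}.json`
}

console.log(newNounKey('entities/nouns/vectors/', 'ab123456-1234-5678-9abc-def012345678'))
// -> entities/nouns/vectors/ab/ab123456-1234-5678-9abc-def012345678.json
```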
package/dist/storage/adapters/fileSystemStorage.js
CHANGED
@@ -510,7 +510,11 @@ export class FileSystemStorage extends BaseStorage {
      */
     async saveNounMetadata_internal(id, metadata) {
         await this.ensureInitialized();
-
+        // Use UUID-based sharding for metadata (consistent with noun vectors)
+        const filePath = this.getShardedPath(this.nounMetadataDir, id);
+        // Ensure shard directory exists
+        const shardDir = path.dirname(filePath);
+        await fs.promises.mkdir(shardDir, { recursive: true });
         await fs.promises.writeFile(filePath, JSON.stringify(metadata, null, 2));
     }
     /**
@@ -518,7 +522,8 @@ export class FileSystemStorage extends BaseStorage {
      */
     async getNounMetadata(id) {
         await this.ensureInitialized();
-
+        // Use UUID-based sharding for metadata (consistent with noun vectors)
+        const filePath = this.getShardedPath(this.nounMetadataDir, id);
         try {
             const data = await fs.promises.readFile(filePath, 'utf-8');
             return JSON.parse(data);
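The filesystem adapter routes metadata paths through `this.getShardedPath(...)`, whose implementation is not included in this diff. As a rough sketch, assuming it follows the same first-two-hex-characters rule as `getShardIdFromUuid` in the new sharding module, it would look something like this:

```typescript
import * as path from 'path'

// Hypothetical stand-in for FileSystemStorage.getShardedPath(), which the
// diff above calls but does not show. Assumes the same first-2-hex-chars
// rule as getShardIdFromUuid() in the new sharding module.
function getShardedPath(baseDir: string, id: string): string {
  const shardId = id.toLowerCase().replace(/-/g, '').substring(0, 2)
  return path.join(baseDir, shardId, `${id}.json`)
}

// e.g. getShardedPath('/data/noun-metadata', 'ab123456-...') ->
//   /data/noun-metadata/ab/ab123456-....json
// The adapter then creates the shard directory with fs.mkdir({ recursive: true }).
```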
package/dist/storage/adapters/opfsStorage.js
CHANGED
@@ -3,6 +3,7 @@
  * Provides persistent storage for the vector database using the Origin Private File System API
  */
 import { BaseStorage, NOUNS_DIR, VERBS_DIR, METADATA_DIR, NOUN_METADATA_DIR, VERB_METADATA_DIR, INDEX_DIR } from '../baseStorage.js';
+import { getShardIdFromUuid } from '../sharding.js';
 import '../../types/fileSystemTypes.js';
 /**
  * Helper function to safely get a file from a FileSystemHandle
@@ -145,8 +146,14 @@ export class OPFSStorage extends BaseStorage {
             ...noun,
             connections: this.mapToObject(noun.connections, (set) => Array.from(set))
         };
-        //
-        const
+        // Use UUID-based sharding for nouns
+        const shardId = getShardIdFromUuid(noun.id);
+        // Get or create the shard directory
+        const shardDir = await this.nounsDir.getDirectoryHandle(shardId, {
+            create: true
+        });
+        // Create or get the file in the shard directory
+        const fileHandle = await shardDir.getFileHandle(`${noun.id}.json`, {
             create: true
         });
         // Write the noun data to the file
@@ -165,8 +172,12 @@ export class OPFSStorage extends BaseStorage {
     async getNoun_internal(id) {
         await this.ensureInitialized();
         try {
-            //
-            const
+            // Use UUID-based sharding for nouns
+            const shardId = getShardIdFromUuid(id);
+            // Get the shard directory
+            const shardDir = await this.nounsDir.getDirectoryHandle(shardId);
+            // Get the file handle from the shard directory
+            const fileHandle = await shardDir.getFileHandle(`${id}.json`);
             // Read the noun data from the file
             const file = await fileHandle.getFile();
             const text = await file.text();
@@ -205,34 +216,40 @@ export class OPFSStorage extends BaseStorage {
         await this.ensureInitialized();
         const nodes = [];
         try {
-            // Iterate through all
-            for await (const [
-            if (
-
-
-
-
-
-
-
-
-
-
-
-
-
+            // Iterate through all shard directories
+            for await (const [shardName, shardHandle] of this.nounsDir.entries()) {
+                if (shardHandle.kind === 'directory') {
+                    const shardDir = shardHandle;
+                    // Iterate through all files in this shard
+                    for await (const [fileName, fileHandle] of shardDir.entries()) {
+                        if (fileHandle.kind === 'file') {
+                            try {
+                                // Read the node data from the file
+                                const file = await safeGetFile(fileHandle);
+                                const text = await file.text();
+                                const data = JSON.parse(text);
+                                // Get the metadata to check the noun type
+                                const metadata = await this.getMetadata(data.id);
+                                // Include the node if its noun type matches the requested type
+                                if (metadata && metadata.noun === nounType) {
+                                    // Convert serialized connections back to Map<number, Set<string>>
+                                    const connections = new Map();
+                                    for (const [level, nodeIds] of Object.entries(data.connections)) {
+                                        connections.set(Number(level), new Set(nodeIds));
+                                    }
+                                    nodes.push({
+                                        id: data.id,
+                                        vector: data.vector,
+                                        connections,
+                                        level: data.level || 0
+                                    });
+                                }
+                            }
+                            catch (error) {
+                                console.error(`Error reading node file ${shardName}/${fileName}:`, error);
                             }
-            nodes.push({
-                id: data.id,
-                vector: data.vector,
-                connections,
-                level: data.level || 0
-            });
                         }
                     }
-            catch (error) {
-                console.error(`Error reading node file ${name}:`, error);
-            }
                 }
             }
         }
@@ -253,7 +270,12 @@ export class OPFSStorage extends BaseStorage {
     async deleteNode(id) {
         await this.ensureInitialized();
         try {
-
+            // Use UUID-based sharding for nouns
+            const shardId = getShardIdFromUuid(id);
+            // Get the shard directory
+            const shardDir = await this.nounsDir.getDirectoryHandle(shardId);
+            // Delete the file from the shard directory
+            await shardDir.removeEntry(`${id}.json`);
         }
         catch (error) {
             // Ignore NotFoundError, which means the file doesn't exist
@@ -280,8 +302,14 @@ export class OPFSStorage extends BaseStorage {
             ...edge,
             connections: this.mapToObject(edge.connections, (set) => Array.from(set))
         };
-        //
-        const
+        // Use UUID-based sharding for verbs
+        const shardId = getShardIdFromUuid(edge.id);
+        // Get or create the shard directory
+        const shardDir = await this.verbsDir.getDirectoryHandle(shardId, {
+            create: true
+        });
+        // Create or get the file in the shard directory
+        const fileHandle = await shardDir.getFileHandle(`${edge.id}.json`, {
             create: true
         });
         // Write the verb data to the file
@@ -306,8 +334,12 @@ export class OPFSStorage extends BaseStorage {
     async getEdge(id) {
         await this.ensureInitialized();
         try {
-            //
-            const
+            // Use UUID-based sharding for verbs
+            const shardId = getShardIdFromUuid(id);
+            // Get the shard directory
+            const shardDir = await this.verbsDir.getDirectoryHandle(shardId);
+            // Get the file handle from the shard directory
+            const fileHandle = await shardDir.getFileHandle(`${id}.json`);
             // Read the edge data from the file
             const file = await fileHandle.getFile();
             const text = await file.text();
@@ -345,37 +377,43 @@ export class OPFSStorage extends BaseStorage {
         await this.ensureInitialized();
         const allEdges = [];
         try {
-            // Iterate through all
-            for await (const [
-            if (
-
-
-
-
-
-
-
-
-
+            // Iterate through all shard directories
+            for await (const [shardName, shardHandle] of this.verbsDir.entries()) {
+                if (shardHandle.kind === 'directory') {
+                    const shardDir = shardHandle;
+                    // Iterate through all files in this shard
+                    for await (const [fileName, fileHandle] of shardDir.entries()) {
+                        if (fileHandle.kind === 'file') {
+                            try {
+                                // Read the edge data from the file
+                                const file = await safeGetFile(fileHandle);
+                                const text = await file.text();
+                                const data = JSON.parse(text);
+                                // Convert serialized connections back to Map<number, Set<string>>
+                                const connections = new Map();
+                                for (const [level, nodeIds] of Object.entries(data.connections)) {
+                                    connections.set(Number(level), new Set(nodeIds));
+                                }
+                                // Create default timestamp if not present
+                                const defaultTimestamp = {
+                                    seconds: Math.floor(Date.now() / 1000),
+                                    nanoseconds: (Date.now() % 1000) * 1000000
+                                };
+                                // Create default createdBy if not present
+                                const defaultCreatedBy = {
+                                    augmentation: 'unknown',
+                                    version: '1.0'
+                                };
+                                allEdges.push({
+                                    id: data.id,
+                                    vector: data.vector,
+                                    connections
+                                });
+                            }
+                            catch (error) {
+                                console.error(`Error reading edge file ${shardName}/${fileName}:`, error);
+                            }
                         }
-            // Create default timestamp if not present
-            const defaultTimestamp = {
-                seconds: Math.floor(Date.now() / 1000),
-                nanoseconds: (Date.now() % 1000) * 1000000
-            };
-            // Create default createdBy if not present
-            const defaultCreatedBy = {
-                augmentation: 'unknown',
-                version: '1.0'
-            };
-            allEdges.push({
-                id: data.id,
-                vector: data.vector,
-                connections
-            });
-            }
-            catch (error) {
-                console.error(`Error reading edge file ${name}:`, error);
                     }
                 }
             }
@@ -457,7 +495,12 @@ export class OPFSStorage extends BaseStorage {
     async deleteEdge(id) {
         await this.ensureInitialized();
         try {
-
+            // Use UUID-based sharding for verbs
+            const shardId = getShardIdFromUuid(id);
+            // Get the shard directory
+            const shardDir = await this.verbsDir.getDirectoryHandle(shardId);
+            // Delete the file from the shard directory
+            await shardDir.removeEntry(`${id}.json`);
         }
         catch (error) {
             // Ignore NotFoundError, which means the file doesn't exist
@@ -542,8 +585,13 @@ export class OPFSStorage extends BaseStorage {
      */
     async saveVerbMetadata_internal(id, metadata) {
         await this.ensureInitialized();
+        // Use UUID-based sharding for metadata (consistent with verb vectors)
+        const shardId = getShardIdFromUuid(id);
+        // Get or create the shard directory
+        const shardDir = await this.verbMetadataDir.getDirectoryHandle(shardId, { create: true });
+        // Create or get the file in the shard directory
         const fileName = `${id}.json`;
-        const fileHandle = await
+        const fileHandle = await shardDir.getFileHandle(fileName, { create: true });
         const writable = await fileHandle.createWritable();
         await writable.write(JSON.stringify(metadata, null, 2));
         await writable.close();
@@ -553,9 +601,14 @@ export class OPFSStorage extends BaseStorage {
      */
     async getVerbMetadata(id) {
         await this.ensureInitialized();
+        // Use UUID-based sharding for metadata (consistent with verb vectors)
+        const shardId = getShardIdFromUuid(id);
         const fileName = `${id}.json`;
         try {
-
+            // Get the shard directory
+            const shardDir = await this.verbMetadataDir.getDirectoryHandle(shardId);
+            // Get the file from the shard directory
+            const fileHandle = await shardDir.getFileHandle(fileName);
             const file = await safeGetFile(fileHandle);
             const text = await file.text();
             return JSON.parse(text);
@@ -572,8 +625,13 @@ export class OPFSStorage extends BaseStorage {
      */
     async saveNounMetadata_internal(id, metadata) {
         await this.ensureInitialized();
+        // Use UUID-based sharding for metadata (consistent with noun vectors)
+        const shardId = getShardIdFromUuid(id);
+        // Get or create the shard directory
+        const shardDir = await this.nounMetadataDir.getDirectoryHandle(shardId, { create: true });
+        // Create or get the file in the shard directory
         const fileName = `${id}.json`;
-        const fileHandle = await
+        const fileHandle = await shardDir.getFileHandle(fileName, { create: true });
         const writable = await fileHandle.createWritable();
         await writable.write(JSON.stringify(metadata, null, 2));
         await writable.close();
@@ -583,9 +641,14 @@ export class OPFSStorage extends BaseStorage {
      */
     async getNounMetadata(id) {
         await this.ensureInitialized();
+        // Use UUID-based sharding for metadata (consistent with noun vectors)
+        const shardId = getShardIdFromUuid(id);
         const fileName = `${id}.json`;
         try {
-
+            // Get the shard directory
+            const shardDir = await this.nounMetadataDir.getDirectoryHandle(shardId);
+            // Get the file from the shard directory
+            const fileHandle = await shardDir.getFileHandle(fileName);
             const file = await safeGetFile(fileHandle);
             const text = await file.text();
             return JSON.parse(text);
@@ -1117,12 +1180,19 @@ export class OPFSStorage extends BaseStorage {
         await this.ensureInitialized();
         const limit = options.limit || 100;
         const cursor = options.cursor;
-        // Get all noun files
+        // Get all noun files from all shards
         const nounFiles = [];
         if (this.nounsDir) {
-
-
-
+            // Iterate through all shard directories
+            for await (const [shardName, shardHandle] of this.nounsDir.entries()) {
+                if (shardHandle.kind === 'directory') {
+                    // Iterate through files in this shard
+                    const shardDir = shardHandle;
+                    for await (const [fileName, fileHandle] of shardDir.entries()) {
+                        if (fileHandle.kind === 'file' && fileName.endsWith('.json')) {
+                            nounFiles.push(`${shardName}/${fileName}`);
+                        }
+                    }
                 }
             }
         }
@@ -1141,7 +1211,8 @@ export class OPFSStorage extends BaseStorage {
         // Load nouns from files
         const items = [];
         for (const fileName of pageFiles) {
-
+            // fileName is in format "shard/uuid.json", extract just the UUID
+            const id = fileName.split('/')[1].replace('.json', '');
             const noun = await this.getNoun_internal(id);
             if (noun) {
                 // Apply filters if provided
@@ -1205,12 +1276,19 @@ export class OPFSStorage extends BaseStorage {
         await this.ensureInitialized();
         const limit = options.limit || 100;
         const cursor = options.cursor;
-        // Get all verb files
+        // Get all verb files from all shards
         const verbFiles = [];
         if (this.verbsDir) {
-
-
-
+            // Iterate through all shard directories
+            for await (const [shardName, shardHandle] of this.verbsDir.entries()) {
+                if (shardHandle.kind === 'directory') {
+                    // Iterate through files in this shard
+                    const shardDir = shardHandle;
+                    for await (const [fileName, fileHandle] of shardDir.entries()) {
+                        if (fileHandle.kind === 'file' && fileName.endsWith('.json')) {
+                            verbFiles.push(`${shardName}/${fileName}`);
+                        }
+                    }
                 }
             }
         }
@@ -1229,7 +1307,8 @@ export class OPFSStorage extends BaseStorage {
         // Load verbs from files and convert to GraphVerb
         const items = [];
         for (const fileName of pageFiles) {
-
+            // fileName is in format "shard/uuid.json", extract just the UUID
+            const id = fileName.split('/')[1].replace('.json', '');
             const hnswVerb = await this.getVerb_internal(id);
             if (hnswVerb) {
                 // Convert HNSWVerb to GraphVerb
@@ -1330,16 +1409,26 @@ export class OPFSStorage extends BaseStorage {
      */
     async initializeCountsFromScan() {
         try {
-            // Count nouns
+            // Count nouns across all shards
             let nounCount = 0;
-            for await (const [,] of this.nounsDir.entries()) {
-
+            for await (const [shardName, shardHandle] of this.nounsDir.entries()) {
+                if (shardHandle.kind === 'directory') {
+                    const shardDir = shardHandle;
+                    for await (const [,] of shardDir.entries()) {
+                        nounCount++;
+                    }
+                }
             }
             this.totalNounCount = nounCount;
-            // Count verbs
+            // Count verbs across all shards
             let verbCount = 0;
-            for await (const [,] of this.verbsDir.entries()) {
-
+            for await (const [shardName, shardHandle] of this.verbsDir.entries()) {
+                if (shardHandle.kind === 'directory') {
+                    const shardDir = shardHandle;
+                    for await (const [,] of shardDir.entries()) {
+                        verbCount++;
+                    }
+                }
             }
             this.totalVerbCount = verbCount;
             // Save initial counts
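For orientation, here is a minimal sketch (not part of the package) of walking the sharded OPFS layout that the adapter above now writes, using the same `FileSystemDirectoryHandle.entries()` iteration. The directory name 'nouns' is an assumption made purely for illustration, since the adapter resolves its directories from its own constants.

```typescript
// Sketch only: enumerating the sharded OPFS tree written by OPFSStorage.
// The 'nouns' directory name is assumed for illustration; entries() may need
// the WICG File System Access typings, hence the loose typing here.
async function listAllNounIds(): Promise<string[]> {
  const root = await navigator.storage.getDirectory()
  const nounsDir: any = await root.getDirectoryHandle('nouns')
  const ids: string[] = []
  // One level of 2-hex-char shard directories (00-ff), record files below them.
  for await (const [, shardHandle] of nounsDir.entries()) {
    if (shardHandle.kind !== 'directory') continue
    for await (const [fileName, fileHandle] of shardHandle.entries()) {
      if (fileHandle.kind === 'file' && fileName.endsWith('.json')) {
        ids.push(fileName.replace('.json', ''))
      }
    }
  }
  return ids
}
```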
package/dist/storage/adapters/s3CompatibleStorage.d.ts
CHANGED
@@ -73,7 +73,6 @@ export declare class S3CompatibleStorage extends BaseStorage {
     private nounWriteBuffer;
     private verbWriteBuffer;
     private coordinator?;
-    private shardManager?;
     private cacheSync?;
     private readWriteSeparation?;
     private requestCoalescer;
@@ -112,7 +111,9 @@ export declare class S3CompatibleStorage extends BaseStorage {
     init(): Promise<void>;
     /**
      * Set distributed components for multi-node coordination
-     *
+     *
+     * Note: Sharding is always enabled via UUID-based prefixes (00-ff).
+     * ShardManager is no longer required - sharding is deterministic based on UUID.
      */
     setDistributedComponents(components: {
         coordinator?: any;
@@ -121,11 +122,25 @@ export declare class S3CompatibleStorage extends BaseStorage {
         readWriteSeparation?: any;
     }): void;
     /**
-     * Get the S3 key for a noun
+     * Get the S3 key for a noun using UUID-based sharding
+     *
+     * Uses first 2 hex characters of UUID for consistent sharding.
+     * Path format: entities/nouns/vectors/{shardId}/{uuid}.json
+     *
+     * @example
+     * getNounKey('ab123456-1234-5678-9abc-def012345678')
+     * // returns 'entities/nouns/vectors/ab/ab123456-1234-5678-9abc-def012345678.json'
      */
     private getNounKey;
     /**
-     * Get the S3 key for a verb
+     * Get the S3 key for a verb using UUID-based sharding
+     *
+     * Uses first 2 hex characters of UUID for consistent sharding.
+     * Path format: verbs/{shardId}/{uuid}.json
+     *
+     * @example
+     * getVerbKey('cd987654-4321-8765-cba9-fed543210987')
+     * // returns 'verbs/cd/cd987654-4321-8765-cba9-fed543210987.json'
      */
     private getVerbKey;
     /**
@@ -221,9 +236,23 @@ export declare class S3CompatibleStorage extends BaseStorage {
      */
     protected getAllNodes(): Promise<HNSWNode[]>;
     /**
-     * Get nodes with pagination
+     * Get nodes with pagination using UUID-based sharding
+     *
+     * Iterates through 256 UUID-based shards (00-ff) to retrieve nodes.
+     * Cursor format: "shardIndex:s3ContinuationToken" to support pagination across shards.
+     *
      * @param options Pagination options
      * @returns Promise that resolves to a paginated result of nodes
+     *
+     * @example
+     * // First page
+     * const page1 = await getNodesWithPagination({ limit: 100 })
+     * // page1.nodes contains up to 100 nodes
+     * // page1.nextCursor might be "5:some-s3-token" (currently in shard 05)
+     *
+     * // Next page
+     * const page2 = await getNodesWithPagination({ limit: 100, cursor: page1.nextCursor })
+     * // Continues from where page1 left off
      */
     protected getNodesWithPagination(options?: {
         limit?: number;
@@ -234,6 +263,10 @@ export declare class S3CompatibleStorage extends BaseStorage {
         hasMore: boolean;
         nextCursor?: string;
     }>;
+    /**
+     * Load nodes by IDs efficiently using cache or direct fetch
+     */
+    private loadNodesByIds;
     /**
      * Get nouns by noun type (internal implementation)
      * @param nounType The noun type to filter by
@@ -517,6 +550,11 @@ export declare class S3CompatibleStorage extends BaseStorage {
         hasMore: boolean;
         nextCursor?: string;
     }>;
+    /**
+     * Estimate total noun count by listing objects across all shards
+     * This is more efficient than loading all nouns
+     */
+    private estimateTotalNounCount;
     /**
      * Initialize counts from S3 storage
      */
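The declarations above document the new cross-shard cursor format, "shardIndex:s3ContinuationToken". A minimal sketch of draining every page with it follows; since getNodesWithPagination is protected, this would live in a subclass of S3CompatibleStorage, and the `storage` parameter and `scanAllNodes` name are illustrative.

```typescript
// Sketch only: paging through every node with the "shardIndex:token" cursor.
async function scanAllNodes(storage: {
  getNodesWithPagination(opts: { limit?: number; cursor?: string }): Promise<{
    nodes: unknown[]
    hasMore: boolean
    nextCursor?: string
  }>
}): Promise<number> {
  let cursor: string | undefined
  let seen = 0
  do {
    const page = await storage.getNodesWithPagination({ limit: 500, cursor })
    seen += page.nodes.length
    // The cursor encodes which of the 256 shards the scan is currently in.
    cursor = page.hasMore ? page.nextCursor : undefined
  } while (cursor)
  return seen
}
```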
package/dist/storage/adapters/s3CompatibleStorage.js
CHANGED
@@ -13,6 +13,7 @@ import { getGlobalSocketManager } from '../../utils/adaptiveSocketManager.js';
 import { getGlobalBackpressure } from '../../utils/adaptiveBackpressure.js';
 import { getWriteBuffer } from '../../utils/writeBuffer.js';
 import { getCoalescer } from '../../utils/requestCoalescer.js';
+import { getShardIdFromUuid, getShardIdByIndex, TOTAL_SHARDS } from '../sharding.js';
 // Export R2Storage as an alias for S3CompatibleStorage
 export { S3CompatibleStorage as R2Storage };
 /**
@@ -69,6 +70,8 @@ export class S3CompatibleStorage extends BaseStorage {
         // Write buffers for bulk operations
         this.nounWriteBuffer = null;
         this.verbWriteBuffer = null;
+        // Note: Sharding is always enabled via UUID-based prefixes (00-ff)
+        // ShardManager is no longer used - sharding is deterministic
         // Request coalescer for deduplication
         this.requestCoalescer = null;
         // High-volume mode detection - MUCH more aggressive
@@ -242,17 +245,16 @@ export class S3CompatibleStorage extends BaseStorage {
     }
     /**
      * Set distributed components for multi-node coordination
-     *
+     *
+     * Note: Sharding is always enabled via UUID-based prefixes (00-ff).
+     * ShardManager is no longer required - sharding is deterministic based on UUID.
      */
     setDistributedComponents(components) {
         this.coordinator = components.coordinator;
-        this.shardManager = components.shardManager;
         this.cacheSync = components.cacheSync;
         this.readWriteSeparation = components.readWriteSeparation;
-        //
-
-        console.log(`🎯 S3 Storage: Sharding enabled with ${this.shardManager.config?.shardCount || 64} shards`);
-        }
+        // Note: UUID-based sharding is always active (256 shards: 00-ff)
+        console.log(`🎯 S3 Storage: UUID-based sharding active (256 shards: 00-ff)`);
         if (this.coordinator) {
             console.log(`🤝 S3 Storage: Distributed coordination active (node: ${this.coordinator.nodeId})`);
         }
@@ -264,24 +266,32 @@ export class S3CompatibleStorage extends BaseStorage {
         }
     }
     /**
-     * Get the S3 key for a noun
+     * Get the S3 key for a noun using UUID-based sharding
+     *
+     * Uses first 2 hex characters of UUID for consistent sharding.
+     * Path format: entities/nouns/vectors/{shardId}/{uuid}.json
+     *
+     * @example
+     * getNounKey('ab123456-1234-5678-9abc-def012345678')
+     * // returns 'entities/nouns/vectors/ab/ab123456-1234-5678-9abc-def012345678.json'
      */
     getNounKey(id) {
-
-
-        return `shards/${shardId}/${this.nounPrefix}${id}.json`;
-        }
-        return `${this.nounPrefix}${id}.json`;
+        const shardId = getShardIdFromUuid(id);
+        return `${this.nounPrefix}${shardId}/${id}.json`;
     }
     /**
-     * Get the S3 key for a verb
+     * Get the S3 key for a verb using UUID-based sharding
+     *
+     * Uses first 2 hex characters of UUID for consistent sharding.
+     * Path format: verbs/{shardId}/{uuid}.json
+     *
+     * @example
+     * getVerbKey('cd987654-4321-8765-cba9-fed543210987')
+     * // returns 'verbs/cd/cd987654-4321-8765-cba9-fed543210987.json'
      */
     getVerbKey(id) {
-
-
-        return `shards/${shardId}/${this.verbPrefix}${id}.json`;
-        }
-        return `${this.verbPrefix}${id}.json`;
+        const shardId = getShardIdFromUuid(id);
+        return `${this.verbPrefix}${shardId}/${id}.json`;
     }
     /**
      * Override base class method to detect S3-specific throttling errors
@@ -775,7 +785,8 @@ export class S3CompatibleStorage extends BaseStorage {
         try {
             // Import the GetObjectCommand only when needed
             const { GetObjectCommand } = await import('@aws-sdk/client-s3');
-
+            // Use getNounKey() to properly handle sharding
+            const key = this.getNounKey(id);
             this.logger.trace(`Getting node ${id} from key: ${key}`);
             // Try to get the node from the nouns directory
             const response = await this.s3Client.send(new GetObjectCommand({
@@ -853,86 +864,97 @@ export class S3CompatibleStorage extends BaseStorage {
         }
     }
     /**
-     * Get nodes with pagination
+     * Get nodes with pagination using UUID-based sharding
+     *
+     * Iterates through 256 UUID-based shards (00-ff) to retrieve nodes.
+     * Cursor format: "shardIndex:s3ContinuationToken" to support pagination across shards.
+     *
      * @param options Pagination options
      * @returns Promise that resolves to a paginated result of nodes
+     *
+     * @example
+     * // First page
+     * const page1 = await getNodesWithPagination({ limit: 100 })
+     * // page1.nodes contains up to 100 nodes
+     * // page1.nextCursor might be "5:some-s3-token" (currently in shard 05)
+     *
+     * // Next page
+     * const page2 = await getNodesWithPagination({ limit: 100, cursor: page1.nextCursor })
+     * // Continues from where page1 left off
      */
     async getNodesWithPagination(options = {}) {
         await this.ensureInitialized();
         const limit = options.limit || 100;
         const useCache = options.useCache !== false;
         try {
-            // Import the ListObjectsV2Command and GetObjectCommand only when needed
             const { ListObjectsV2Command } = await import('@aws-sdk/client-s3');
-            // List objects with pagination
-            const listResponse = await this.s3Client.send(new ListObjectsV2Command({
-                Bucket: this.bucketName,
-                Prefix: this.nounPrefix,
-                MaxKeys: limit,
-                ContinuationToken: options.cursor
-            }));
-            // If listResponse is null/undefined or there are no objects, return an empty result
-            if (!listResponse ||
-                !listResponse.Contents ||
-                listResponse.Contents.length === 0) {
-                return {
-                    nodes: [],
-                    hasMore: false
-                };
-            }
-            // Extract node IDs from the keys
-            const nodeIds = listResponse.Contents
-                .filter((object) => object && object.Key)
-                .map((object) => object.Key.replace(this.nounPrefix, '').replace('.json', ''));
-            // Use the cache manager to get nodes efficiently
             const nodes = [];
-
-
-
-
-
-
-
-
-
-
-
-
-            //
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            return null;
+            // Parse cursor (format: "shardIndex:s3ContinuationToken")
+            let startShardIndex = 0;
+            let s3ContinuationToken;
+            if (options.cursor) {
+                const parts = options.cursor.split(':', 2);
+                startShardIndex = parseInt(parts[0]) || 0;
+                s3ContinuationToken = parts[1] || undefined;
+            }
+            // Iterate through shards starting from cursor position
+            for (let shardIndex = startShardIndex; shardIndex < TOTAL_SHARDS; shardIndex++) {
+                const shardId = getShardIdByIndex(shardIndex);
+                const shardPrefix = `${this.nounPrefix}${shardId}/`;
+                // List objects in this shard
+                const listResponse = await this.s3Client.send(new ListObjectsV2Command({
+                    Bucket: this.bucketName,
+                    Prefix: shardPrefix,
+                    MaxKeys: limit - nodes.length,
+                    ContinuationToken: shardIndex === startShardIndex ? s3ContinuationToken : undefined
+                }));
+                // Extract node IDs from keys
+                if (listResponse.Contents && listResponse.Contents.length > 0) {
+                    const nodeIds = listResponse.Contents
+                        .filter((obj) => obj && obj.Key)
+                        .map((obj) => {
+                        // Extract UUID from: entities/nouns/vectors/ab/ab123456-uuid.json
+                        let key = obj.Key;
+                        if (key.startsWith(shardPrefix)) {
+                            key = key.substring(shardPrefix.length);
                         }
-
-
-                    for (const node of batchNodes) {
-                        if (node) {
-                            nodes.push(node);
+                        if (key.endsWith('.json')) {
+                            key = key.substring(0, key.length - 5);
                         }
-
+                        return key;
+                    });
+                    // Load nodes for this shard (use direct loading for pagination scans)
+                    const shardNodes = await this.loadNodesByIds(nodeIds, false);
+                    nodes.push(...shardNodes);
+                }
+                // Check if we've reached the limit
+                if (nodes.length >= limit) {
+                    const hasMore = !!listResponse.IsTruncated || shardIndex < TOTAL_SHARDS - 1;
+                    const nextCursor = listResponse.IsTruncated
+                        ? `${shardIndex}:${listResponse.NextContinuationToken}`
+                        : shardIndex < TOTAL_SHARDS - 1
+                            ? `${shardIndex + 1}:`
+                            : undefined;
+                    return {
+                        nodes: nodes.slice(0, limit),
+                        hasMore,
+                        nextCursor
+                    };
+                }
+                // If this shard has more data but we haven't hit limit, continue to next shard
+                if (listResponse.IsTruncated) {
+                    return {
+                        nodes,
+                        hasMore: true,
+                        nextCursor: `${shardIndex}:${listResponse.NextContinuationToken}`
+                    };
                 }
             }
-            //
-            const hasMore = !!listResponse.IsTruncated;
-            // Set next cursor if there are more nodes
-            const nextCursor = listResponse.NextContinuationToken;
+            // All shards exhausted
             return {
                 nodes,
-                hasMore,
-                nextCursor
+                hasMore: false,
+                nextCursor: undefined
             };
         }
         catch (error) {
@@ -943,6 +965,43 @@ export class S3CompatibleStorage extends BaseStorage {
             };
         }
     }
+    /**
+     * Load nodes by IDs efficiently using cache or direct fetch
+     */
+    async loadNodesByIds(nodeIds, useCache) {
+        const nodes = [];
+        if (useCache) {
+            const cachedNodes = await this.nounCacheManager.getMany(nodeIds);
+            for (const id of nodeIds) {
+                const node = cachedNodes.get(id);
+                if (node) {
+                    nodes.push(node);
+                }
+            }
+        }
+        else {
+            // Load directly in batches
+            const batchSize = 50;
+            for (let i = 0; i < nodeIds.length; i += batchSize) {
+                const batch = nodeIds.slice(i, i + batchSize);
+                const batchNodes = await Promise.all(batch.map(async (id) => {
+                    try {
+                        return await this.getNoun_internal(id);
+                    }
+                    catch (error) {
+                        this.logger.warn(`Failed to load node ${id}:`, error);
+                        return null;
+                    }
+                }));
+                for (const node of batchNodes) {
+                    if (node) {
+                        nodes.push(node);
+                    }
+                }
+            }
+        }
+        return nodes;
+    }
     /**
      * Get nouns by noun type (internal implementation)
      * @param nounType The noun type to filter by
@@ -1098,7 +1157,7 @@ export class S3CompatibleStorage extends BaseStorage {
         try {
             // Import the GetObjectCommand only when needed
             const { GetObjectCommand } = await import('@aws-sdk/client-s3');
-            const key =
+            const key = this.getVerbKey(id);
             this.logger.trace(`Getting edge ${id} from key: ${key}`);
             // Try to get the edge from the verbs directory
             const response = await this.s3Client.send(new GetObjectCommand({
@@ -1572,7 +1631,9 @@ export class S3CompatibleStorage extends BaseStorage {
         try {
             // Import the PutObjectCommand only when needed
             const { PutObjectCommand } = await import('@aws-sdk/client-s3');
-
+            // Use UUID-based sharding for metadata (consistent with noun vectors)
+            const shardId = getShardIdFromUuid(id);
+            const key = `${this.metadataPrefix}${shardId}/${id}.json`;
             const body = JSON.stringify(metadata, null, 2);
             this.logger.trace(`Saving noun metadata for ${id} to key: ${key}`);
             // Save the noun metadata to S3-compatible storage
@@ -1701,7 +1762,9 @@ export class S3CompatibleStorage extends BaseStorage {
         try {
             // Import the GetObjectCommand only when needed
             const { GetObjectCommand } = await import('@aws-sdk/client-s3');
-
+            // Use UUID-based sharding for metadata (consistent with noun vectors)
+            const shardId = getShardIdFromUuid(id);
+            const key = `${this.metadataPrefix}${shardId}/${id}.json`;
             this.logger.trace(`Getting noun metadata for ${id} from key: ${key}`);
             // Try to get the noun metadata
             const response = await this.s3Client.send(new GetObjectCommand({
@@ -2698,12 +2761,54 @@ export class S3CompatibleStorage extends BaseStorage {
                 filteredNodes = filteredByMetadata;
             }
         }
+        // Calculate total count efficiently
+        // For the first page (no cursor), we can estimate total count
+        let totalCount;
+        if (!cursor) {
+            try {
+                totalCount = await this.estimateTotalNounCount();
+            }
+            catch (error) {
+                this.logger.warn('Failed to estimate total noun count:', error);
+                // totalCount remains undefined
+            }
+        }
         return {
             items: filteredNodes,
+            totalCount,
             hasMore: result.hasMore,
             nextCursor: result.nextCursor
         };
     }
+    /**
+     * Estimate total noun count by listing objects across all shards
+     * This is more efficient than loading all nouns
+     */
+    async estimateTotalNounCount() {
+        const { ListObjectsV2Command } = await import('@aws-sdk/client-s3');
+        let totalCount = 0;
+        // Count across all UUID-based shards (00-ff)
+        for (let shardIndex = 0; shardIndex < TOTAL_SHARDS; shardIndex++) {
+            const shardId = getShardIdByIndex(shardIndex);
+            const shardPrefix = `${this.nounPrefix}${shardId}/`;
+            let shardCursor;
+            let hasMore = true;
+            while (hasMore) {
+                const listResponse = await this.s3Client.send(new ListObjectsV2Command({
+                    Bucket: this.bucketName,
+                    Prefix: shardPrefix,
+                    MaxKeys: 1000,
+                    ContinuationToken: shardCursor
+                }));
+                if (listResponse.Contents) {
+                    totalCount += listResponse.Contents.length;
+                }
+                hasMore = !!listResponse.IsTruncated;
+                shardCursor = listResponse.NextContinuationToken;
+            }
+        }
+        return totalCount;
+    }
     /**
      * Initialize counts from S3 storage
      */
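Because existing buckets still hold objects under the old unsharded keys used by the removed lines above, a data migration is required; the package points to .strategy/UNIFIED-UUID-SHARDING.md for its own guidance. The following is only a rough sketch of what such a pass could look like with the AWS SDK: bucket and prefix values are placeholders, and a real run should URL-encode CopySource and add retries and error handling.

```typescript
import {
  S3Client,
  ListObjectsV2Command,
  CopyObjectCommand,
  DeleteObjectCommand
} from '@aws-sdk/client-s3'

// Sketch: move objects from the old unsharded layout (<prefix><uuid>.json)
// to the sharded layout (<prefix><shard>/<uuid>.json).
async function migratePrefix(s3: S3Client, bucket: string, prefix: string): Promise<void> {
  let token: string | undefined
  do {
    const page = await s3.send(new ListObjectsV2Command({
      Bucket: bucket,
      Prefix: prefix,
      ContinuationToken: token
    }))
    for (const obj of page.Contents ?? []) {
      const key = obj.Key
      if (!key) continue
      const rest = key.slice(prefix.length)
      // Skip objects that are already sharded (contain a '/') or are not records.
      if (rest.includes('/') || !rest.endsWith('.json')) continue
      const id = rest.slice(0, -'.json'.length)
      const shardId = id.toLowerCase().replace(/-/g, '').substring(0, 2)
      const newKey = `${prefix}${shardId}/${id}.json`
      await s3.send(new CopyObjectCommand({ Bucket: bucket, CopySource: `${bucket}/${key}`, Key: newKey }))
      await s3.send(new DeleteObjectCommand({ Bucket: bucket, Key: key }))
    }
    token = page.IsTruncated ? page.NextContinuationToken : undefined
  } while (token)
}
```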
package/dist/storage/sharding.d.ts
ADDED
@@ -0,0 +1,103 @@
+/**
+ * Unified UUID-based sharding for all storage adapters
+ *
+ * Uses first 2 hex characters of UUID for consistent, predictable sharding
+ * that scales from hundreds to millions of entities without configuration.
+ *
+ * Sharding characteristics:
+ * - 256 buckets (00-ff)
+ * - Deterministic (same UUID always maps to same shard)
+ * - No configuration required
+ * - Works across all storage types (filesystem, S3, GCS, memory)
+ * - Efficient for list operations and pagination
+ */
+/**
+ * Extract shard ID from UUID
+ *
+ * Uses first 2 hex characters of the UUID as the shard ID.
+ * This provides 256 evenly-distributed buckets (00-ff).
+ *
+ * @param uuid - UUID string (with or without hyphens)
+ * @returns 2-character hex shard ID (00-ff)
+ *
+ * @example
+ * ```typescript
+ * getShardIdFromUuid('ab123456-1234-5678-9abc-def012345678') // returns 'ab'
+ * getShardIdFromUuid('cd987654-4321-8765-cba9-fed543210987') // returns 'cd'
+ * getShardIdFromUuid('00000000-0000-0000-0000-000000000000') // returns '00'
+ * ```
+ */
+export declare function getShardIdFromUuid(uuid: string): string;
+/**
+ * Get all possible shard IDs (00-ff)
+ *
+ * Returns array of 256 shard IDs in ascending order.
+ * Useful for iterating through all shards during pagination.
+ *
+ * @returns Array of 256 shard IDs
+ *
+ * @example
+ * ```typescript
+ * const shards = getAllShardIds()
+ * // ['00', '01', '02', ..., 'fd', 'fe', 'ff']
+ *
+ * for (const shardId of shards) {
+ *   const prefix = `entities/nouns/vectors/${shardId}/`
+ *   // List objects with this prefix
+ * }
+ * ```
+ */
+export declare function getAllShardIds(): string[];
+/**
+ * Get shard ID for a given index (0-255)
+ *
+ * @param index - Shard index (0-255)
+ * @returns 2-character hex shard ID
+ *
+ * @example
+ * ```typescript
+ * getShardIdByIndex(0) // '00'
+ * getShardIdByIndex(15) // '0f'
+ * getShardIdByIndex(255) // 'ff'
+ * ```
+ */
+export declare function getShardIdByIndex(index: number): string;
+/**
+ * Get shard index from shard ID (0-255)
+ *
+ * @param shardId - 2-character hex shard ID
+ * @returns Shard index (0-255)
+ *
+ * @example
+ * ```typescript
+ * getShardIndexFromId('00') // 0
+ * getShardIndexFromId('0f') // 15
+ * getShardIndexFromId('ff') // 255
+ * ```
+ */
+export declare function getShardIndexFromId(shardId: string): number;
+/**
+ * Total number of shards in the system
+ */
+export declare const TOTAL_SHARDS = 256;
+/**
+ * Shard configuration (read-only)
+ */
+export declare const SHARD_CONFIG: {
+    /**
+     * Total number of shards (256)
+     */
+    readonly count: 256;
+    /**
+     * Number of hex characters used for sharding (2)
+     */
+    readonly prefixLength: 2;
+    /**
+     * Sharding method description
+     */
+    readonly method: "uuid-prefix";
+    /**
+     * Whether sharding is always enabled
+     */
+    readonly alwaysEnabled: true;
+};
package/dist/storage/sharding.js
ADDED
@@ -0,0 +1,137 @@
+/**
+ * Unified UUID-based sharding for all storage adapters
+ *
+ * Uses first 2 hex characters of UUID for consistent, predictable sharding
+ * that scales from hundreds to millions of entities without configuration.
+ *
+ * Sharding characteristics:
+ * - 256 buckets (00-ff)
+ * - Deterministic (same UUID always maps to same shard)
+ * - No configuration required
+ * - Works across all storage types (filesystem, S3, GCS, memory)
+ * - Efficient for list operations and pagination
+ */
+/**
+ * Extract shard ID from UUID
+ *
+ * Uses first 2 hex characters of the UUID as the shard ID.
+ * This provides 256 evenly-distributed buckets (00-ff).
+ *
+ * @param uuid - UUID string (with or without hyphens)
+ * @returns 2-character hex shard ID (00-ff)
+ *
+ * @example
+ * ```typescript
+ * getShardIdFromUuid('ab123456-1234-5678-9abc-def012345678') // returns 'ab'
+ * getShardIdFromUuid('cd987654-4321-8765-cba9-fed543210987') // returns 'cd'
+ * getShardIdFromUuid('00000000-0000-0000-0000-000000000000') // returns '00'
+ * ```
+ */
+export function getShardIdFromUuid(uuid) {
+    if (!uuid) {
+        throw new Error('UUID is required for sharding');
+    }
+    // Remove hyphens and convert to lowercase
+    const normalized = uuid.toLowerCase().replace(/-/g, '');
+    // Validate UUID format (32 hex characters)
+    if (normalized.length !== 32) {
+        throw new Error(`Invalid UUID format: ${uuid} (expected 32 hex chars, got ${normalized.length})`);
+    }
+    // Extract first 2 characters
+    const shardId = normalized.substring(0, 2);
+    // Validate hex format
+    if (!/^[0-9a-f]{2}$/.test(shardId)) {
+        throw new Error(`Invalid UUID prefix: ${shardId} (expected 2 hex chars)`);
+    }
+    return shardId;
+}
+/**
+ * Get all possible shard IDs (00-ff)
+ *
+ * Returns array of 256 shard IDs in ascending order.
+ * Useful for iterating through all shards during pagination.
+ *
+ * @returns Array of 256 shard IDs
+ *
+ * @example
+ * ```typescript
+ * const shards = getAllShardIds()
+ * // ['00', '01', '02', ..., 'fd', 'fe', 'ff']
+ *
+ * for (const shardId of shards) {
+ *   const prefix = `entities/nouns/vectors/${shardId}/`
+ *   // List objects with this prefix
+ * }
+ * ```
+ */
+export function getAllShardIds() {
+    const shards = [];
+    for (let i = 0; i < 256; i++) {
+        shards.push(i.toString(16).padStart(2, '0'));
+    }
+    return shards;
+}
+/**
+ * Get shard ID for a given index (0-255)
+ *
+ * @param index - Shard index (0-255)
+ * @returns 2-character hex shard ID
+ *
+ * @example
+ * ```typescript
+ * getShardIdByIndex(0) // '00'
+ * getShardIdByIndex(15) // '0f'
+ * getShardIdByIndex(255) // 'ff'
+ * ```
+ */
+export function getShardIdByIndex(index) {
+    if (index < 0 || index > 255) {
+        throw new Error(`Shard index out of range: ${index} (expected 0-255)`);
+    }
+    return index.toString(16).padStart(2, '0');
+}
+/**
+ * Get shard index from shard ID (0-255)
+ *
+ * @param shardId - 2-character hex shard ID
+ * @returns Shard index (0-255)
+ *
+ * @example
+ * ```typescript
+ * getShardIndexFromId('00') // 0
+ * getShardIndexFromId('0f') // 15
+ * getShardIndexFromId('ff') // 255
+ * ```
+ */
+export function getShardIndexFromId(shardId) {
+    if (!/^[0-9a-f]{2}$/.test(shardId)) {
+        throw new Error(`Invalid shard ID: ${shardId} (expected 2 hex chars)`);
+    }
+    return parseInt(shardId, 16);
+}
+/**
+ * Total number of shards in the system
+ */
+export const TOTAL_SHARDS = 256;
+/**
+ * Shard configuration (read-only)
+ */
+export const SHARD_CONFIG = {
+    /**
+     * Total number of shards (256)
+     */
+    count: TOTAL_SHARDS,
+    /**
+     * Number of hex characters used for sharding (2)
+     */
+    prefixLength: 2,
+    /**
+     * Sharding method description
+     */
+    method: 'uuid-prefix',
+    /**
+     * Whether sharding is always enabled
+     */
+    alwaysEnabled: true
+};
+//# sourceMappingURL=sharding.js.map
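A short usage sketch of the new sharding helpers, matching how the adapters above consume them; the relative import is the one used inside dist/storage, so adjust the path to wherever you reach the package internals from.

```typescript
import {
  getShardIdFromUuid,
  getAllShardIds,
  getShardIdByIndex,
  getShardIndexFromId,
  TOTAL_SHARDS
} from './sharding.js'

const id = 'ab123456-1234-5678-9abc-def012345678'
const shard = getShardIdFromUuid(id)             // 'ab' (first 2 hex chars)
const index = getShardIndexFromId(shard)         // 171
console.log(getShardIdByIndex(index) === shard)  // true: shard ID and index round-trip

// Enumerate every shard prefix, e.g. to list all noun vectors in S3/GCS/R2:
for (const shardId of getAllShardIds()) {
  const prefix = `entities/nouns/vectors/${shardId}/`
  // ...list objects under `prefix`
}
console.log(TOTAL_SHARDS) // 256
```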
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@soulcraft/brainy",
-  "version": "3.25.2",
+  "version": "3.26.0",
   "description": "Universal Knowledge Protocol™ - World's first Triple Intelligence database unifying vector, graph, and document search in one API. 31 nouns × 40 verbs for infinite expressiveness.",
   "main": "dist/index.js",
   "module": "dist/index.js",