@soulcraft/brainy 0.32.0 → 0.33.0
This diff shows the contents of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- package/README.md +495 -400
- package/dist/brainyData.d.ts +115 -0
- package/dist/coreTypes.d.ts +26 -0
- package/dist/storage/adapters/baseStorageAdapter.d.ts +16 -0
- package/dist/storage/adapters/baseStorageAdapter.d.ts.map +1 -1
- package/dist/storage/cacheManager.d.ts +27 -13
- package/dist/storage/cacheManager.d.ts.map +1 -1
- package/dist/storage/storageFactory.d.ts +44 -0
- package/dist/storage/storageFactory.d.ts.map +1 -1
- package/dist/unified.js +823 -56
- package/dist/unified.min.js +747 -747
- package/dist/utils/fieldNameTracking.d.ts +21 -0
- package/dist/utils/fieldNameTracking.d.ts.map +1 -0
- package/dist/utils/index.d.ts +2 -0
- package/dist/utils/index.d.ts.map +1 -1
- package/dist/utils/jsonProcessing.d.ts +43 -0
- package/dist/utils/jsonProcessing.d.ts.map +1 -0
- package/package.json +1 -1
package/dist/unified.js
CHANGED
@@ -4432,6 +4432,275 @@ async function getStatistics(instance, options = {}) {
     }
 }
 
+/**
+ * Utility functions for processing JSON documents for vectorization and search
+ */
+/**
+ * Extracts text from a JSON object for vectorization
+ * This function recursively processes the JSON object and extracts text from all fields
+ * It can also prioritize specific fields if provided
+ *
+ * @param jsonObject The JSON object to extract text from
+ * @param options Configuration options for text extraction
+ * @returns A string containing the extracted text
+ */
+function extractTextFromJson(jsonObject, options = {}) {
+    // Set default options
+    const { priorityFields = [], excludeFields = [], includeFieldNames = true, maxDepth = 5, currentDepth = 0, fieldPath = [] } = options;
+    // If input is not an object or array, or we've reached max depth, return as string
+    if (jsonObject === null ||
+        jsonObject === undefined ||
+        typeof jsonObject !== 'object' ||
+        currentDepth >= maxDepth) {
+        return String(jsonObject || '');
+    }
+    const extractedText = [];
+    const priorityText = [];
+    // Process arrays
+    if (Array.isArray(jsonObject)) {
+        for (let i = 0; i < jsonObject.length; i++) {
+            const value = jsonObject[i];
+            const newPath = [...fieldPath, i.toString()];
+            // Recursively extract text from array items
+            const itemText = extractTextFromJson(value, {
+                priorityFields,
+                excludeFields,
+                includeFieldNames,
+                maxDepth,
+                currentDepth: currentDepth + 1,
+                fieldPath: newPath
+            });
+            if (itemText) {
+                extractedText.push(itemText);
+            }
+        }
+    }
+    // Process objects
+    else {
+        for (const [key, value] of Object.entries(jsonObject)) {
+            // Skip excluded fields
+            if (excludeFields.includes(key)) {
+                continue;
+            }
+            const newPath = [...fieldPath, key];
+            const fullPath = newPath.join('.');
+            // Check if this is a priority field
+            const isPriority = priorityFields.some(field => {
+                // Exact match
+                if (field === key)
+                    return true;
+                // Path match
+                if (field === fullPath)
+                    return true;
+                // Wildcard match (e.g., "user.*" matches "user.name", "user.email", etc.)
+                if (field.endsWith('.*') && fullPath.startsWith(field.slice(0, -2)))
+                    return true;
+                return false;
+            });
+            // Get the field value as text
+            let fieldText;
+            if (typeof value === 'object' && value !== null) {
+                // Recursively extract text from nested objects
+                fieldText = extractTextFromJson(value, {
+                    priorityFields,
+                    excludeFields,
+                    includeFieldNames,
+                    maxDepth,
+                    currentDepth: currentDepth + 1,
+                    fieldPath: newPath
+                });
+            }
+            else {
+                fieldText = String(value || '');
+            }
+            // Add field name if requested
+            if (includeFieldNames && fieldText) {
+                fieldText = `${key}: ${fieldText}`;
+            }
+            // Add to appropriate collection
+            if (fieldText) {
+                if (isPriority) {
+                    priorityText.push(fieldText);
+                }
+                else {
+                    extractedText.push(fieldText);
+                }
+            }
+        }
+    }
+    // Combine priority text (repeated for emphasis) and regular text
+    return [...priorityText, ...priorityText, ...extractedText].join(' ');
+}
+/**
+ * Prepares a JSON document for vectorization
+ * This function extracts text from the JSON document and formats it for optimal vectorization
+ *
+ * @param jsonDocument The JSON document to prepare
+ * @param options Configuration options for preparation
+ * @returns A string ready for vectorization
+ */
+function prepareJsonForVectorization(jsonDocument, options = {}) {
+    // If input is a string, try to parse it as JSON
+    let document = jsonDocument;
+    if (typeof jsonDocument === 'string') {
+        try {
+            document = JSON.parse(jsonDocument);
+        }
+        catch (e) {
+            // If parsing fails, treat it as a plain string
+            return jsonDocument;
+        }
+    }
+    // If not an object after parsing, return as is
+    if (typeof document !== 'object' || document === null) {
+        return String(document || '');
+    }
+    // Extract text from the document
+    return extractTextFromJson(document, options);
+}
+/**
+ * Extracts text from a specific field in a JSON document
+ * This is useful for searching within specific fields
+ *
+ * @param jsonDocument The JSON document to extract from
+ * @param fieldPath The path to the field (e.g., "user.name" or "addresses[0].city")
+ * @returns The extracted text or empty string if field not found
+ */
+function extractFieldFromJson(jsonDocument, fieldPath) {
+    // If input is a string, try to parse it as JSON
+    let document = jsonDocument;
+    if (typeof jsonDocument === 'string') {
+        try {
+            document = JSON.parse(jsonDocument);
+        }
+        catch (e) {
+            // If parsing fails, return empty string
+            return '';
+        }
+    }
+    // If not an object after parsing, return empty string
+    if (typeof document !== 'object' || document === null) {
+        return '';
+    }
+    // Parse the field path
+    const parts = fieldPath.split('.');
+    let current = document;
+    // Navigate through the path
+    for (const part of parts) {
+        // Handle array indexing (e.g., "addresses[0]")
+        const match = part.match(/^([^[]+)(?:\[(\d+)\])?$/);
+        if (!match) {
+            return '';
+        }
+        const [, key, indexStr] = match;
+        // Move to the next level
+        current = current[key];
+        // If we have an array index, access that element
+        if (indexStr !== undefined && Array.isArray(current)) {
+            const index = parseInt(indexStr, 10);
+            current = current[index];
+        }
+        // If we've reached a null or undefined value, return empty string
+        if (current === null || current === undefined) {
+            return '';
+        }
+    }
+    // Convert the final value to string
+    return typeof current === 'object'
+        ? JSON.stringify(current)
+        : String(current);
+}
+
+/**
+ * Utility functions for tracking and managing field names in JSON documents
+ */
+/**
+ * Extracts field names from a JSON document
+ * @param jsonObject The JSON object to extract field names from
+ * @param options Configuration options
+ * @returns An array of field paths (e.g., "user.name", "addresses[0].city")
+ */
+function extractFieldNamesFromJson(jsonObject, options = {}) {
+    const { maxDepth = 5, currentDepth = 0, currentPath = '', fieldNames = new Set() } = options;
+    if (jsonObject === null ||
+        jsonObject === undefined ||
+        typeof jsonObject !== 'object' ||
+        currentDepth >= maxDepth) {
+        return Array.from(fieldNames);
+    }
+    if (Array.isArray(jsonObject)) {
+        // For arrays, we'll just check the first item to avoid explosion of paths
+        if (jsonObject.length > 0) {
+            const arrayPath = currentPath ? `${currentPath}[0]` : '[0]';
+            extractFieldNamesFromJson(jsonObject[0], {
+                maxDepth,
+                currentDepth: currentDepth + 1,
+                currentPath: arrayPath,
+                fieldNames
+            });
+        }
+    }
+    else {
+        // For objects, process each property
+        for (const key of Object.keys(jsonObject)) {
+            const value = jsonObject[key];
+            const fieldPath = currentPath ? `${currentPath}.${key}` : key;
+            // Add this field path
+            fieldNames.add(fieldPath);
+            // Recursively process nested objects
+            if (typeof value === 'object' && value !== null) {
+                extractFieldNamesFromJson(value, {
+                    maxDepth,
+                    currentDepth: currentDepth + 1,
+                    currentPath: fieldPath,
+                    fieldNames
+                });
+            }
+        }
+    }
+    return Array.from(fieldNames);
+}
+/**
+ * Maps field names to standard field names based on common patterns
+ * @param fieldName The field name to map
+ * @returns The standard field name if a match is found, or null if no match
+ */
+function mapToStandardField(fieldName) {
+    // Standard field mappings
+    const standardMappings = {
+        'title': ['title', 'name', 'headline', 'subject'],
+        'description': ['description', 'summary', 'content', 'text', 'body'],
+        'author': ['author', 'creator', 'user', 'owner', 'by'],
+        'date': ['date', 'created', 'createdAt', 'timestamp', 'published'],
+        'url': ['url', 'link', 'href', 'source'],
+        'image': ['image', 'thumbnail', 'photo', 'picture'],
+        'tags': ['tags', 'categories', 'keywords', 'topics']
+    };
+    // Check for matches
+    for (const [standardField, possibleMatches] of Object.entries(standardMappings)) {
+        // Exact match
+        if (possibleMatches.includes(fieldName)) {
+            return standardField;
+        }
+        // Path match (e.g., "user.name" matches "name")
+        const parts = fieldName.split('.');
+        const lastPart = parts[parts.length - 1];
+        if (possibleMatches.includes(lastPart)) {
+            return standardField;
+        }
+        // Array match (e.g., "items[0].name" matches "name")
+        if (fieldName.includes('[')) {
+            for (const part of parts) {
+                const cleanPart = part.split('[')[0];
+                if (possibleMatches.includes(cleanPart)) {
+                    return standardField;
+                }
+            }
+        }
+    }
+    return null;
+}
+
 /**
  * HNSW (Hierarchical Navigable Small World) Index implementation
  * Based on the paper: "Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs"
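Taken together, these helpers flatten a JSON document into a weighted text string for embedding and normalize its field names. The following is a minimal behavioral sketch; the functions are module-internal to `unified.js`, so these direct calls are illustrative rather than a public API:

```js
// Hypothetical usage sketch of the internal helpers shown above.
const doc = {
  name: 'Ada Lovelace',
  profile: { bio: 'Mathematician', links: { homepage: 'https://example.com' } },
  tags: ['math', 'computing']
}

// Priority fields are emitted twice, so "name" dominates the embedded text:
// "name: Ada Lovelace name: Ada Lovelace profile: bio: Mathematician
//  links: homepage: https://example.com tags: math computing"
const text = extractTextFromJson(doc, { priorityFields: ['name'] })

// Field paths are dot-joined; only object properties produce paths, and
// arrays are sampled at index [0] to avoid a path explosion:
// ['name', 'profile', 'profile.bio', 'profile.links', 'profile.links.homepage', 'tags']
const paths = extractFieldNamesFromJson(doc)

mapToStandardField('name')        // 'title' ('name' is an alias in the title list)
mapToStandardField('profile.bio') // null   ('bio' appears in no alias list)
```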
@@ -5708,6 +5977,107 @@ class BaseStorageAdapter {
         // Call the protected flushStatistics method to immediately write to storage
         await this.flushStatistics();
     }
+    /**
+     * Track field names from a JSON document
+     * @param jsonDocument The JSON document to extract field names from
+     * @param service The service that inserted the data
+     */
+    async trackFieldNames(jsonDocument, service) {
+        // Skip if not a JSON object
+        if (typeof jsonDocument !== 'object' || jsonDocument === null || Array.isArray(jsonDocument)) {
+            return;
+        }
+        // Get current statistics from cache or storage
+        let statistics = this.statisticsCache;
+        if (!statistics) {
+            statistics = await this.getStatisticsData();
+            if (!statistics) {
+                statistics = this.createDefaultStatistics();
+            }
+            // Update the cache
+            this.statisticsCache = {
+                ...statistics,
+                nounCount: { ...statistics.nounCount },
+                verbCount: { ...statistics.verbCount },
+                metadataCount: { ...statistics.metadataCount },
+                fieldNames: { ...statistics.fieldNames },
+                standardFieldMappings: { ...statistics.standardFieldMappings }
+            };
+        }
+        // Ensure fieldNames exists
+        if (!this.statisticsCache.fieldNames) {
+            this.statisticsCache.fieldNames = {};
+        }
+        // Ensure standardFieldMappings exists
+        if (!this.statisticsCache.standardFieldMappings) {
+            this.statisticsCache.standardFieldMappings = {};
+        }
+        // Extract field names from the JSON document
+        const fieldNames = extractFieldNamesFromJson(jsonDocument);
+        // Initialize service entry if it doesn't exist
+        if (!this.statisticsCache.fieldNames[service]) {
+            this.statisticsCache.fieldNames[service] = [];
+        }
+        // Add new field names to the service's list
+        for (const fieldName of fieldNames) {
+            if (!this.statisticsCache.fieldNames[service].includes(fieldName)) {
+                this.statisticsCache.fieldNames[service].push(fieldName);
+            }
+            // Map to standard field if possible
+            const standardField = mapToStandardField(fieldName);
+            if (standardField) {
+                // Initialize standard field entry if it doesn't exist
+                if (!this.statisticsCache.standardFieldMappings[standardField]) {
+                    this.statisticsCache.standardFieldMappings[standardField] = {};
+                }
+                // Initialize service entry if it doesn't exist
+                if (!this.statisticsCache.standardFieldMappings[standardField][service]) {
+                    this.statisticsCache.standardFieldMappings[standardField][service] = [];
+                }
+                // Add field name to standard field mapping if not already there
+                if (!this.statisticsCache.standardFieldMappings[standardField][service].includes(fieldName)) {
+                    this.statisticsCache.standardFieldMappings[standardField][service].push(fieldName);
+                }
+            }
+        }
+        // Update timestamp
+        this.statisticsCache.lastUpdated = new Date().toISOString();
+        // Schedule a batch update
+        this.statisticsModified = true;
+        this.scheduleBatchUpdate();
+    }
+    /**
+     * Get available field names by service
+     * @returns Record of field names by service
+     */
+    async getAvailableFieldNames() {
+        // Get current statistics from cache or storage
+        let statistics = this.statisticsCache;
+        if (!statistics) {
+            statistics = await this.getStatisticsData();
+            if (!statistics) {
+                return {};
+            }
+        }
+        // Return field names by service
+        return statistics.fieldNames || {};
+    }
+    /**
+     * Get standard field mappings
+     * @returns Record of standard field mappings
+     */
+    async getStandardFieldMappings() {
+        // Get current statistics from cache or storage
+        let statistics = this.statisticsCache;
+        if (!statistics) {
+            statistics = await this.getStatisticsData();
+            if (!statistics) {
+                return {};
+            }
+        }
+        // Return standard field mappings
+        return statistics.standardFieldMappings || {};
+    }
     /**
      * Create default statistics data
      * @returns Default statistics data
@@ -5718,6 +6088,8 @@ class BaseStorageAdapter {
             verbCount: {},
             metadataCount: {},
             hnswIndexSize: 0,
+            fieldNames: {},
+            standardFieldMappings: {},
             lastUpdated: new Date().toISOString()
         };
     }
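Both new buckets are nested records keyed first by service, then by field name. A sketch of the persisted shape with made-up values, assuming one document has been tracked by a hypothetical service named `crm-import`:

```js
// Illustrative only: a statistics record after trackFieldNames(doc, 'crm-import').
// The surrounding fields mirror createDefaultStatistics() above.
const statistics = {
  nounCount: {},
  verbCount: {},
  metadataCount: {},
  hnswIndexSize: 0,
  fieldNames: {
    // every dot-path seen in documents inserted by this service
    'crm-import': ['name', 'company', 'contact', 'contact.email']
  },
  standardFieldMappings: {
    // only 'name' matches an alias list ('name' -> standard 'title'),
    // so it is the only entry recorded here
    title: { 'crm-import': ['name'] }
  },
  lastUpdated: '2025-01-01T00:00:00.000Z'
}
```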
@@ -8055,6 +8427,7 @@ var StorageType;
     StorageType[StorageType["OPFS"] = 1] = "OPFS";
     StorageType[StorageType["FILESYSTEM"] = 2] = "FILESYSTEM";
     StorageType[StorageType["S3"] = 3] = "S3";
+    StorageType[StorageType["REMOTE_API"] = 4] = "REMOTE_API";
 })(StorageType || (StorageType = {}));
 /**
  * Multi-level cache manager for efficient data access
@@ -8078,6 +8451,8 @@ class CacheManager {
         this.lastAutoTuneTime = 0;
         this.autoTuneInterval = 5 * 60 * 1000; // 5 minutes
         this.storageStatistics = null;
+        // Store options for later reference
+        this.options = options;
         // Detect environment
         this.environment = this.detectEnvironment();
         // Set storage types based on environment
@@ -8127,13 +8502,26 @@ class CacheManager {
         }
     }
     /**
-     * Detect the optimal cache size based on available memory
+     * Detect the optimal cache size based on available memory and operating mode
+     *
+     * Enhanced to better handle large datasets in S3 or other storage:
+     * - Increases cache size for read-only mode
+     * - Adjusts based on total dataset size when available
+     * - Provides more aggressive caching for large datasets
+     * - Optimizes memory usage based on environment
      */
     detectOptimalCacheSize() {
         try {
             // Default to a conservative value
             const defaultSize = 1000;
-            //
+            // Get the total dataset size if available
+            const totalItems = this.storageStatistics ?
+                (this.storageStatistics.totalNodes || 0) + (this.storageStatistics.totalEdges || 0) : 0;
+            // Determine if we're dealing with a large dataset (>100K items)
+            const isLargeDataset = totalItems > 100000;
+            // Check if we're in read-only mode (from parent BrainyData instance)
+            const isReadOnly = this.options?.readOnly || false;
+            // In Node.js, use available system memory with enhanced allocation
             if (this.environment === Environment$1.NODE) {
                 try {
                     // Use dynamic import to avoid ESLint warning
@@ -8142,12 +8530,36 @@ class CacheManager {
                         return require('os');
                     };
                     const os = getOS();
+                    const totalMemory = os.totalmem();
                     const freeMemory = os.freemem();
                     // Estimate average entry size (in bytes)
                     // This is a conservative estimate for complex objects with vectors
                     const ESTIMATED_BYTES_PER_ENTRY = 1024; // 1KB per entry
-                    //
-
+                    // Base memory percentage - 10% by default
+                    let memoryPercentage = 0.1;
+                    // Adjust based on operating mode and dataset size
+                    if (isReadOnly) {
+                        // In read-only mode, we can use more memory for caching
+                        memoryPercentage = 0.25; // 25% of free memory
+                        // For large datasets in read-only mode, be even more aggressive
+                        if (isLargeDataset) {
+                            memoryPercentage = 0.4; // 40% of free memory
+                        }
+                    }
+                    else if (isLargeDataset) {
+                        // For large datasets in normal mode, increase slightly
+                        memoryPercentage = 0.15; // 15% of free memory
+                    }
+                    // Calculate optimal size based on adjusted percentage
+                    const optimalSize = Math.max(Math.floor(freeMemory * memoryPercentage / ESTIMATED_BYTES_PER_ENTRY), 1000);
+                    // If we know the total dataset size, cap at a reasonable percentage
+                    if (totalItems > 0) {
+                        // In read-only mode, we can cache a larger percentage
+                        const maxPercentage = isReadOnly ? 0.5 : 0.3;
+                        const maxItems = Math.ceil(totalItems * maxPercentage);
+                        // Return the smaller of the two to avoid excessive memory usage
+                        return Math.min(optimalSize, maxItems);
+                    }
                     return optimalSize;
                 }
                 catch (error) {
@@ -8155,10 +8567,36 @@ class CacheManager {
                     return defaultSize;
                 }
             }
-            // In browser, use navigator.deviceMemory
+            // In browser, use navigator.deviceMemory with enhanced allocation
             if (this.environment === Environment$1.BROWSER && navigator.deviceMemory) {
-                //
-
+                // Base entries per GB
+                let entriesPerGB = 500;
+                // Adjust based on operating mode and dataset size
+                if (isReadOnly) {
+                    entriesPerGB = 800; // More aggressive caching in read-only mode
+                    if (isLargeDataset) {
+                        entriesPerGB = 1000; // Even more aggressive for large datasets
+                    }
+                }
+                else if (isLargeDataset) {
+                    entriesPerGB = 600; // Slightly more aggressive for large datasets
+                }
+                // Calculate based on device memory
+                const browserCacheSize = Math.max(navigator.deviceMemory * entriesPerGB, 1000);
+                // If we know the total dataset size, cap at a reasonable percentage
+                if (totalItems > 0) {
+                    // In read-only mode, we can cache a larger percentage
+                    const maxPercentage = isReadOnly ? 0.4 : 0.25;
+                    const maxItems = Math.ceil(totalItems * maxPercentage);
+                    // Return the smaller of the two to avoid excessive memory usage
+                    return Math.min(browserCacheSize, maxItems);
+                }
+                return browserCacheSize;
+            }
+            // For worker environments or when memory detection fails
+            if (this.environment === Environment$1.WORKER) {
+                // Workers typically have limited memory, be conservative
+                return isReadOnly ? 2000 : 1000;
             }
             return defaultSize;
         }
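To make the Node.js branch concrete, here is one pass through the arithmetic under assumed numbers (8 GiB of free memory, read-only mode, a 500,000-item dataset):

```js
// Worked example with assumed inputs; mirrors the Node.js branch above.
const freeMemory = 8 * 1024 ** 3              // 8 GiB free
const memoryPercentage = 0.4                  // read-only + large dataset branch
const ESTIMATED_BYTES_PER_ENTRY = 1024
const optimalSize = Math.max(
  Math.floor(freeMemory * memoryPercentage / ESTIMATED_BYTES_PER_ENTRY),
  1000
)                                             // 3,355,443 entries by memory alone
const maxItems = Math.ceil(500000 * 0.5)      // read-only cap: 50% of the dataset
Math.min(optimalSize, maxItems)               // => 250,000: the dataset cap wins
```

With plentiful memory the dataset-percentage cap is what actually bounds the cache, which is why the cap exists at all.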
@@ -8220,30 +8658,57 @@ class CacheManager {
         }
     }
     /**
-     * Tune hot cache size based on statistics and
+     * Tune hot cache size based on statistics, environment, and operating mode
      *
      * The hot cache size is tuned based on:
      * 1. Available memory in the current environment
      * 2. Total number of nodes and edges in the system
      * 3. Cache hit/miss ratio
+     * 4. Operating mode (read-only vs. read-write)
+     * 5. Storage type (S3, filesystem, memory)
      *
-     *
-     * - Start with a size based on available memory
-     * -
-     * -
-     * -
+     * Enhanced algorithm:
+     * - Start with a size based on available memory and operating mode
+     * - For large datasets in S3 or other remote storage, use more aggressive caching
+     * - Adjust based on access patterns (read-heavy vs. write-heavy)
+     * - For read-only mode, prioritize cache size over eviction speed
+     * - Dynamically adjust based on hit/miss ratio and query patterns
      */
     tuneHotCacheSize() {
         // Start with the base size from environment detection
         let optimalSize = this.detectOptimalCacheSize();
+        // Check if we're in read-only mode
+        const isReadOnly = this.options?.readOnly || false;
+        // Check if we're using S3 or other remote storage
+        const isRemoteStorage = this.coldStorageType === StorageType.S3 ||
+            this.coldStorageType === StorageType.REMOTE_API;
         // If we have storage statistics, adjust based on total nodes/edges
         if (this.storageStatistics) {
             const totalItems = (this.storageStatistics.totalNodes || 0) +
                 (this.storageStatistics.totalEdges || 0);
             // If total items is significant, adjust cache size
             if (totalItems > 0) {
-                //
-
+                // Base percentage to cache - adjusted based on mode and storage
+                let percentageToCache = 0.2; // Cache 20% of items by default
+                // For read-only mode, increase cache percentage
+                if (isReadOnly) {
+                    percentageToCache = 0.3; // 30% for read-only mode
+                    // For remote storage in read-only mode, be even more aggressive
+                    if (isRemoteStorage) {
+                        percentageToCache = 0.4; // 40% for remote storage in read-only mode
+                    }
+                }
+                // For remote storage in normal mode, increase slightly
+                else if (isRemoteStorage) {
+                    percentageToCache = 0.25; // 25% for remote storage
+                }
+                // For large datasets, cap the percentage to avoid excessive memory usage
+                if (totalItems > 1000000) { // Over 1 million items
+                    percentageToCache = Math.min(percentageToCache, 0.15);
+                }
+                else if (totalItems > 100000) { // Over 100K items
+                    percentageToCache = Math.min(percentageToCache, 0.25);
+                }
                 const statisticsBasedSize = Math.ceil(totalItems * percentageToCache);
                 // Use the smaller of the two to avoid memory issues
                 optimalSize = Math.min(optimalSize, statisticsBasedSize);
@@ -8253,16 +8718,57 @@ class CacheManager {
         const totalAccesses = this.stats.hits + this.stats.misses;
         if (totalAccesses > 100) {
             const hitRatio = this.stats.hits / totalAccesses;
-            //
+            // Base adjustment factor
+            let hitRatioFactor = 1.0;
             // If hit ratio is low, we might need a larger cache
             if (hitRatio < 0.5) {
-                //
-                const
+                // Calculate adjustment factor based on hit ratio
+                const baseAdjustment = 0.5 - hitRatio;
+                // For read-only mode or remote storage, be more aggressive
+                if (isReadOnly || isRemoteStorage) {
+                    hitRatioFactor = 1 + (baseAdjustment * 1.5); // Up to 75% increase
+                }
+                else {
+                    hitRatioFactor = 1 + baseAdjustment; // Up to 50% increase
+                }
+                optimalSize = Math.ceil(optimalSize * hitRatioFactor);
+            }
+            // If hit ratio is very high, we might be able to reduce cache size slightly
+            else if (hitRatio > 0.9 && !isReadOnly && !isRemoteStorage) {
+                // Only reduce cache size in normal mode with local storage
+                // and only if hit ratio is very high
+                hitRatioFactor = 0.9; // 10% reduction
                 optimalSize = Math.ceil(optimalSize * hitRatioFactor);
             }
         }
-        //
-
+        // Check for operation patterns if available
+        if (this.storageStatistics?.operations) {
+            const ops = this.storageStatistics.operations;
+            const totalOps = ops.total || 1;
+            // Calculate read/write ratio
+            const readOps = (ops.search || 0) + (ops.get || 0);
+            (ops.add || 0) + (ops.update || 0) + (ops.delete || 0);
+            if (totalOps > 100) {
+                const readRatio = readOps / totalOps;
+                // For read-heavy workloads, increase cache size
+                if (readRatio > 0.8) {
+                    // More aggressive for remote storage
+                    const readAdjustment = isRemoteStorage ? 1.3 : 1.2;
+                    optimalSize = Math.ceil(optimalSize * readAdjustment);
+                }
+            }
+        }
+        // Ensure we have a reasonable minimum size based on environment and mode
+        let minSize = 1000; // Default minimum
+        // For read-only mode, use a higher minimum
+        if (isReadOnly) {
+            minSize = 2000;
+        }
+        // For remote storage, use an even higher minimum
+        if (isRemoteStorage) {
+            minSize = isReadOnly ? 3000 : 2000;
+        }
+        optimalSize = Math.max(optimalSize, minSize);
         // Update the hot cache max size
         this.hotCacheMaxSize = optimalSize;
         this.stats.maxSize = optimalSize;
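One detail worth noting when reading the tuning math: the line after `readOps` sums the add/update/delete counts as a bare expression statement, so it is never assigned anywhere and only search and get operations actually feed `readRatio`. A worked pass through the sizing under assumed conditions (read-only instance over S3, 500,000 items, a cold cache with a 30% hit ratio):

```js
// Worked example with assumed inputs; mirrors tuneHotCacheSize() above.
let percentageToCache = 0.4                           // read-only + remote storage
percentageToCache = Math.min(percentageToCache, 0.25) // >100K-item cap applies
let optimalSize = Math.min(
  250000,                                             // from detectOptimalCacheSize()
  Math.ceil(500000 * percentageToCache)               // 125,000
)
// Hit ratio 0.3 < 0.5, read-only/remote branch: 1 + (0.5 - 0.3) * 1.5 = 1.3
optimalSize = Math.ceil(optimalSize * 1.3)            // 162,500
Math.max(optimalSize, 3000)                           // read-only remote floor: still 162,500
```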
@@ -8363,7 +8869,7 @@ class CacheManager {
         this.warmCacheTTL = ttl;
     }
     /**
-     * Tune batch size based on statistics and
+     * Tune batch size based on environment, statistics, and operating mode
      *
      * The batch size determines how many items are processed in a single batch
      * for operations like prefetching. It is tuned based on:
@@ -8371,42 +8877,93 @@ class CacheManager {
      * 2. Available memory
      * 3. Operation patterns
      * 4. Cache hit/miss ratio
+     * 5. Operating mode (read-only vs. read-write)
+     * 6. Storage type (S3, filesystem, memory)
+     * 7. Dataset size
      *
-     *
+     * Enhanced algorithm:
      * - Start with a default based on the environment
-     * -
-     * - For
-     * -
-     * -
+     * - For large datasets in S3 or other remote storage, use larger batches
+     * - For read-only mode, use larger batches to improve throughput
+     * - Dynamically adjust based on network latency and throughput
+     * - Balance between memory usage and performance
      */
     tuneBatchSize() {
         // Default batch size
         let batchSize = 10;
-        //
+        // Check if we're in read-only mode
+        const isReadOnly = this.options?.readOnly || false;
+        // Check if we're using S3 or other remote storage
+        const isRemoteStorage = this.coldStorageType === StorageType.S3 ||
+            this.coldStorageType === StorageType.REMOTE_API;
+        // Get the total dataset size if available
+        const totalItems = this.storageStatistics ?
+            (this.storageStatistics.totalNodes || 0) + (this.storageStatistics.totalEdges || 0) : 0;
+        // Determine if we're dealing with a large dataset
+        const isLargeDataset = totalItems > 100000;
+        const isVeryLargeDataset = totalItems > 1000000;
+        // Base batch size adjustment based on environment
         if (this.environment === Environment$1.NODE) {
             // Node.js can handle larger batches
-            batchSize = 20;
+            batchSize = isReadOnly ? 30 : 20;
+            // For remote storage, increase batch size
+            if (isRemoteStorage) {
+                batchSize = isReadOnly ? 50 : 30;
+            }
+            // For large datasets, adjust batch size
+            if (isLargeDataset) {
+                batchSize = Math.min(100, batchSize * 1.5);
+            }
+            // For very large datasets, adjust even more
+            if (isVeryLargeDataset) {
+                batchSize = Math.min(200, batchSize * 2);
+            }
         }
         else if (this.environment === Environment$1.BROWSER) {
             // Browsers might need smaller batches
-            batchSize = 10;
+            batchSize = isReadOnly ? 15 : 10;
             // If we have memory information, adjust accordingly
             if (navigator.deviceMemory) {
                 // Scale batch size with available memory
-
+                const memoryFactor = isReadOnly ? 3 : 2;
+                batchSize = Math.max(5, Math.min(30, Math.floor(navigator.deviceMemory * memoryFactor)));
+                // For large datasets, adjust based on memory
+                if (isLargeDataset && navigator.deviceMemory > 4) {
+                    batchSize = Math.min(50, batchSize * 1.5);
+                }
             }
         }
+        else if (this.environment === Environment$1.WORKER) {
+            // Workers can handle moderate batch sizes
+            batchSize = isReadOnly ? 20 : 15;
+        }
         // If we have storage statistics with operation counts, adjust based on operation patterns
         if (this.storageStatistics && this.storageStatistics.operations) {
             const ops = this.storageStatistics.operations;
             const totalOps = ops.total || 1;
-            const
+            const searchOps = (ops.search || 0);
+            const getOps = (ops.get || 0);
             if (totalOps > 100) {
-
-
-
-
-
+                // Calculate search and get ratios
+                const searchRatio = searchOps / totalOps;
+                const getRatio = getOps / totalOps;
+                // For search-heavy workloads, use larger batch size
+                if (searchRatio > 0.6) {
+                    // Search-heavy, increase batch size
+                    const searchFactor = isRemoteStorage ? 1.8 : 1.5;
+                    batchSize = Math.min(isRemoteStorage ? 200 : 100, Math.ceil(batchSize * searchFactor));
+                }
+                // For get-heavy workloads, adjust batch size
+                if (getRatio > 0.6) {
+                    // Get-heavy, adjust batch size based on storage type
+                    if (isRemoteStorage) {
+                        // For remote storage, larger batches reduce network overhead
+                        batchSize = Math.min(150, Math.ceil(batchSize * 1.5));
+                    }
+                    else {
+                        // For local storage, smaller batches might be more efficient
+                        batchSize = Math.max(10, Math.ceil(batchSize * 0.9));
+                    }
                 }
             }
         }
@@ -8414,17 +8971,46 @@ class CacheManager {
         const totalAccesses = this.stats.hits + this.stats.misses;
         if (totalAccesses > 100) {
             const hitRatio = this.stats.hits / totalAccesses;
+            // Base adjustment factors
+            let increaseFactorForLowHitRatio = isRemoteStorage ? 1.5 : 1.2;
+            let decreaseFactorForHighHitRatio = 0.8;
+            // In read-only mode, be more aggressive with batch size adjustments
+            if (isReadOnly) {
+                increaseFactorForLowHitRatio = isRemoteStorage ? 2.0 : 1.5;
+                decreaseFactorForHighHitRatio = 0.9; // Less reduction in read-only mode
+            }
             // If hit ratio is high, we can use smaller batches
-
-            if (hitRatio > 0.8) {
+            if (hitRatio > 0.8 && !isVeryLargeDataset) {
                 // High hit ratio, decrease batch size slightly
-
+                // But don't decrease too much for large datasets or remote storage
+                if (!(isLargeDataset && isRemoteStorage)) {
+                    batchSize = Math.max(isReadOnly ? 10 : 5, Math.floor(batchSize * decreaseFactorForHighHitRatio));
+                }
             }
+            // If hit ratio is low, we need larger batches
             else if (hitRatio < 0.5) {
                 // Low hit ratio, increase batch size
-
-
-
+                const maxBatchSize = isRemoteStorage ?
+                    (isVeryLargeDataset ? 300 : 200) :
+                    (isVeryLargeDataset ? 150 : 100);
+                batchSize = Math.min(maxBatchSize, Math.ceil(batchSize * increaseFactorForLowHitRatio));
+            }
+        }
+        // Set minimum batch sizes based on storage type and mode
+        let minBatchSize = 5;
+        if (isRemoteStorage) {
+            minBatchSize = isReadOnly ? 20 : 10;
+        }
+        else if (isReadOnly) {
+            minBatchSize = 10;
+        }
+        // Ensure batch size is within reasonable limits
+        batchSize = Math.max(minBatchSize, batchSize);
+        // Cap maximum batch size based on environment and storage
+        const maxBatchSize = isRemoteStorage ?
+            (this.environment === Environment$1.NODE ? 300 : 150) :
+            (this.environment === Environment$1.NODE ? 150 : 75);
+        batchSize = Math.min(maxBatchSize, batchSize);
         // Update the batch size
         this.batchSize = batchSize;
     }
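A worked pass through the batch sizing under assumed conditions (Node.js, S3 cold storage, read-only, 2 million items, a search-heavy operation history, and too few cache accesses for the hit-ratio branch to fire):

```js
// Worked example with assumed inputs; mirrors tuneBatchSize() above.
let batchSize = 50                          // Node + remote + read-only base
batchSize = Math.min(100, batchSize * 1.5)  // large dataset: 75
batchSize = Math.min(200, batchSize * 2)    // very large dataset: 150
batchSize = Math.min(200, Math.ceil(batchSize * 1.8)) // search-heavy remote: 200
batchSize = Math.max(20, batchSize)         // remote read-only floor: 200
batchSize = Math.min(300, batchSize)        // remote Node ceiling: settles at 200
```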
@@ -11493,7 +12079,8 @@ async function createStorage(options = {}) {
             secretAccessKey: options.s3Storage.secretAccessKey,
             sessionToken: options.s3Storage.sessionToken,
             serviceType: 's3',
-            operationConfig: options.operationConfig
+            operationConfig: options.operationConfig,
+            cacheConfig: options.cacheConfig
         });
     }
     else {
@@ -11508,7 +12095,8 @@ async function createStorage(options = {}) {
             accountId: options.r2Storage.accountId,
             accessKeyId: options.r2Storage.accessKeyId,
             secretAccessKey: options.r2Storage.secretAccessKey,
-            serviceType: 'r2'
+            serviceType: 'r2',
+            cacheConfig: options.cacheConfig
         });
     }
     else {
@@ -11524,7 +12112,8 @@ async function createStorage(options = {}) {
             endpoint: options.gcsStorage.endpoint || 'https://storage.googleapis.com',
             accessKeyId: options.gcsStorage.accessKeyId,
             secretAccessKey: options.gcsStorage.secretAccessKey,
-            serviceType: 'gcs'
+            serviceType: 'gcs',
+            cacheConfig: options.cacheConfig
         });
     }
     else {
@@ -11545,7 +12134,8 @@ async function createStorage(options = {}) {
             endpoint: options.customS3Storage.endpoint,
             accessKeyId: options.customS3Storage.accessKeyId,
             secretAccessKey: options.customS3Storage.secretAccessKey,
-            serviceType: options.customS3Storage.serviceType || 'custom'
+            serviceType: options.customS3Storage.serviceType || 'custom',
+            cacheConfig: options.cacheConfig
         });
     }
     // If R2 storage is specified, use it
@@ -11556,7 +12146,8 @@ async function createStorage(options = {}) {
             accountId: options.r2Storage.accountId,
             accessKeyId: options.r2Storage.accessKeyId,
             secretAccessKey: options.r2Storage.secretAccessKey,
-            serviceType: 'r2'
+            serviceType: 'r2',
+            cacheConfig: options.cacheConfig
         });
     }
     // If S3 storage is specified, use it
@@ -11568,7 +12159,8 @@ async function createStorage(options = {}) {
             accessKeyId: options.s3Storage.accessKeyId,
             secretAccessKey: options.s3Storage.secretAccessKey,
             sessionToken: options.s3Storage.sessionToken,
-            serviceType: 's3'
+            serviceType: 's3',
+            cacheConfig: options.cacheConfig
        });
     }
     // If GCS storage is specified, use it
@@ -11580,7 +12172,8 @@ async function createStorage(options = {}) {
             endpoint: options.gcsStorage.endpoint || 'https://storage.googleapis.com',
             accessKeyId: options.gcsStorage.accessKeyId,
             secretAccessKey: options.gcsStorage.secretAccessKey,
-            serviceType: 'gcs'
+            serviceType: 'gcs',
+            cacheConfig: options.cacheConfig
         });
     }
     // Auto-detect the best storage adapter based on the environment
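All of these hunks make the same change: `cacheConfig` now flows from the `createStorage()` options into every S3-compatible adapter (S3, R2, GCS, custom). A hedged sketch of what that wiring enables; the `bucketName` and `region` field names are assumed here, since the diff only shows the credential fields:

```js
// Sketch under assumptions: bucketName/region are illustrative option names.
const storage = await createStorage({
  s3Storage: {
    bucketName: 'my-brainy-data',                       // assumed
    region: 'us-east-1',                                // assumed
    accessKeyId: process.env.AWS_ACCESS_KEY_ID,
    secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY
  },
  // Forwarded verbatim to the adapter, per the hunks above
  cacheConfig: { autoTune: true, readOnly: true }
})
```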
@@ -14138,6 +14731,27 @@ class BrainyData {
                 ...config.realtimeUpdates
             };
         }
+        // Initialize cache configuration with intelligent defaults
+        // These defaults are automatically tuned based on environment and dataset size
+        this.cacheConfig = {
+            // Enable auto-tuning by default for optimal performance
+            autoTune: true,
+            // Set auto-tune interval to 1 minute for faster initial optimization
+            // This is especially important for large datasets
+            autoTuneInterval: 60000, // 1 minute
+            // Read-only mode specific optimizations
+            readOnlyMode: {
+                // Use aggressive prefetching in read-only mode for better performance
+                prefetchStrategy: 'aggressive'
+            }
+        };
+        // Override defaults with user-provided configuration if available
+        if (config.cache) {
+            this.cacheConfig = {
+                ...this.cacheConfig,
+                ...config.cache
+            };
+        }
     }
     /**
      * Check if the database is in read-only mode and throw an error if it is
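Because the user's `config.cache` is shallow-merged over the defaults, any key can be overridden at construction time. A minimal sketch, using only the keys that appear in the defaults above:

```js
// Sketch: overriding the new cache defaults. Keys not listed in the
// defaults would simply pass through the spread untouched.
const db = new BrainyData({
  cache: {
    autoTuneInterval: 5 * 60 * 1000, // tune every 5 minutes instead of 1
    readOnlyMode: { prefetchStrategy: 'aggressive' }
  }
})
```

Note that the merge is shallow, so supplying `readOnlyMode` replaces the whole nested object rather than merging into it.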
@@ -14430,6 +15044,18 @@ class BrainyData {
             return 'default';
         }
     }
+    /**
+     * Get the service name from options or fallback to current augmentation
+     * This provides a consistent way to handle service names across all methods
+     * @param options Options object that may contain a service property
+     * @returns The service name to use for operations
+     */
+    getServiceName(options) {
+        if (options?.service) {
+            return options.service;
+        }
+        return this.getCurrentAugmentation();
+    }
     /**
      * Initialize the database
      * Loads existing data from storage if available
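This helper centralizes the attribution logic that the later hunks switch over to: an explicit `options.service` wins, otherwise the operation is charged to whatever augmentation is currently running. A two-line sketch of the fallback order:

```js
// Sketch of the fallback order implemented by getServiceName() above.
db.getServiceName({ service: 'csv-import' }) // 'csv-import' (explicit wins)
db.getServiceName({})                        // falls back to getCurrentAugmentation()
```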
@@ -14482,6 +15108,14 @@ class BrainyData {
             ...this.storageConfig,
             requestPersistentStorage: this.requestPersistentStorage
         };
+        // Add cache configuration if provided
+        if (this.cacheConfig) {
+            storageOptions.cacheConfig = {
+                ...this.cacheConfig,
+                // Pass read-only flag to optimize cache behavior
+                readOnly: this.readOnly
+            };
+        }
         // Ensure s3Storage has all required fields if it's provided
         if (storageOptions.s3Storage) {
             // Only include s3Storage if all required fields are present
@@ -14619,7 +15253,33 @@ class BrainyData {
         else {
             // Input needs to be vectorized
             try {
-
+                // Check if input is a JSON object and process it specially
+                if (typeof vectorOrData === 'object' &&
+                    vectorOrData !== null &&
+                    !Array.isArray(vectorOrData)) {
+                    // Process JSON object for better vectorization
+                    const preparedText = prepareJsonForVectorization(vectorOrData, {
+                        // Prioritize common name/title fields if they exist
+                        priorityFields: [
+                            'name',
+                            'title',
+                            'company',
+                            'organization',
+                            'description',
+                            'summary'
+                        ]
+                    });
+                    vector = await this.embeddingFunction(preparedText);
+                    // Track field names for this JSON document
+                    const service = this.getServiceName(options);
+                    if (this.storage) {
+                        await this.storage.trackFieldNames(vectorOrData, service);
+                    }
+                }
+                else {
+                    // Use standard embedding for non-JSON data
+                    vector = await this.embeddingFunction(vectorOrData);
+                }
             }
             catch (embedError) {
                 throw new Error(`Failed to vectorize data: ${embedError}`);
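With this hunk, inserting a plain object both shapes the embedded text and registers the object's field names. A hedged sketch, assuming the `(data, metadata, options)` argument ordering visible in the surrounding code:

```js
// Sketch: the metadata shape and argument ordering are assumptions
// based on the variable names in the diff.
await db.add(
  { name: 'Acme Corp', description: 'Industrial widgets', founded: 1947 },
  { industry: 'manufacturing' },
  { service: 'crm-import' }
)
// Embedded text ~ "name: Acme Corp description: Industrial widgets
// name: Acme Corp description: Industrial widgets founded: 1947"
// (name/description are priority fields, so both are repeated for emphasis),
// and 'name', 'description', 'founded' are recorded under 'crm-import'.
```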
@@ -14648,7 +15308,7 @@ class BrainyData {
         // Save noun to storage
         await this.storage.saveNoun(noun);
         // Track noun statistics
-        const service =
+        const service = this.getServiceName(options);
         await this.storage.incrementStatistic('noun', service);
         // Save metadata if provided and not empty
         if (metadata !== undefined) {
@@ -14701,7 +15361,7 @@ class BrainyData {
         }
         await this.storage.saveMetadata(id, metadataToSave);
         // Track metadata statistics
-        const metadataService =
+        const metadataService = this.getServiceName(options);
         await this.storage.incrementStatistic('metadata', metadataService);
     }
 }
@@ -15122,12 +15782,43 @@ class BrainyData {
         }
         // Check if database is in write-only mode
         this.checkWriteOnly();
-        //
+        // Process the query input for vectorization
         let queryToUse = queryVectorOrData;
+        // Handle string queries
         if (typeof queryVectorOrData === 'string' && !options.forceEmbed) {
             queryToUse = await this.embed(queryVectorOrData);
             options.forceEmbed = false; // Already embedded, don't force again
         }
+        // Handle JSON object queries with special processing
+        else if (typeof queryVectorOrData === 'object' &&
+            queryVectorOrData !== null &&
+            !Array.isArray(queryVectorOrData) &&
+            !options.forceEmbed) {
+            // If searching within a specific field
+            if (options.searchField) {
+                // Extract text from the specific field
+                const fieldText = extractFieldFromJson(queryVectorOrData, options.searchField);
+                if (fieldText) {
+                    queryToUse = await this.embeddingFunction(fieldText);
+                    options.forceEmbed = false; // Already embedded, don't force again
+                }
+            }
+            // Otherwise process the entire object with priority fields
+            else {
+                const preparedText = prepareJsonForVectorization(queryVectorOrData, {
+                    priorityFields: options.priorityFields || [
+                        'name',
+                        'title',
+                        'company',
+                        'organization',
+                        'description',
+                        'summary'
+                    ]
+                });
+                queryToUse = await this.embeddingFunction(preparedText);
+                options.forceEmbed = false; // Already embedded, don't force again
+            }
+        }
+        // If noun types are specified, use searchByNounTypes
         let searchResults;
         if (options.nounTypes && options.nounTypes.length > 0) {
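JSON query objects now mirror the insert-time handling: either a single field is embedded via `options.searchField`, or the whole object is flattened with priority-field weighting. A minimal sketch of both modes, with the `(query, k, options)` signature taken from the `searchByStandardField` call site later in this diff:

```js
// Sketch: both calls assume db.search(query, k, options).
const byCity = await db.search(
  { customer: { address: { city: 'Berlin' } } },
  10,
  { searchField: 'customer.address.city' } // embeds just "Berlin"
)

const byWhole = await db.search(
  { name: 'Acme', summary: 'widgets' },
  10,
  { priorityFields: ['name'] }             // "name: Acme" weighted double
)
```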
@@ -15424,7 +16115,7 @@ class BrainyData {
         // Remove from storage
         await this.storage.deleteNoun(actualId);
         // Track deletion statistics
-        const service = options
+        const service = this.getServiceName(options);
         await this.storage.decrementStatistic('noun', service);
         // Try to remove metadata (ignore errors)
         try {
@@ -15751,7 +16442,7 @@ class BrainyData {
         // Save verb to storage
         await this.storage.saveVerb(verb);
         // Track verb statistics
-        const serviceForStats = options
+        const serviceForStats = this.getServiceName(options);
         await this.storage.incrementStatistic('verb', serviceForStats);
         // Update HNSW index size (excluding verbs)
         await this.storage.updateHnswIndexSize(await this.getNounCount());
@@ -15899,7 +16590,7 @@ class BrainyData {
         // Remove from storage
         await this.storage.deleteVerb(id);
         // Track deletion statistics
-        const service = options
+        const service = this.getServiceName(options);
         await this.storage.decrementStatistic('verb', service);
         return true;
     }
@@ -16986,6 +17677,82 @@ class BrainyData {
             throw new Error(`Failed to generate random graph: ${error}`);
         }
     }
+    /**
+     * Get available field names by service
+     * This helps users understand what fields are available for searching from different data sources
+     * @returns Record of field names by service
+     */
+    async getAvailableFieldNames() {
+        await this.ensureInitialized();
+        if (!this.storage) {
+            return {};
+        }
+        return this.storage.getAvailableFieldNames();
+    }
+    /**
+     * Get standard field mappings
+     * This helps users understand how fields from different services map to standard field names
+     * @returns Record of standard field mappings
+     */
+    async getStandardFieldMappings() {
+        await this.ensureInitialized();
+        if (!this.storage) {
+            return {};
+        }
+        return this.storage.getStandardFieldMappings();
+    }
+    /**
+     * Search using a standard field name
+     * This allows searching across multiple services using a standardized field name
+     * @param standardField The standard field name to search in
+     * @param searchTerm The term to search for
+     * @param k Number of results to return
+     * @param options Additional search options
+     * @returns Array of search results
+     */
+    async searchByStandardField(standardField, searchTerm, k = 10, options = {}) {
+        await this.ensureInitialized();
+        // Check if database is in write-only mode
+        this.checkWriteOnly();
+        // Get standard field mappings
+        const standardFieldMappings = await this.getStandardFieldMappings();
+        // If the standard field doesn't exist, return empty results
+        if (!standardFieldMappings[standardField]) {
+            return [];
+        }
+        // Filter by services if specified
+        let serviceFieldMappings = standardFieldMappings[standardField];
+        if (options.services && options.services.length > 0) {
+            const filteredMappings = {};
+            for (const service of options.services) {
+                if (serviceFieldMappings[service]) {
+                    filteredMappings[service] = serviceFieldMappings[service];
+                }
+            }
+            serviceFieldMappings = filteredMappings;
+        }
+        // If no mappings after filtering, return empty results
+        if (Object.keys(serviceFieldMappings).length === 0) {
+            return [];
+        }
+        // Search in each service's fields and combine results
+        const allResults = [];
+        for (const [service, fieldNames] of Object.entries(serviceFieldMappings)) {
+            for (const fieldName of fieldNames) {
+                // Search using the specific field name for this service
+                const results = await this.search(searchTerm, k, {
+                    searchField: fieldName,
+                    service,
+                    includeVerbs: options.includeVerbs,
+                    searchMode: options.searchMode
+                });
+                // Add results to the combined list
+                allResults.push(...results);
+            }
+        }
+        // Sort by score and limit to k results
+        return allResults.sort((a, b) => b.score - a.score).slice(0, k);
+    }
 }
 
 /**