@soulcraft/brainy 3.37.8 → 3.39.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/import/ImportCoordinator.d.ts +4 -0
- package/dist/import/ImportCoordinator.js +9 -2
- package/dist/importers/SmartExcelImporter.d.ts +7 -1
- package/dist/importers/SmartExcelImporter.js +123 -96
- package/dist/neural/entityExtractor.d.ts +29 -1
- package/dist/neural/entityExtractor.js +69 -4
- package/dist/storage/adapters/gcsStorage.js +11 -80
- package/dist/storage/adapters/s3CompatibleStorage.js +13 -62
- package/package.json +1 -1
|
@@ -62,6 +62,10 @@ export interface ImportProgress {
|
|
|
62
62
|
total?: number;
|
|
63
63
|
entities?: number;
|
|
64
64
|
relationships?: number;
|
|
65
|
+
/** Rows per second (v3.38.0) */
|
|
66
|
+
throughput?: number;
|
|
67
|
+
/** Estimated time remaining in ms (v3.38.0) */
|
|
68
|
+
eta?: number;
|
|
65
69
|
}
|
|
66
70
|
export interface ImportResult {
|
|
67
71
|
/** Import ID for history tracking */
|
|
@@ -237,13 +237,20 @@ export class ImportCoordinator {
|
|
|
237
237
|
enableConceptExtraction: options.enableConceptExtraction !== false,
|
|
238
238
|
confidenceThreshold: options.confidenceThreshold || 0.6,
|
|
239
239
|
onProgress: (stats) => {
|
|
240
|
+
// Enhanced progress reporting (v3.38.0) with throughput and ETA
|
|
241
|
+
const message = stats.throughput
|
|
242
|
+
? `Extracting entities from ${format} (${stats.throughput} rows/sec, ETA: ${Math.round(stats.eta / 1000)}s)...`
|
|
243
|
+
: `Extracting entities from ${format}...`;
|
|
240
244
|
options.onProgress?.({
|
|
241
245
|
stage: 'extracting',
|
|
242
|
-
message
|
|
246
|
+
message,
|
|
243
247
|
processed: stats.processed,
|
|
244
248
|
total: stats.total,
|
|
245
249
|
entities: stats.entities,
|
|
246
|
-
relationships: stats.relationships
|
|
250
|
+
relationships: stats.relationships,
|
|
251
|
+
// Pass through enhanced metrics if available
|
|
252
|
+
throughput: stats.throughput,
|
|
253
|
+
eta: stats.eta
|
|
247
254
|
});
|
|
248
255
|
}
|
|
249
256
|
};
|
|
@@ -25,12 +25,18 @@ export interface SmartExcelOptions extends FormatHandlerOptions {
|
|
|
25
25
|
definitionColumn?: string;
|
|
26
26
|
typeColumn?: string;
|
|
27
27
|
relatedColumn?: string;
|
|
28
|
-
/** Progress callback */
|
|
28
|
+
/** Progress callback (v3.38.0: Enhanced with performance metrics) */
|
|
29
29
|
onProgress?: (stats: {
|
|
30
30
|
processed: number;
|
|
31
31
|
total: number;
|
|
32
32
|
entities: number;
|
|
33
33
|
relationships: number;
|
|
34
|
+
/** Rows per second (v3.38.0) */
|
|
35
|
+
throughput?: number;
|
|
36
|
+
/** Estimated time remaining in ms (v3.38.0) */
|
|
37
|
+
eta?: number;
|
|
38
|
+
/** Current phase (v3.38.0) */
|
|
39
|
+
phase?: string;
|
|
34
40
|
}) => void;
|
|
35
41
|
}
|
|
36
42
|
export interface ExtractedRow {
|
|
@@ -66,114 +66,141 @@ export class SmartExcelImporter {
|
|
|
66
66
|
}
|
|
67
67
|
// Detect column names
|
|
68
68
|
const columns = this.detectColumns(rows[0], opts);
|
|
69
|
-
// Process each row
|
|
69
|
+
// Process each row with BATCHED PARALLEL PROCESSING (v3.38.0)
|
|
70
70
|
const extractedRows = [];
|
|
71
71
|
const entityMap = new Map();
|
|
72
72
|
const stats = {
|
|
73
73
|
byType: {},
|
|
74
74
|
byConfidence: { high: 0, medium: 0, low: 0 }
|
|
75
75
|
};
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
const
|
|
83
|
-
//
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
//
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
}
|
|
76
|
+
// Batch processing configuration
|
|
77
|
+
const CHUNK_SIZE = 10; // Process 10 rows at a time for optimal performance
|
|
78
|
+
let totalProcessed = 0;
|
|
79
|
+
const performanceStartTime = Date.now();
|
|
80
|
+
// Process rows in chunks
|
|
81
|
+
for (let chunkStart = 0; chunkStart < rows.length; chunkStart += CHUNK_SIZE) {
|
|
82
|
+
const chunk = rows.slice(chunkStart, Math.min(chunkStart + CHUNK_SIZE, rows.length));
|
|
83
|
+
// Process chunk in parallel for massive speedup
|
|
84
|
+
const chunkResults = await Promise.all(chunk.map(async (row, chunkIndex) => {
|
|
85
|
+
const i = chunkStart + chunkIndex;
|
|
86
|
+
// Extract data from row
|
|
87
|
+
const term = this.getColumnValue(row, columns.term) || `Entity_${i}`;
|
|
88
|
+
const definition = this.getColumnValue(row, columns.definition) || '';
|
|
89
|
+
const type = this.getColumnValue(row, columns.type);
|
|
90
|
+
const relatedTerms = this.getColumnValue(row, columns.related);
|
|
91
|
+
// Parallel extraction: entities AND concepts at the same time
|
|
92
|
+
const [relatedEntities, concepts] = await Promise.all([
|
|
93
|
+
// Extract entities from definition
|
|
94
|
+
opts.enableNeuralExtraction && definition
|
|
95
|
+
? this.extractor.extract(definition, {
|
|
96
|
+
confidence: opts.confidenceThreshold * 0.8,
|
|
97
|
+
neuralMatching: true,
|
|
98
|
+
cache: { enabled: true }
|
|
99
|
+
}).then(entities =>
|
|
100
|
+
// Filter out the main term from related entities
|
|
101
|
+
entities.filter(e => e.text.toLowerCase() !== term.toLowerCase()))
|
|
102
|
+
: Promise.resolve([]),
|
|
103
|
+
// Extract concepts (in parallel with entity extraction)
|
|
104
|
+
opts.enableConceptExtraction && definition
|
|
105
|
+
? this.brain.extractConcepts(definition, { limit: 10 }).catch(() => [])
|
|
106
|
+
: Promise.resolve([])
|
|
107
|
+
]);
|
|
108
|
+
// Determine main entity type
|
|
109
|
+
const mainEntityType = type ?
|
|
110
|
+
this.mapTypeString(type) :
|
|
111
|
+
(relatedEntities.length > 0 ? relatedEntities[0].type : NounType.Thing);
|
|
112
|
+
// Generate entity ID
|
|
113
|
+
const entityId = this.generateEntityId(term);
|
|
114
|
+
// Create main entity
|
|
115
|
+
const mainEntity = {
|
|
116
|
+
id: entityId,
|
|
117
|
+
name: term,
|
|
118
|
+
type: mainEntityType,
|
|
119
|
+
description: definition,
|
|
120
|
+
confidence: 0.95,
|
|
121
|
+
metadata: {
|
|
122
|
+
source: 'excel',
|
|
123
|
+
row: i + 1,
|
|
124
|
+
originalData: row,
|
|
125
|
+
concepts,
|
|
126
|
+
extractedAt: Date.now()
|
|
127
|
+
}
|
|
128
|
+
};
|
|
129
|
+
// Infer relationships
|
|
130
|
+
const relationships = [];
|
|
131
|
+
if (opts.enableRelationshipInference) {
|
|
132
|
+
// Extract relationships from definition text
|
|
133
|
+
for (const relEntity of relatedEntities) {
|
|
134
|
+
const verbType = await this.inferRelationship(term, relEntity.text, definition);
|
|
135
|
+
relationships.push({
|
|
136
|
+
from: entityId,
|
|
137
|
+
to: relEntity.text,
|
|
138
|
+
type: verbType,
|
|
139
|
+
confidence: relEntity.confidence,
|
|
140
|
+
evidence: `Extracted from: "${definition.substring(0, 100)}..."`
|
|
141
|
+
});
|
|
142
|
+
}
|
|
143
|
+
// Parse explicit "Related Terms" column
|
|
144
|
+
if (relatedTerms) {
|
|
145
|
+
const terms = relatedTerms.split(/[,;]/).map(t => t.trim()).filter(Boolean);
|
|
146
|
+
for (const relTerm of terms) {
|
|
147
|
+
if (relTerm.toLowerCase() !== term.toLowerCase()) {
|
|
148
|
+
relationships.push({
|
|
149
|
+
from: entityId,
|
|
150
|
+
to: relTerm,
|
|
151
|
+
type: VerbType.RelatedTo,
|
|
152
|
+
confidence: 0.9,
|
|
153
|
+
evidence: `Explicitly listed in "Related" column`
|
|
154
|
+
});
|
|
155
|
+
}
|
|
156
156
|
}
|
|
157
157
|
}
|
|
158
158
|
}
|
|
159
|
+
return {
|
|
160
|
+
term,
|
|
161
|
+
entityId,
|
|
162
|
+
mainEntity,
|
|
163
|
+
mainEntityType,
|
|
164
|
+
relatedEntities,
|
|
165
|
+
relationships,
|
|
166
|
+
concepts
|
|
167
|
+
};
|
|
168
|
+
}));
|
|
169
|
+
// Process chunk results sequentially to maintain order
|
|
170
|
+
for (const result of chunkResults) {
|
|
171
|
+
// Store entity ID mapping
|
|
172
|
+
entityMap.set(result.term.toLowerCase(), result.entityId);
|
|
173
|
+
// Track statistics
|
|
174
|
+
this.updateStats(stats, result.mainEntityType, result.mainEntity.confidence);
|
|
175
|
+
// Add extracted row
|
|
176
|
+
extractedRows.push({
|
|
177
|
+
entity: result.mainEntity,
|
|
178
|
+
relatedEntities: result.relatedEntities.map(e => ({
|
|
179
|
+
name: e.text,
|
|
180
|
+
type: e.type,
|
|
181
|
+
confidence: e.confidence
|
|
182
|
+
})),
|
|
183
|
+
relationships: result.relationships,
|
|
184
|
+
concepts: result.concepts
|
|
185
|
+
});
|
|
159
186
|
}
|
|
160
|
-
//
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
relationships,
|
|
169
|
-
concepts
|
|
170
|
-
});
|
|
171
|
-
// Report progress
|
|
187
|
+
// Update progress tracking
|
|
188
|
+
totalProcessed += chunk.length;
|
|
189
|
+
// Calculate performance metrics
|
|
190
|
+
const elapsed = Date.now() - performanceStartTime;
|
|
191
|
+
const rowsPerSecond = totalProcessed / (elapsed / 1000);
|
|
192
|
+
const remainingRows = rows.length - totalProcessed;
|
|
193
|
+
const estimatedTimeRemaining = remainingRows / rowsPerSecond;
|
|
194
|
+
// Report progress with enhanced metrics
|
|
172
195
|
opts.onProgress({
|
|
173
|
-
processed:
|
|
196
|
+
processed: totalProcessed,
|
|
174
197
|
total: rows.length,
|
|
175
|
-
entities: extractedRows.
|
|
176
|
-
relationships: relationships.length
|
|
198
|
+
entities: extractedRows.reduce((sum, row) => sum + 1 + row.relatedEntities.length, 0),
|
|
199
|
+
relationships: extractedRows.reduce((sum, row) => sum + row.relationships.length, 0),
|
|
200
|
+
// Additional performance metrics (v3.38.0)
|
|
201
|
+
throughput: Math.round(rowsPerSecond * 10) / 10,
|
|
202
|
+
eta: Math.round(estimatedTimeRemaining),
|
|
203
|
+
phase: 'extracting'
|
|
177
204
|
});
|
|
178
205
|
}
|
|
179
206
|
return {
|
|
@@ -24,6 +24,8 @@ export declare class NeuralEntityExtractor {
|
|
|
24
24
|
private typeEmbeddings;
|
|
25
25
|
private initialized;
|
|
26
26
|
private cache;
|
|
27
|
+
private embeddingCache;
|
|
28
|
+
private embeddingCacheStats;
|
|
27
29
|
constructor(brain: Brainy | Brainy<any>, cacheOptions?: EntityCacheOptions);
|
|
28
30
|
/**
|
|
29
31
|
* Initialize type embeddings for neural matching
|
|
@@ -61,7 +63,10 @@ export declare class NeuralEntityExtractor {
|
|
|
61
63
|
*/
|
|
62
64
|
private classifyByRules;
|
|
63
65
|
/**
|
|
64
|
-
* Get embedding for text
|
|
66
|
+
* Get embedding for text with caching (v3.38.0)
|
|
67
|
+
*
|
|
68
|
+
* PERFORMANCE OPTIMIZATION: Caches embeddings during extraction session
|
|
69
|
+
* to avoid redundant model calls for repeated text (common in large imports)
|
|
65
70
|
*/
|
|
66
71
|
private getEmbedding;
|
|
67
72
|
/**
|
|
@@ -96,4 +101,27 @@ export declare class NeuralEntityExtractor {
|
|
|
96
101
|
* Cleanup expired cache entries
|
|
97
102
|
*/
|
|
98
103
|
cleanupCache(): number;
|
|
104
|
+
/**
|
|
105
|
+
* Clear embedding cache (v3.38.0)
|
|
106
|
+
*
|
|
107
|
+
* Clears the runtime embedding cache. Useful for:
|
|
108
|
+
* - Freeing memory after large imports
|
|
109
|
+
* - Testing with fresh cache state
|
|
110
|
+
*/
|
|
111
|
+
clearEmbeddingCache(): void;
|
|
112
|
+
/**
|
|
113
|
+
* Get embedding cache statistics (v3.38.0)
|
|
114
|
+
*
|
|
115
|
+
* Returns performance metrics for the embedding cache:
|
|
116
|
+
* - hits: Number of cache hits (avoided model calls)
|
|
117
|
+
* - misses: Number of cache misses (required model calls)
|
|
118
|
+
* - size: Current cache size
|
|
119
|
+
* - hitRate: Percentage of requests served from cache
|
|
120
|
+
*/
|
|
121
|
+
getEmbeddingCacheStats(): {
|
|
122
|
+
hitRate: number;
|
|
123
|
+
hits: number;
|
|
124
|
+
misses: number;
|
|
125
|
+
size: number;
|
|
126
|
+
};
|
|
99
127
|
}
|
|
@@ -12,6 +12,14 @@ export class NeuralEntityExtractor {
|
|
|
12
12
|
// Type embeddings for similarity matching
|
|
13
13
|
this.typeEmbeddings = new Map();
|
|
14
14
|
this.initialized = false;
|
|
15
|
+
// Runtime embedding cache for performance (v3.38.0)
|
|
16
|
+
// Caches candidate embeddings during an extraction session to avoid redundant model calls
|
|
17
|
+
this.embeddingCache = new Map();
|
|
18
|
+
this.embeddingCacheStats = {
|
|
19
|
+
hits: 0,
|
|
20
|
+
misses: 0,
|
|
21
|
+
size: 0
|
|
22
|
+
};
|
|
15
23
|
this.brain = brain;
|
|
16
24
|
this.cache = new EntityExtractionCache(cacheOptions);
|
|
17
25
|
}
|
|
@@ -253,20 +261,46 @@ export class NeuralEntityExtractor {
|
|
|
253
261
|
return { type: NounType.Thing, confidence: 0.3 };
|
|
254
262
|
}
|
|
255
263
|
/**
|
|
256
|
-
* Get embedding for text
|
|
264
|
+
* Get embedding for text with caching (v3.38.0)
|
|
265
|
+
*
|
|
266
|
+
* PERFORMANCE OPTIMIZATION: Caches embeddings during extraction session
|
|
267
|
+
* to avoid redundant model calls for repeated text (common in large imports)
|
|
257
268
|
*/
|
|
258
269
|
async getEmbedding(text) {
|
|
270
|
+
// Normalize text for cache key
|
|
271
|
+
const normalizedText = text.trim().toLowerCase();
|
|
272
|
+
// Check cache first
|
|
273
|
+
const cached = this.embeddingCache.get(normalizedText);
|
|
274
|
+
if (cached) {
|
|
275
|
+
this.embeddingCacheStats.hits++;
|
|
276
|
+
return cached;
|
|
277
|
+
}
|
|
278
|
+
// Cache miss - generate embedding
|
|
279
|
+
this.embeddingCacheStats.misses++;
|
|
280
|
+
let vector;
|
|
259
281
|
if ('embed' in this.brain && typeof this.brain.embed === 'function') {
|
|
260
|
-
|
|
282
|
+
vector = await this.brain.embed(text);
|
|
261
283
|
}
|
|
262
284
|
else {
|
|
263
285
|
// Fallback - create simple hash-based vector
|
|
264
|
-
|
|
286
|
+
vector = new Array(384).fill(0);
|
|
265
287
|
for (let i = 0; i < text.length; i++) {
|
|
266
288
|
vector[i % 384] += text.charCodeAt(i) / 255;
|
|
267
289
|
}
|
|
268
|
-
|
|
290
|
+
vector = vector.map(v => v / text.length);
|
|
291
|
+
}
|
|
292
|
+
// Store in cache
|
|
293
|
+
this.embeddingCache.set(normalizedText, vector);
|
|
294
|
+
this.embeddingCacheStats.size = this.embeddingCache.size;
|
|
295
|
+
// Memory management: Clear cache if it grows too large (>10000 entries)
|
|
296
|
+
if (this.embeddingCache.size > 10000) {
|
|
297
|
+
// Keep most recent 5000 entries (simple LRU approximation)
|
|
298
|
+
const entries = Array.from(this.embeddingCache.entries());
|
|
299
|
+
this.embeddingCache.clear();
|
|
300
|
+
entries.slice(-5000).forEach(([k, v]) => this.embeddingCache.set(k, v));
|
|
301
|
+
this.embeddingCacheStats.size = this.embeddingCache.size;
|
|
269
302
|
}
|
|
303
|
+
return vector;
|
|
270
304
|
}
|
|
271
305
|
/**
|
|
272
306
|
* Calculate cosine similarity between vectors
|
|
@@ -355,5 +389,36 @@ export class NeuralEntityExtractor {
|
|
|
355
389
|
cleanupCache() {
|
|
356
390
|
return this.cache.cleanup();
|
|
357
391
|
}
|
|
392
|
+
/**
|
|
393
|
+
* Clear embedding cache (v3.38.0)
|
|
394
|
+
*
|
|
395
|
+
* Clears the runtime embedding cache. Useful for:
|
|
396
|
+
* - Freeing memory after large imports
|
|
397
|
+
* - Testing with fresh cache state
|
|
398
|
+
*/
|
|
399
|
+
clearEmbeddingCache() {
|
|
400
|
+
this.embeddingCache.clear();
|
|
401
|
+
this.embeddingCacheStats = {
|
|
402
|
+
hits: 0,
|
|
403
|
+
misses: 0,
|
|
404
|
+
size: 0
|
|
405
|
+
};
|
|
406
|
+
}
|
|
407
|
+
/**
|
|
408
|
+
* Get embedding cache statistics (v3.38.0)
|
|
409
|
+
*
|
|
410
|
+
* Returns performance metrics for the embedding cache:
|
|
411
|
+
* - hits: Number of cache hits (avoided model calls)
|
|
412
|
+
* - misses: Number of cache misses (required model calls)
|
|
413
|
+
* - size: Current cache size
|
|
414
|
+
* - hitRate: Percentage of requests served from cache
|
|
415
|
+
*/
|
|
416
|
+
getEmbeddingCacheStats() {
|
|
417
|
+
const total = this.embeddingCacheStats.hits + this.embeddingCacheStats.misses;
|
|
418
|
+
return {
|
|
419
|
+
...this.embeddingCacheStats,
|
|
420
|
+
hitRate: total > 0 ? this.embeddingCacheStats.hits / total : 0
|
|
421
|
+
};
|
|
422
|
+
}
|
|
358
423
|
}
|
|
359
424
|
//# sourceMappingURL=entityExtractor.js.map
|
|
@@ -347,9 +347,7 @@ export class GcsStorage extends BaseStorage {
|
|
|
347
347
|
if (node.vector && Array.isArray(node.vector) && node.vector.length > 0) {
|
|
348
348
|
this.nounCacheManager.set(node.id, node);
|
|
349
349
|
}
|
|
350
|
-
|
|
351
|
-
prodLog.warn(`[saveNode] Not caching node ${node.id.substring(0, 8)}... with empty vector (HNSW lazy mode)`);
|
|
352
|
-
}
|
|
350
|
+
// Note: Empty vectors are intentional during HNSW lazy mode - not logged
|
|
353
351
|
// Increment noun count
|
|
354
352
|
const metadata = await this.getNounMetadata(node.id);
|
|
355
353
|
if (metadata && metadata.type) {
|
|
@@ -392,53 +390,28 @@ export class GcsStorage extends BaseStorage {
|
|
|
392
390
|
*/
|
|
393
391
|
async getNode(id) {
|
|
394
392
|
await this.ensureInitialized();
|
|
395
|
-
// Check cache first
|
|
393
|
+
// Check cache first
|
|
396
394
|
const cached = await this.nounCacheManager.get(id);
|
|
397
|
-
//
|
|
398
|
-
prodLog.info(`[getNode] π Cache check for ${id.substring(0, 8)}...:`, {
|
|
399
|
-
hasCached: cached !== undefined,
|
|
400
|
-
isNull: cached === null,
|
|
401
|
-
isObject: cached !== null && typeof cached === 'object',
|
|
402
|
-
type: typeof cached
|
|
403
|
-
});
|
|
404
|
-
// CRITICAL FIX (v3.37.8): Validate cached object before returning
|
|
395
|
+
// Validate cached object before returning (v3.37.8+)
|
|
405
396
|
if (cached !== undefined && cached !== null) {
|
|
406
|
-
// Log cached object structure to diagnose incomplete objects
|
|
407
|
-
prodLog.info(`[getNode] Cached object structure:`, {
|
|
408
|
-
hasId: !!cached.id,
|
|
409
|
-
idMatches: cached.id === id,
|
|
410
|
-
hasVector: !!cached.vector,
|
|
411
|
-
vectorLength: cached.vector?.length,
|
|
412
|
-
hasConnections: !!cached.connections,
|
|
413
|
-
connectionsType: typeof cached.connections,
|
|
414
|
-
hasLevel: cached.level !== undefined,
|
|
415
|
-
level: cached.level,
|
|
416
|
-
objectKeys: Object.keys(cached || {})
|
|
417
|
-
});
|
|
418
397
|
// Validate cached object has required fields (including non-empty vector!)
|
|
419
398
|
if (!cached.id || !cached.vector || !Array.isArray(cached.vector) || cached.vector.length === 0) {
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
'unknown'
|
|
426
|
-
});
|
|
427
|
-
prodLog.error(`[getNode] Removing invalid object from cache and loading from GCS`);
|
|
399
|
+
// Invalid cache detected - log and auto-recover
|
|
400
|
+
prodLog.warn(`[GCS] Invalid cached object for ${id.substring(0, 8)} (${!cached.id ? 'missing id' :
|
|
401
|
+
!cached.vector ? 'missing vector' :
|
|
402
|
+
!Array.isArray(cached.vector) ? 'vector not array' :
|
|
403
|
+
'empty vector'}) - removing from cache and reloading`);
|
|
428
404
|
this.nounCacheManager.delete(id);
|
|
429
405
|
// Fall through to load from GCS
|
|
430
406
|
}
|
|
431
407
|
else {
|
|
432
|
-
|
|
408
|
+
// Valid cache hit
|
|
433
409
|
this.logger.trace(`Cache hit for noun ${id}`);
|
|
434
410
|
return cached;
|
|
435
411
|
}
|
|
436
412
|
}
|
|
437
413
|
else if (cached === null) {
|
|
438
|
-
prodLog.warn(`[
|
|
439
|
-
}
|
|
440
|
-
else {
|
|
441
|
-
prodLog.info(`[getNode] β Cache MISS - loading from GCS for ${id.substring(0, 8)}...`);
|
|
414
|
+
prodLog.warn(`[GCS] Cache contains null for ${id.substring(0, 8)} - reloading from storage`);
|
|
442
415
|
}
|
|
443
416
|
// Apply backpressure
|
|
444
417
|
const requestId = await this.applyBackpressure();
|
|
@@ -446,20 +419,11 @@ export class GcsStorage extends BaseStorage {
|
|
|
446
419
|
this.logger.trace(`Getting node ${id}`);
|
|
447
420
|
// Get the GCS key with UUID-based sharding
|
|
448
421
|
const key = this.getNounKey(id);
|
|
449
|
-
// DIAGNOSTIC LOGGING: Show exact path being accessed
|
|
450
|
-
prodLog.info(`[getNode] π Attempting to load:`);
|
|
451
|
-
prodLog.info(`[getNode] UUID: ${id}`);
|
|
452
|
-
prodLog.info(`[getNode] Path: ${key}`);
|
|
453
|
-
prodLog.info(`[getNode] Bucket: ${this.bucketName}`);
|
|
454
422
|
// Download from GCS
|
|
455
423
|
const file = this.bucket.file(key);
|
|
456
|
-
prodLog.info(`[getNode] 📥 Downloading file...`);
|
|
457
424
|
const [contents] = await file.download();
|
|
458
|
-
prodLog.info(`[getNode] ✅ Download successful: ${contents.length} bytes`);
|
|
459
425
|
// Parse JSON
|
|
460
|
-
prodLog.info(`[getNode] 🔧 Parsing JSON...`);
|
|
461
426
|
const data = JSON.parse(contents.toString());
|
|
462
|
-
prodLog.info(`[getNode] ✅ JSON parsed successfully, id: ${data.id}`);
|
|
463
427
|
// Convert serialized connections back to Map<number, Set<string>>
|
|
464
428
|
const connections = new Map();
|
|
465
429
|
for (const [level, nounIds] of Object.entries(data.connections || {})) {
|
|
@@ -477,10 +441,9 @@ export class GcsStorage extends BaseStorage {
|
|
|
477
441
|
// CRITICAL FIX: Only cache valid nodes with non-empty vectors (never cache null or empty)
|
|
478
442
|
if (node && node.id && node.vector && Array.isArray(node.vector) && node.vector.length > 0) {
|
|
479
443
|
this.nounCacheManager.set(id, node);
|
|
480
|
-
prodLog.info(`[getNode] 💾 Cached node ${id.substring(0, 8)}... successfully`);
|
|
481
444
|
}
|
|
482
445
|
else {
|
|
483
|
-
prodLog.warn(`[
|
|
446
|
+
prodLog.warn(`[GCS] Not caching invalid node ${id.substring(0, 8)} (missing id/vector or empty vector)`);
|
|
484
447
|
}
|
|
485
448
|
this.logger.trace(`Successfully retrieved node ${id}`);
|
|
486
449
|
this.releaseBackpressure(true, requestId);
|
|
@@ -868,13 +831,6 @@ export class GcsStorage extends BaseStorage {
|
|
|
868
831
|
await this.ensureInitialized(); // CRITICAL: Must initialize before using this.bucket
|
|
869
832
|
const limit = options.limit || 100;
|
|
870
833
|
const useCache = options.useCache !== false;
|
|
871
|
-
// DIAGNOSTIC LOGGING: Track pagination performance
|
|
872
|
-
prodLog.info(`[getNodesWithPagination] Starting pagination: limit=${limit}, cursor=${options.cursor || 'none'}`);
|
|
873
|
-
const startTime = Date.now();
|
|
874
|
-
let shardsChecked = 0;
|
|
875
|
-
let filesFound = 0;
|
|
876
|
-
let nodesLoaded = 0;
|
|
877
|
-
let nodesFailed = 0;
|
|
878
834
|
try {
|
|
879
835
|
const nodes = [];
|
|
880
836
|
// Parse cursor (format: "shardIndex:gcsPageToken")
|
|
@@ -889,7 +845,6 @@ export class GcsStorage extends BaseStorage {
|
|
|
889
845
|
for (let shardIndex = startShardIndex; shardIndex < TOTAL_SHARDS; shardIndex++) {
|
|
890
846
|
const shardId = getShardIdByIndex(shardIndex);
|
|
891
847
|
const shardPrefix = `${this.nounPrefix}${shardId}/`;
|
|
892
|
-
shardsChecked++;
|
|
893
848
|
// List objects in this shard
|
|
894
849
|
// Cap maxResults to GCS API limit to prevent "Invalid unsigned integer" errors
|
|
895
850
|
const requestedPageSize = limit - nodes.length;
|
|
@@ -899,12 +854,6 @@ export class GcsStorage extends BaseStorage {
|
|
|
899
854
|
maxResults: cappedPageSize,
|
|
900
855
|
pageToken: shardIndex === startShardIndex ? gcsPageToken : undefined
|
|
901
856
|
});
|
|
902
|
-
// DIAGNOSTIC LOGGING: Show files found per shard (only log non-empty shards)
|
|
903
|
-
if (files && files.length > 0) {
|
|
904
|
-
filesFound += files.length;
|
|
905
|
-
prodLog.info(`[Shard ${shardId}] Found ${files.length} files in "${shardPrefix}"`);
|
|
906
|
-
prodLog.info(`[Shard ${shardId}] Sample file names: ${files.slice(0, 3).map((f) => f.name).join(', ')}`);
|
|
907
|
-
}
|
|
908
857
|
// Extract node IDs from file names
|
|
909
858
|
if (files && files.length > 0) {
|
|
910
859
|
const nodeIds = files
|
|
@@ -921,21 +870,11 @@ export class GcsStorage extends BaseStorage {
|
|
|
921
870
|
return name;
|
|
922
871
|
})
|
|
923
872
|
.filter((id) => id && id.length > 0);
|
|
924
|
-
// DIAGNOSTIC LOGGING: Show extracted UUIDs
|
|
925
|
-
prodLog.info(`[Shard ${shardId}] Extracted ${nodeIds.length} UUIDs: ${nodeIds.slice(0, 3).join(', ')}...`);
|
|
926
873
|
// Load nodes
|
|
927
874
|
for (const id of nodeIds) {
|
|
928
|
-
// DIAGNOSTIC LOGGING: Show each getNode() attempt
|
|
929
|
-
prodLog.info(`[Shard ${shardId}] Calling getNode("${id}")...`);
|
|
930
875
|
const node = await this.getNode(id);
|
|
931
876
|
if (node) {
|
|
932
877
|
nodes.push(node);
|
|
933
|
-
nodesLoaded++;
|
|
934
|
-
prodLog.info(`[Shard ${shardId}] ✅ Successfully loaded node ${id}`);
|
|
935
|
-
}
|
|
936
|
-
else {
|
|
937
|
-
nodesFailed++;
|
|
938
|
-
prodLog.warn(`[Shard ${shardId}] β getNode("${id}") returned null!`);
|
|
939
878
|
}
|
|
940
879
|
if (nodes.length >= limit) {
|
|
941
880
|
break;
|
|
@@ -968,14 +907,6 @@ export class GcsStorage extends BaseStorage {
|
|
|
968
907
|
// Continue to next shard
|
|
969
908
|
}
|
|
970
909
|
// No more shards or nodes
|
|
971
|
-
// DIAGNOSTIC LOGGING: Final summary
|
|
972
|
-
const elapsedTime = Date.now() - startTime;
|
|
973
|
-
prodLog.info(`[getNodesWithPagination] COMPLETED in ${elapsedTime}ms:`);
|
|
974
|
-
prodLog.info(` - Shards checked: ${shardsChecked}/${TOTAL_SHARDS}`);
|
|
975
|
-
prodLog.info(` - Files found: ${filesFound}`);
|
|
976
|
-
prodLog.info(` - Nodes loaded: ${nodesLoaded}`);
|
|
977
|
-
prodLog.info(` - Nodes failed: ${nodesFailed}`);
|
|
978
|
-
prodLog.info(` - Success rate: ${filesFound > 0 ? ((nodesLoaded / filesFound) * 100).toFixed(1) : 'N/A'}%`);
|
|
979
910
|
return {
|
|
980
911
|
nodes,
|
|
981
912
|
totalCount: this.totalNounCount,
|
|
@@ -818,80 +818,47 @@ export class S3CompatibleStorage extends BaseStorage {
|
|
|
818
818
|
*/
|
|
819
819
|
async getNode(id) {
|
|
820
820
|
await this.ensureInitialized();
|
|
821
|
-
// Check cache first
|
|
821
|
+
// Check cache first
|
|
822
822
|
const cached = this.nodeCache.get(id);
|
|
823
|
-
//
|
|
824
|
-
prodLog.info(`[getNode] π Cache check for ${id.substring(0, 8)}...:`, {
|
|
825
|
-
hasCached: cached !== undefined,
|
|
826
|
-
isNull: cached === null,
|
|
827
|
-
isObject: cached !== null && typeof cached === 'object',
|
|
828
|
-
type: typeof cached
|
|
829
|
-
});
|
|
830
|
-
// CRITICAL FIX (v3.37.8): Validate cached object before returning
|
|
823
|
+
// Validate cached object before returning (v3.37.8+)
|
|
831
824
|
if (cached !== undefined && cached !== null) {
|
|
832
|
-
// Log cached object structure to diagnose incomplete objects
|
|
833
|
-
prodLog.info(`[getNode] Cached object structure:`, {
|
|
834
|
-
hasId: !!cached.id,
|
|
835
|
-
idMatches: cached.id === id,
|
|
836
|
-
hasVector: !!cached.vector,
|
|
837
|
-
vectorLength: cached.vector?.length,
|
|
838
|
-
hasConnections: !!cached.connections,
|
|
839
|
-
connectionsType: typeof cached.connections,
|
|
840
|
-
objectKeys: Object.keys(cached || {})
|
|
841
|
-
});
|
|
842
825
|
// Validate cached object has required fields (including non-empty vector!)
|
|
843
826
|
if (!cached.id || !cached.vector || !Array.isArray(cached.vector) || cached.vector.length === 0) {
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
'unknown'
|
|
850
|
-
});
|
|
851
|
-
prodLog.error(`[getNode] Removing invalid object from cache and loading from S3`);
|
|
827
|
+
// Invalid cache detected - log and auto-recover
|
|
828
|
+
prodLog.warn(`[S3] Invalid cached object for ${id.substring(0, 8)} (${!cached.id ? 'missing id' :
|
|
829
|
+
!cached.vector ? 'missing vector' :
|
|
830
|
+
!Array.isArray(cached.vector) ? 'vector not array' :
|
|
831
|
+
'empty vector'}) - removing from cache and reloading`);
|
|
852
832
|
this.nodeCache.delete(id);
|
|
853
833
|
// Fall through to load from S3
|
|
854
834
|
}
|
|
855
835
|
else {
|
|
856
|
-
|
|
836
|
+
// Valid cache hit
|
|
857
837
|
this.logger.trace(`Cache hit for node ${id}`);
|
|
858
838
|
return cached;
|
|
859
839
|
}
|
|
860
840
|
}
|
|
861
841
|
else if (cached === null) {
|
|
862
|
-
prodLog.warn(`[
|
|
863
|
-
}
|
|
864
|
-
else {
|
|
865
|
-
prodLog.info(`[getNode] β Cache MISS - loading from S3 for ${id.substring(0, 8)}...`);
|
|
842
|
+
prodLog.warn(`[S3] Cache contains null for ${id.substring(0, 8)} - reloading from storage`);
|
|
866
843
|
}
|
|
867
844
|
try {
|
|
868
845
|
// Import the GetObjectCommand only when needed
|
|
869
846
|
const { GetObjectCommand } = await import('@aws-sdk/client-s3');
|
|
870
847
|
// Use getNounKey() to properly handle sharding
|
|
871
848
|
const key = this.getNounKey(id);
|
|
872
|
-
// DIAGNOSTIC LOGGING: Show exact path being accessed
|
|
873
|
-
prodLog.info(`[getNode] π Attempting to load:`);
|
|
874
|
-
prodLog.info(`[getNode] UUID: ${id}`);
|
|
875
|
-
prodLog.info(`[getNode] Path: ${key}`);
|
|
876
|
-
prodLog.info(`[getNode] Bucket: ${this.bucketName}`);
|
|
877
849
|
// Try to get the node from the nouns directory
|
|
878
|
-
prodLog.info(`[getNode] 📥 Downloading file...`);
|
|
879
850
|
const response = await this.s3Client.send(new GetObjectCommand({
|
|
880
851
|
Bucket: this.bucketName,
|
|
881
852
|
Key: key
|
|
882
853
|
}));
|
|
883
854
|
// Check if response is null or undefined
|
|
884
855
|
if (!response || !response.Body) {
|
|
885
|
-
prodLog.warn(`[
|
|
856
|
+
prodLog.warn(`[S3] Response or Body is null/undefined for ${id.substring(0, 8)}`);
|
|
886
857
|
return null;
|
|
887
858
|
}
|
|
888
|
-
// Convert the response body to a string
|
|
859
|
+
// Convert the response body to a string and parse JSON
|
|
889
860
|
const bodyContents = await response.Body.transformToString();
|
|
890
|
-
prodLog.info(`[getNode] ✅ Download successful: ${bodyContents.length} bytes`);
|
|
891
|
-
// Parse the JSON string
|
|
892
|
-
prodLog.info(`[getNode] 🔧 Parsing JSON...`);
|
|
893
861
|
const parsedNode = JSON.parse(bodyContents);
|
|
894
|
-
prodLog.info(`[getNode] ✅ JSON parsed successfully, id: ${parsedNode.id}`);
|
|
895
862
|
// Ensure the parsed node has the expected properties
|
|
896
863
|
if (!parsedNode ||
|
|
897
864
|
!parsedNode.id ||
|
|
@@ -917,41 +884,25 @@ export class S3CompatibleStorage extends BaseStorage {
|
|
|
917
884
|
// CRITICAL FIX: Only cache valid nodes with non-empty vectors (never cache null or empty)
|
|
918
885
|
if (node && node.id && node.vector && Array.isArray(node.vector) && node.vector.length > 0) {
|
|
919
886
|
this.nodeCache.set(id, node);
|
|
920
|
-
prodLog.info(`[getNode] 💾 Cached node ${id.substring(0, 8)}... successfully`);
|
|
921
887
|
}
|
|
922
888
|
else {
|
|
923
|
-
prodLog.warn(`[
|
|
889
|
+
prodLog.warn(`[S3] Not caching invalid node ${id.substring(0, 8)} (missing id/vector or empty vector)`);
|
|
924
890
|
}
|
|
925
891
|
this.logger.trace(`Successfully retrieved node ${id}`);
|
|
926
892
|
return node;
|
|
927
893
|
}
|
|
928
894
|
catch (error) {
|
|
929
|
-
// DIAGNOSTIC LOGGING: Log EVERY error before any conditional checks
|
|
930
|
-
const key = this.getNounKey(id);
|
|
931
|
-
prodLog.error(`[getNode] β EXCEPTION CAUGHT:`);
|
|
932
|
-
prodLog.error(`[getNode] UUID: ${id}`);
|
|
933
|
-
prodLog.error(`[getNode] Path: ${key}`);
|
|
934
|
-
prodLog.error(`[getNode] Bucket: ${this.bucketName}`);
|
|
935
|
-
prodLog.error(`[getNode] Error type: ${error?.constructor?.name || typeof error}`);
|
|
936
|
-
prodLog.error(`[getNode] Error name: ${error?.name}`);
|
|
937
|
-
prodLog.error(`[getNode] Error code: ${JSON.stringify(error?.Code || error?.code)}`);
|
|
938
|
-
prodLog.error(`[getNode] Error message: ${error?.message || String(error)}`);
|
|
939
|
-
prodLog.error(`[getNode] HTTP status: ${error?.$metadata?.httpStatusCode}`);
|
|
940
|
-
prodLog.error(`[getNode] Error object:`, JSON.stringify(error, null, 2));
|
|
941
895
|
// Check if this is a "not found" error (S3 uses "NoSuchKey")
|
|
942
896
|
if (error?.name === 'NoSuchKey' || error?.Code === 'NoSuchKey' || error?.$metadata?.httpStatusCode === 404) {
|
|
943
|
-
|
|
944
|
-
// CRITICAL FIX: Do NOT cache null values
|
|
897
|
+
// File not found - not cached, just return null
|
|
945
898
|
return null;
|
|
946
899
|
}
|
|
947
900
|
// Handle throttling
|
|
948
901
|
if (this.isThrottlingError(error)) {
|
|
949
|
-
prodLog.warn(`[getNode] Identified as throttling error - rethrowing`);
|
|
950
902
|
await this.handleThrottling(error);
|
|
951
903
|
throw error;
|
|
952
904
|
}
|
|
953
905
|
// All other errors should throw, not return null
|
|
954
|
-
prodLog.error(`[getNode] Unhandled error - rethrowing`);
|
|
955
906
|
this.logger.error(`Failed to get node ${id}:`, error);
|
|
956
907
|
throw BrainyError.fromError(error, `getNoun(${id})`);
|
|
957
908
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@soulcraft/brainy",
|
|
3
|
-
"version": "3.
|
|
3
|
+
"version": "3.39.0",
|
|
4
4
|
"description": "Universal Knowledge Protocol™ - World's first Triple Intelligence database unifying vector, graph, and document search in one API. 31 nouns × 40 verbs for infinite expressiveness.",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"module": "dist/index.js",
|