@soulcraft/brainy 3.38.0 → 3.40.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,13 @@
2
2
 
3
3
  All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines.
4
4
 
5
+ ## [3.40.0](https://github.com/soulcraftlabs/brainy/compare/v3.39.0...v3.40.0) (2025-10-13)
6
+
7
+
8
+ ### ✨ Features
9
+
10
+ * extend batch processing and enhanced progress to CSV and PDF imports ([bb46da2](https://github.com/soulcraftlabs/brainy/commit/bb46da2ee7fc3cd0b5becc7e42afff7d7034ecfe))
11
+
5
12
  ### [3.37.3](https://github.com/soulcraftlabs/brainy/compare/v3.37.2...v3.37.3) (2025-10-10)
6
13
 
7
14
  - fix: populate totalNodes/totalEdges in ALL storage adapters for HNSW rebuild (a21a845)
@@ -62,6 +62,10 @@ export interface ImportProgress {
62
62
  total?: number;
63
63
  entities?: number;
64
64
  relationships?: number;
65
+ /** Rows per second (v3.38.0) */
66
+ throughput?: number;
67
+ /** Estimated time remaining in ms (v3.38.0) */
68
+ eta?: number;
65
69
  }
66
70
  export interface ImportResult {
67
71
  /** Import ID for history tracking */
@@ -237,13 +237,20 @@ export class ImportCoordinator {
237
237
  enableConceptExtraction: options.enableConceptExtraction !== false,
238
238
  confidenceThreshold: options.confidenceThreshold || 0.6,
239
239
  onProgress: (stats) => {
240
+ // Enhanced progress reporting (v3.38.0) with throughput and ETA
241
+ const message = stats.throughput
242
+ ? `Extracting entities from ${format} (${stats.throughput} rows/sec, ETA: ${Math.round(stats.eta / 1000)}s)...`
243
+ : `Extracting entities from ${format}...`;
240
244
  options.onProgress?.({
241
245
  stage: 'extracting',
242
- message: `Extracting entities from ${format}...`,
246
+ message,
243
247
  processed: stats.processed,
244
248
  total: stats.total,
245
249
  entities: stats.entities,
246
- relationships: stats.relationships
250
+ relationships: stats.relationships,
251
+ // Pass through enhanced metrics if available
252
+ throughput: stats.throughput,
253
+ eta: stats.eta
247
254
  });
248
255
  }
249
256
  };
@@ -30,12 +30,18 @@ export interface SmartCSVOptions extends FormatHandlerOptions {
30
30
  /** CSV-specific options */
31
31
  csvDelimiter?: string;
32
32
  csvHeaders?: boolean;
33
- /** Progress callback */
33
+ /** Progress callback (v3.39.0: Enhanced with performance metrics) */
34
34
  onProgress?: (stats: {
35
35
  processed: number;
36
36
  total: number;
37
37
  entities: number;
38
38
  relationships: number;
39
+ /** Rows per second (v3.39.0) */
40
+ throughput?: number;
41
+ /** Estimated time remaining in ms (v3.39.0) */
42
+ eta?: number;
43
+ /** Current phase (v3.39.0) */
44
+ phase?: string;
39
45
  }) => void;
40
46
  }
41
47
  export interface ExtractedRow {
@@ -62,113 +62,141 @@ export class SmartCSVImporter {
62
62
  }
63
63
  // Detect column names
64
64
  const columns = this.detectColumns(rows[0], opts);
65
- // Process each row
65
+ // Process each row with BATCHED PARALLEL PROCESSING (v3.39.0)
66
66
  const extractedRows = [];
67
67
  const entityMap = new Map();
68
68
  const stats = {
69
69
  byType: {},
70
70
  byConfidence: { high: 0, medium: 0, low: 0 }
71
71
  };
72
- for (let i = 0; i < rows.length; i++) {
73
- const row = rows[i];
74
- // Extract data from row
75
- const term = this.getColumnValue(row, columns.term) || `Entity_${i}`;
76
- const definition = this.getColumnValue(row, columns.definition) || '';
77
- const type = this.getColumnValue(row, columns.type);
78
- const relatedTerms = this.getColumnValue(row, columns.related);
79
- // Extract entities from definition
80
- let relatedEntities = [];
81
- if (opts.enableNeuralExtraction && definition) {
82
- relatedEntities = await this.extractor.extract(definition, {
83
- confidence: opts.confidenceThreshold * 0.8, // Lower threshold for related entities
84
- neuralMatching: true,
85
- cache: { enabled: true }
86
- });
87
- // Filter out the main term from related entities
88
- relatedEntities = relatedEntities.filter(e => e.text.toLowerCase() !== term.toLowerCase());
89
- }
90
- // Determine main entity type
91
- const mainEntityType = type ?
92
- this.mapTypeString(type) :
93
- (relatedEntities.length > 0 ? relatedEntities[0].type : NounType.Thing);
94
- // Generate entity ID
95
- const entityId = this.generateEntityId(term);
96
- entityMap.set(term.toLowerCase(), entityId);
97
- // Extract concepts
98
- let concepts = [];
99
- if (opts.enableConceptExtraction && definition) {
100
- try {
101
- concepts = await this.brain.extractConcepts(definition, { limit: 10 });
102
- }
103
- catch (error) {
104
- concepts = [];
105
- }
106
- }
107
- // Create main entity
108
- const mainEntity = {
109
- id: entityId,
110
- name: term,
111
- type: mainEntityType,
112
- description: definition,
113
- confidence: 0.95, // Main entity from row has high confidence
114
- metadata: {
115
- source: 'csv',
116
- row: i + 1,
117
- originalData: row,
118
- concepts,
119
- extractedAt: Date.now()
120
- }
121
- };
122
- // Track statistics
123
- this.updateStats(stats, mainEntityType, mainEntity.confidence);
124
- // Infer relationships
125
- const relationships = [];
126
- if (opts.enableRelationshipInference) {
127
- // Extract relationships from definition text
128
- for (const relEntity of relatedEntities) {
129
- const verbType = await this.inferRelationship(term, relEntity.text, definition);
130
- relationships.push({
131
- from: entityId,
132
- to: relEntity.text,
133
- type: verbType,
134
- confidence: relEntity.confidence,
135
- evidence: `Extracted from: "${definition.substring(0, 100)}..."`
136
- });
137
- }
138
- // Parse explicit "Related" column
139
- if (relatedTerms) {
140
- const terms = relatedTerms.split(/[,;|]/).map(t => t.trim()).filter(Boolean);
141
- for (const relTerm of terms) {
142
- // Ensure we don't create self-relationships
143
- if (relTerm.toLowerCase() !== term.toLowerCase()) {
144
- relationships.push({
145
- from: entityId,
146
- to: relTerm,
147
- type: VerbType.RelatedTo,
148
- confidence: 0.9,
149
- evidence: `Explicitly listed in "Related" column`
150
- });
72
+ // Batch processing configuration
73
+ const CHUNK_SIZE = 10; // Process 10 rows at a time for optimal performance
74
+ let totalProcessed = 0;
75
+ const performanceStartTime = Date.now();
76
+ // Process rows in chunks
77
+ for (let chunkStart = 0; chunkStart < rows.length; chunkStart += CHUNK_SIZE) {
78
+ const chunk = rows.slice(chunkStart, Math.min(chunkStart + CHUNK_SIZE, rows.length));
79
+ // Process chunk in parallel for massive speedup
80
+ const chunkResults = await Promise.all(chunk.map(async (row, chunkIndex) => {
81
+ const i = chunkStart + chunkIndex;
82
+ // Extract data from row
83
+ const term = this.getColumnValue(row, columns.term) || `Entity_${i}`;
84
+ const definition = this.getColumnValue(row, columns.definition) || '';
85
+ const type = this.getColumnValue(row, columns.type);
86
+ const relatedTerms = this.getColumnValue(row, columns.related);
87
+ // Parallel extraction: entities AND concepts at the same time
88
+ const [relatedEntities, concepts] = await Promise.all([
89
+ // Extract entities from definition
90
+ opts.enableNeuralExtraction && definition
91
+ ? this.extractor.extract(definition, {
92
+ confidence: opts.confidenceThreshold * 0.8,
93
+ neuralMatching: true,
94
+ cache: { enabled: true }
95
+ }).then(entities =>
96
+ // Filter out the main term from related entities
97
+ entities.filter(e => e.text.toLowerCase() !== term.toLowerCase()))
98
+ : Promise.resolve([]),
99
+ // Extract concepts (in parallel with entity extraction)
100
+ opts.enableConceptExtraction && definition
101
+ ? this.brain.extractConcepts(definition, { limit: 10 }).catch(() => [])
102
+ : Promise.resolve([])
103
+ ]);
104
+ // Determine main entity type
105
+ const mainEntityType = type ?
106
+ this.mapTypeString(type) :
107
+ (relatedEntities.length > 0 ? relatedEntities[0].type : NounType.Thing);
108
+ // Generate entity ID
109
+ const entityId = this.generateEntityId(term);
110
+ // Create main entity
111
+ const mainEntity = {
112
+ id: entityId,
113
+ name: term,
114
+ type: mainEntityType,
115
+ description: definition,
116
+ confidence: 0.95,
117
+ metadata: {
118
+ source: 'csv',
119
+ row: i + 1,
120
+ originalData: row,
121
+ concepts,
122
+ extractedAt: Date.now()
123
+ }
124
+ };
125
+ // Infer relationships
126
+ const relationships = [];
127
+ if (opts.enableRelationshipInference) {
128
+ // Extract relationships from definition text
129
+ for (const relEntity of relatedEntities) {
130
+ const verbType = await this.inferRelationship(term, relEntity.text, definition);
131
+ relationships.push({
132
+ from: entityId,
133
+ to: relEntity.text,
134
+ type: verbType,
135
+ confidence: relEntity.confidence,
136
+ evidence: `Extracted from: "${definition.substring(0, 100)}..."`
137
+ });
138
+ }
139
+ // Parse explicit "Related" column
140
+ if (relatedTerms) {
141
+ const terms = relatedTerms.split(/[,;|]/).map(t => t.trim()).filter(Boolean);
142
+ for (const relTerm of terms) {
143
+ if (relTerm.toLowerCase() !== term.toLowerCase()) {
144
+ relationships.push({
145
+ from: entityId,
146
+ to: relTerm,
147
+ type: VerbType.RelatedTo,
148
+ confidence: 0.9,
149
+ evidence: `Explicitly listed in "Related" column`
150
+ });
151
+ }
151
152
  }
152
153
  }
153
154
  }
155
+ return {
156
+ term,
157
+ entityId,
158
+ mainEntity,
159
+ mainEntityType,
160
+ relatedEntities,
161
+ relationships,
162
+ concepts
163
+ };
164
+ }));
165
+ // Process chunk results sequentially to maintain order
166
+ for (const result of chunkResults) {
167
+ // Store entity ID mapping
168
+ entityMap.set(result.term.toLowerCase(), result.entityId);
169
+ // Track statistics
170
+ this.updateStats(stats, result.mainEntityType, result.mainEntity.confidence);
171
+ // Add extracted row
172
+ extractedRows.push({
173
+ entity: result.mainEntity,
174
+ relatedEntities: result.relatedEntities.map(e => ({
175
+ name: e.text,
176
+ type: e.type,
177
+ confidence: e.confidence
178
+ })),
179
+ relationships: result.relationships,
180
+ concepts: result.concepts
181
+ });
154
182
  }
155
- // Add extracted row
156
- extractedRows.push({
157
- entity: mainEntity,
158
- relatedEntities: relatedEntities.map(e => ({
159
- name: e.text,
160
- type: e.type,
161
- confidence: e.confidence
162
- })),
163
- relationships,
164
- concepts
165
- });
166
- // Report progress
183
+ // Update progress tracking
184
+ totalProcessed += chunk.length;
185
+ // Calculate performance metrics
186
+ const elapsed = Date.now() - performanceStartTime;
187
+ const rowsPerSecond = totalProcessed / (elapsed / 1000);
188
+ const remainingRows = rows.length - totalProcessed;
189
+ const estimatedTimeRemaining = remainingRows / rowsPerSecond;
190
+ // Report progress with enhanced metrics
167
191
  opts.onProgress({
168
- processed: i + 1,
192
+ processed: totalProcessed,
169
193
  total: rows.length,
170
- entities: extractedRows.length + relatedEntities.length,
171
- relationships: relationships.length
194
+ entities: extractedRows.reduce((sum, row) => sum + 1 + row.relatedEntities.length, 0),
195
+ relationships: extractedRows.reduce((sum, row) => sum + row.relationships.length, 0),
196
+ // Additional performance metrics (v3.39.0)
197
+ throughput: Math.round(rowsPerSecond * 10) / 10,
198
+ eta: Math.round(estimatedTimeRemaining),
199
+ phase: 'extracting'
172
200
  });
173
201
  }
174
202
  return {
@@ -25,12 +25,18 @@ export interface SmartExcelOptions extends FormatHandlerOptions {
25
25
  definitionColumn?: string;
26
26
  typeColumn?: string;
27
27
  relatedColumn?: string;
28
- /** Progress callback */
28
+ /** Progress callback (v3.38.0: Enhanced with performance metrics) */
29
29
  onProgress?: (stats: {
30
30
  processed: number;
31
31
  total: number;
32
32
  entities: number;
33
33
  relationships: number;
34
+ /** Rows per second (v3.38.0) */
35
+ throughput?: number;
36
+ /** Estimated time remaining in ms (v3.38.0) */
37
+ eta?: number;
38
+ /** Current phase (v3.38.0) */
39
+ phase?: string;
34
40
  }) => void;
35
41
  }
36
42
  export interface ExtractedRow {
@@ -66,114 +66,141 @@ export class SmartExcelImporter {
66
66
  }
67
67
  // Detect column names
68
68
  const columns = this.detectColumns(rows[0], opts);
69
- // Process each row
69
+ // Process each row with BATCHED PARALLEL PROCESSING (v3.38.0)
70
70
  const extractedRows = [];
71
71
  const entityMap = new Map();
72
72
  const stats = {
73
73
  byType: {},
74
74
  byConfidence: { high: 0, medium: 0, low: 0 }
75
75
  };
76
- for (let i = 0; i < rows.length; i++) {
77
- const row = rows[i];
78
- // Extract data from row
79
- const term = this.getColumnValue(row, columns.term) || `Entity_${i}`;
80
- const definition = this.getColumnValue(row, columns.definition) || '';
81
- const type = this.getColumnValue(row, columns.type);
82
- const relatedTerms = this.getColumnValue(row, columns.related);
83
- // Extract entities from definition
84
- let relatedEntities = [];
85
- if (opts.enableNeuralExtraction && definition) {
86
- relatedEntities = await this.extractor.extract(definition, {
87
- confidence: opts.confidenceThreshold * 0.8, // Lower threshold for related entities
88
- neuralMatching: true,
89
- cache: { enabled: true }
90
- });
91
- // Filter out the main term from related entities
92
- relatedEntities = relatedEntities.filter(e => e.text.toLowerCase() !== term.toLowerCase());
93
- }
94
- // Determine main entity type
95
- const mainEntityType = type ?
96
- this.mapTypeString(type) :
97
- (relatedEntities.length > 0 ? relatedEntities[0].type : NounType.Thing);
98
- // Generate entity ID
99
- const entityId = this.generateEntityId(term);
100
- entityMap.set(term.toLowerCase(), entityId);
101
- // Extract concepts
102
- let concepts = [];
103
- if (opts.enableConceptExtraction && definition) {
104
- try {
105
- concepts = await this.brain.extractConcepts(definition, { limit: 10 });
106
- }
107
- catch (error) {
108
- // Concept extraction is optional
109
- concepts = [];
110
- }
111
- }
112
- // Create main entity
113
- const mainEntity = {
114
- id: entityId,
115
- name: term,
116
- type: mainEntityType,
117
- description: definition,
118
- confidence: 0.95, // Main entity from row has high confidence
119
- metadata: {
120
- source: 'excel',
121
- row: i + 1,
122
- originalData: row,
123
- concepts,
124
- extractedAt: Date.now()
125
- }
126
- };
127
- // Track statistics
128
- this.updateStats(stats, mainEntityType, mainEntity.confidence);
129
- // Infer relationships
130
- const relationships = [];
131
- if (opts.enableRelationshipInference) {
132
- // Extract relationships from definition text
133
- for (const relEntity of relatedEntities) {
134
- const verbType = await this.inferRelationship(term, relEntity.text, definition);
135
- relationships.push({
136
- from: entityId,
137
- to: relEntity.text, // Use entity name directly, will be resolved later
138
- type: verbType,
139
- confidence: relEntity.confidence,
140
- evidence: `Extracted from: "${definition.substring(0, 100)}..."`
141
- });
142
- }
143
- // Parse explicit "Related Terms" column
144
- if (relatedTerms) {
145
- const terms = relatedTerms.split(/[,;]/).map(t => t.trim()).filter(Boolean);
146
- for (const relTerm of terms) {
147
- // Ensure we don't create self-relationships
148
- if (relTerm.toLowerCase() !== term.toLowerCase()) {
149
- relationships.push({
150
- from: entityId,
151
- to: relTerm, // Use term name directly
152
- type: VerbType.RelatedTo,
153
- confidence: 0.9, // Explicit relationships have high confidence
154
- evidence: `Explicitly listed in "Related" column`
155
- });
76
+ // Batch processing configuration
77
+ const CHUNK_SIZE = 10; // Process 10 rows at a time for optimal performance
78
+ let totalProcessed = 0;
79
+ const performanceStartTime = Date.now();
80
+ // Process rows in chunks
81
+ for (let chunkStart = 0; chunkStart < rows.length; chunkStart += CHUNK_SIZE) {
82
+ const chunk = rows.slice(chunkStart, Math.min(chunkStart + CHUNK_SIZE, rows.length));
83
+ // Process chunk in parallel for massive speedup
84
+ const chunkResults = await Promise.all(chunk.map(async (row, chunkIndex) => {
85
+ const i = chunkStart + chunkIndex;
86
+ // Extract data from row
87
+ const term = this.getColumnValue(row, columns.term) || `Entity_${i}`;
88
+ const definition = this.getColumnValue(row, columns.definition) || '';
89
+ const type = this.getColumnValue(row, columns.type);
90
+ const relatedTerms = this.getColumnValue(row, columns.related);
91
+ // Parallel extraction: entities AND concepts at the same time
92
+ const [relatedEntities, concepts] = await Promise.all([
93
+ // Extract entities from definition
94
+ opts.enableNeuralExtraction && definition
95
+ ? this.extractor.extract(definition, {
96
+ confidence: opts.confidenceThreshold * 0.8,
97
+ neuralMatching: true,
98
+ cache: { enabled: true }
99
+ }).then(entities =>
100
+ // Filter out the main term from related entities
101
+ entities.filter(e => e.text.toLowerCase() !== term.toLowerCase()))
102
+ : Promise.resolve([]),
103
+ // Extract concepts (in parallel with entity extraction)
104
+ opts.enableConceptExtraction && definition
105
+ ? this.brain.extractConcepts(definition, { limit: 10 }).catch(() => [])
106
+ : Promise.resolve([])
107
+ ]);
108
+ // Determine main entity type
109
+ const mainEntityType = type ?
110
+ this.mapTypeString(type) :
111
+ (relatedEntities.length > 0 ? relatedEntities[0].type : NounType.Thing);
112
+ // Generate entity ID
113
+ const entityId = this.generateEntityId(term);
114
+ // Create main entity
115
+ const mainEntity = {
116
+ id: entityId,
117
+ name: term,
118
+ type: mainEntityType,
119
+ description: definition,
120
+ confidence: 0.95,
121
+ metadata: {
122
+ source: 'excel',
123
+ row: i + 1,
124
+ originalData: row,
125
+ concepts,
126
+ extractedAt: Date.now()
127
+ }
128
+ };
129
+ // Infer relationships
130
+ const relationships = [];
131
+ if (opts.enableRelationshipInference) {
132
+ // Extract relationships from definition text
133
+ for (const relEntity of relatedEntities) {
134
+ const verbType = await this.inferRelationship(term, relEntity.text, definition);
135
+ relationships.push({
136
+ from: entityId,
137
+ to: relEntity.text,
138
+ type: verbType,
139
+ confidence: relEntity.confidence,
140
+ evidence: `Extracted from: "${definition.substring(0, 100)}..."`
141
+ });
142
+ }
143
+ // Parse explicit "Related Terms" column
144
+ if (relatedTerms) {
145
+ const terms = relatedTerms.split(/[,;]/).map(t => t.trim()).filter(Boolean);
146
+ for (const relTerm of terms) {
147
+ if (relTerm.toLowerCase() !== term.toLowerCase()) {
148
+ relationships.push({
149
+ from: entityId,
150
+ to: relTerm,
151
+ type: VerbType.RelatedTo,
152
+ confidence: 0.9,
153
+ evidence: `Explicitly listed in "Related" column`
154
+ });
155
+ }
156
156
  }
157
157
  }
158
158
  }
159
+ return {
160
+ term,
161
+ entityId,
162
+ mainEntity,
163
+ mainEntityType,
164
+ relatedEntities,
165
+ relationships,
166
+ concepts
167
+ };
168
+ }));
169
+ // Process chunk results sequentially to maintain order
170
+ for (const result of chunkResults) {
171
+ // Store entity ID mapping
172
+ entityMap.set(result.term.toLowerCase(), result.entityId);
173
+ // Track statistics
174
+ this.updateStats(stats, result.mainEntityType, result.mainEntity.confidence);
175
+ // Add extracted row
176
+ extractedRows.push({
177
+ entity: result.mainEntity,
178
+ relatedEntities: result.relatedEntities.map(e => ({
179
+ name: e.text,
180
+ type: e.type,
181
+ confidence: e.confidence
182
+ })),
183
+ relationships: result.relationships,
184
+ concepts: result.concepts
185
+ });
159
186
  }
160
- // Add extracted row
161
- extractedRows.push({
162
- entity: mainEntity,
163
- relatedEntities: relatedEntities.map(e => ({
164
- name: e.text,
165
- type: e.type,
166
- confidence: e.confidence
167
- })),
168
- relationships,
169
- concepts
170
- });
171
- // Report progress
187
+ // Update progress tracking
188
+ totalProcessed += chunk.length;
189
+ // Calculate performance metrics
190
+ const elapsed = Date.now() - performanceStartTime;
191
+ const rowsPerSecond = totalProcessed / (elapsed / 1000);
192
+ const remainingRows = rows.length - totalProcessed;
193
+ const estimatedTimeRemaining = remainingRows / rowsPerSecond;
194
+ // Report progress with enhanced metrics
172
195
  opts.onProgress({
173
- processed: i + 1,
196
+ processed: totalProcessed,
174
197
  total: rows.length,
175
- entities: extractedRows.length + relatedEntities.length,
176
- relationships: relationships.length
198
+ entities: extractedRows.reduce((sum, row) => sum + 1 + row.relatedEntities.length, 0),
199
+ relationships: extractedRows.reduce((sum, row) => sum + row.relationships.length, 0),
200
+ // Additional performance metrics (v3.38.0)
201
+ throughput: Math.round(rowsPerSecond * 10) / 10,
202
+ eta: Math.round(estimatedTimeRemaining),
203
+ phase: 'extracting'
177
204
  });
178
205
  }
179
206
  return {
@@ -27,12 +27,18 @@ export interface SmartPDFOptions extends FormatHandlerOptions {
27
27
  extractFromTables?: boolean;
28
28
  /** Group by page or full document */
29
29
  groupBy?: 'page' | 'document';
30
- /** Progress callback */
30
+ /** Progress callback (v3.39.0: Enhanced with performance metrics) */
31
31
  onProgress?: (stats: {
32
32
  processed: number;
33
33
  total: number;
34
34
  entities: number;
35
35
  relationships: number;
36
+ /** Sections per second (v3.39.0) */
37
+ throughput?: number;
38
+ /** Estimated time remaining in ms (v3.39.0) */
39
+ eta?: number;
40
+ /** Current phase (v3.39.0) */
41
+ phase?: string;
36
42
  }) => void;
37
43
  }
38
44
  export interface ExtractedSection {
@@ -55,7 +55,7 @@ export class SmartPDFImporter {
55
55
  }
56
56
  // Group data by page or combine into single document
57
57
  const grouped = this.groupData(data, opts);
58
- // Process each group
58
+ // Process each group with BATCHED PARALLEL PROCESSING (v3.39.0)
59
59
  const sections = [];
60
60
  const entityMap = new Map();
61
61
  const stats = {
@@ -63,17 +63,35 @@ export class SmartPDFImporter {
63
63
  byConfidence: { high: 0, medium: 0, low: 0 },
64
64
  bySource: { paragraphs: 0, tables: 0 }
65
65
  };
66
- let processedCount = 0;
66
+ // Batch processing configuration
67
+ const CHUNK_SIZE = 5; // Process 5 sections at a time (smaller than rows due to section size)
68
+ let totalProcessed = 0;
69
+ const performanceStartTime = Date.now();
67
70
  const totalGroups = grouped.length;
68
- for (const group of grouped) {
69
- const sectionResult = await this.processSection(group, opts, stats, entityMap);
70
- sections.push(sectionResult);
71
- processedCount++;
71
+ // Process sections in chunks
72
+ for (let chunkStart = 0; chunkStart < grouped.length; chunkStart += CHUNK_SIZE) {
73
+ const chunk = grouped.slice(chunkStart, Math.min(chunkStart + CHUNK_SIZE, grouped.length));
74
+ // Process chunk in parallel for better performance
75
+ const chunkResults = await Promise.all(chunk.map(group => this.processSection(group, opts, stats, entityMap)));
76
+ // Add results sequentially to maintain order
77
+ sections.push(...chunkResults);
78
+ // Update progress tracking
79
+ totalProcessed += chunk.length;
80
+ // Calculate performance metrics
81
+ const elapsed = Date.now() - performanceStartTime;
82
+ const sectionsPerSecond = totalProcessed / (elapsed / 1000);
83
+ const remainingSections = grouped.length - totalProcessed;
84
+ const estimatedTimeRemaining = remainingSections / sectionsPerSecond;
85
+ // Report progress with enhanced metrics
72
86
  opts.onProgress({
73
- processed: processedCount,
87
+ processed: totalProcessed,
74
88
  total: totalGroups,
75
89
  entities: sections.reduce((sum, s) => sum + s.entities.length, 0),
76
- relationships: sections.reduce((sum, s) => sum + s.relationships.length, 0)
90
+ relationships: sections.reduce((sum, s) => sum + s.relationships.length, 0),
91
+ // Additional performance metrics (v3.39.0)
92
+ throughput: Math.round(sectionsPerSecond * 10) / 10,
93
+ eta: Math.round(estimatedTimeRemaining),
94
+ phase: 'extracting'
77
95
  });
78
96
  }
79
97
  const pagesProcessed = new Set(data.map(d => d._page)).size;
@@ -150,25 +168,21 @@ export class SmartPDFImporter {
150
168
  }
151
169
  }
152
170
  const combinedText = texts.join('\n\n');
153
- // Extract entities if enabled
154
- let extractedEntities = [];
155
- if (options.enableNeuralExtraction && combinedText.length > 0) {
156
- extractedEntities = await this.extractor.extract(combinedText, {
157
- confidence: options.confidenceThreshold || 0.6,
158
- neuralMatching: true,
159
- cache: { enabled: true }
160
- });
161
- }
162
- // Extract concepts if enabled
163
- let concepts = [];
164
- if (options.enableConceptExtraction && combinedText.length > 0) {
165
- try {
166
- concepts = await this.brain.extractConcepts(combinedText, { limit: 15 });
167
- }
168
- catch (error) {
169
- concepts = [];
170
- }
171
- }
171
+ // Parallel extraction: entities AND concepts at the same time (v3.39.0)
172
+ const [extractedEntities, concepts] = await Promise.all([
173
+ // Extract entities if enabled
174
+ options.enableNeuralExtraction && combinedText.length > 0
175
+ ? this.extractor.extract(combinedText, {
176
+ confidence: options.confidenceThreshold || 0.6,
177
+ neuralMatching: true,
178
+ cache: { enabled: true }
179
+ })
180
+ : Promise.resolve([]),
181
+ // Extract concepts (in parallel with entity extraction)
182
+ options.enableConceptExtraction && combinedText.length > 0
183
+ ? this.brain.extractConcepts(combinedText, { limit: 15 }).catch(() => [])
184
+ : Promise.resolve([])
185
+ ]);
172
186
  // Create entity objects
173
187
  const entities = extractedEntities.map(e => {
174
188
  const entityId = this.generateEntityId(e.text, group.id);
@@ -24,6 +24,8 @@ export declare class NeuralEntityExtractor {
24
24
  private typeEmbeddings;
25
25
  private initialized;
26
26
  private cache;
27
+ private embeddingCache;
28
+ private embeddingCacheStats;
27
29
  constructor(brain: Brainy | Brainy<any>, cacheOptions?: EntityCacheOptions);
28
30
  /**
29
31
  * Initialize type embeddings for neural matching
@@ -61,7 +63,10 @@ export declare class NeuralEntityExtractor {
61
63
  */
62
64
  private classifyByRules;
63
65
  /**
64
- * Get embedding for text
66
+ * Get embedding for text with caching (v3.38.0)
67
+ *
68
+ * PERFORMANCE OPTIMIZATION: Caches embeddings during extraction session
69
+ * to avoid redundant model calls for repeated text (common in large imports)
65
70
  */
66
71
  private getEmbedding;
67
72
  /**
@@ -96,4 +101,27 @@ export declare class NeuralEntityExtractor {
96
101
  * Cleanup expired cache entries
97
102
  */
98
103
  cleanupCache(): number;
104
+ /**
105
+ * Clear embedding cache (v3.38.0)
106
+ *
107
+ * Clears the runtime embedding cache. Useful for:
108
+ * - Freeing memory after large imports
109
+ * - Testing with fresh cache state
110
+ */
111
+ clearEmbeddingCache(): void;
112
+ /**
113
+ * Get embedding cache statistics (v3.38.0)
114
+ *
115
+ * Returns performance metrics for the embedding cache:
116
+ * - hits: Number of cache hits (avoided model calls)
117
+ * - misses: Number of cache misses (required model calls)
118
+ * - size: Current cache size
119
+ * - hitRate: Fraction of requests served from cache (0 to 1; 0 when no requests were made)
120
+ */
121
+ getEmbeddingCacheStats(): {
122
+ hitRate: number;
123
+ hits: number;
124
+ misses: number;
125
+ size: number;
126
+ };
99
127
  }
@@ -12,6 +12,14 @@ export class NeuralEntityExtractor {
12
12
  // Type embeddings for similarity matching
13
13
  this.typeEmbeddings = new Map();
14
14
  this.initialized = false;
15
+ // Runtime embedding cache for performance (v3.38.0)
16
+ // Caches candidate embeddings during an extraction session to avoid redundant model calls
17
+ this.embeddingCache = new Map();
18
+ this.embeddingCacheStats = {
19
+ hits: 0,
20
+ misses: 0,
21
+ size: 0
22
+ };
15
23
  this.brain = brain;
16
24
  this.cache = new EntityExtractionCache(cacheOptions);
17
25
  }
@@ -253,20 +261,46 @@ export class NeuralEntityExtractor {
253
261
  return { type: NounType.Thing, confidence: 0.3 };
254
262
  }
255
263
  /**
256
- * Get embedding for text
264
+ * Get embedding for text with caching (v3.38.0)
265
+ *
266
+ * PERFORMANCE OPTIMIZATION: Caches embeddings during extraction session
267
+ * to avoid redundant model calls for repeated text (common in large imports)
257
268
  */
258
269
  async getEmbedding(text) {
270
+ // Normalize text for cache key
271
+ const normalizedText = text.trim().toLowerCase();
272
+ // Check cache first
273
+ const cached = this.embeddingCache.get(normalizedText);
274
+ if (cached) {
275
+ this.embeddingCacheStats.hits++;
276
+ return cached;
277
+ }
278
+ // Cache miss - generate embedding
279
+ this.embeddingCacheStats.misses++;
280
+ let vector;
259
281
  if ('embed' in this.brain && typeof this.brain.embed === 'function') {
260
- return await this.brain.embed(text);
282
+ vector = await this.brain.embed(text);
261
283
  }
262
284
  else {
263
285
  // Fallback - create simple hash-based vector
264
- const vector = new Array(384).fill(0);
286
+ vector = new Array(384).fill(0);
265
287
  for (let i = 0; i < text.length; i++) {
266
288
  vector[i % 384] += text.charCodeAt(i) / 255;
267
289
  }
268
- return vector.map(v => v / text.length);
290
+ vector = vector.map(v => v / text.length);
291
+ }
292
+ // Store in cache
293
+ this.embeddingCache.set(normalizedText, vector);
294
+ this.embeddingCacheStats.size = this.embeddingCache.size;
295
+ // Memory management: Clear cache if it grows too large (>10000 entries)
296
+ if (this.embeddingCache.size > 10000) {
297
+ // Keep the 5000 most recently inserted entries (insertion-order eviction; cache hits do not refresh an entry's position)
298
+ const entries = Array.from(this.embeddingCache.entries());
299
+ this.embeddingCache.clear();
300
+ entries.slice(-5000).forEach(([k, v]) => this.embeddingCache.set(k, v));
301
+ this.embeddingCacheStats.size = this.embeddingCache.size;
269
302
  }
303
+ return vector;
270
304
  }
271
305
  /**
272
306
  * Calculate cosine similarity between vectors
@@ -355,5 +389,36 @@ export class NeuralEntityExtractor {
355
389
  cleanupCache() {
356
390
  return this.cache.cleanup();
357
391
  }
392
+ /**
393
+ * Clear embedding cache (v3.38.0)
394
+ *
395
+ * Clears the runtime embedding cache. Useful for:
396
+ * - Freeing memory after large imports
397
+ * - Testing with fresh cache state
398
+ */
399
+ clearEmbeddingCache() {
400
+ this.embeddingCache.clear();
401
+ this.embeddingCacheStats = {
402
+ hits: 0,
403
+ misses: 0,
404
+ size: 0
405
+ };
406
+ }
407
+ /**
408
+ * Get embedding cache statistics (v3.38.0)
409
+ *
410
+ * Returns performance metrics for the embedding cache:
411
+ * - hits: Number of cache hits (avoided model calls)
412
+ * - misses: Number of cache misses (required model calls)
413
+ * - size: Current cache size
414
+ * - hitRate: Fraction of requests served from cache (0 to 1; 0 when no requests were made)
415
+ */
416
+ getEmbeddingCacheStats() {
417
+ const total = this.embeddingCacheStats.hits + this.embeddingCacheStats.misses;
418
+ return {
419
+ ...this.embeddingCacheStats,
420
+ hitRate: total > 0 ? this.embeddingCacheStats.hits / total : 0
421
+ };
422
+ }
358
423
  }
359
424
  //# sourceMappingURL=entityExtractor.js.map
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@soulcraft/brainy",
3
- "version": "3.38.0",
3
+ "version": "3.40.0",
4
4
  "description": "Universal Knowledge Protocol™ - World's first Triple Intelligence database unifying vector, graph, and document search in one API. 31 nouns × 40 verbs for infinite expressiveness.",
5
5
  "main": "dist/index.js",
6
6
  "module": "dist/index.js",