@soulcraft/brainy 3.38.0 → 3.40.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/dist/import/ImportCoordinator.d.ts +4 -0
- package/dist/import/ImportCoordinator.js +9 -2
- package/dist/importers/SmartCSVImporter.d.ts +7 -1
- package/dist/importers/SmartCSVImporter.js +123 -95
- package/dist/importers/SmartExcelImporter.d.ts +7 -1
- package/dist/importers/SmartExcelImporter.js +123 -96
- package/dist/importers/SmartPDFImporter.d.ts +7 -1
- package/dist/importers/SmartPDFImporter.js +41 -27
- package/dist/neural/entityExtractor.d.ts +29 -1
- package/dist/neural/entityExtractor.js +69 -4
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,13 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines.
|
|
4
4
|
|
|
5
|
+
## [3.40.0](https://github.com/soulcraftlabs/brainy/compare/v3.39.0...v3.40.0) (2025-10-13)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
### ✨ Features
|
|
9
|
+
|
|
10
|
+
* extend batch processing and enhanced progress to CSV and PDF imports ([bb46da2](https://github.com/soulcraftlabs/brainy/commit/bb46da2ee7fc3cd0b5becc7e42afff7d7034ecfe))
|
|
11
|
+
|
|
5
12
|
### [3.37.3](https://github.com/soulcraftlabs/brainy/compare/v3.37.2...v3.37.3) (2025-10-10)
|
|
6
13
|
|
|
7
14
|
- fix: populate totalNodes/totalEdges in ALL storage adapters for HNSW rebuild (a21a845)
|
|
@@ -62,6 +62,10 @@ export interface ImportProgress {
|
|
|
62
62
|
total?: number;
|
|
63
63
|
entities?: number;
|
|
64
64
|
relationships?: number;
|
|
65
|
+
/** Rows per second (v3.38.0) */
|
|
66
|
+
throughput?: number;
|
|
67
|
+
/** Estimated time remaining in ms (v3.38.0) */
|
|
68
|
+
eta?: number;
|
|
65
69
|
}
|
|
66
70
|
export interface ImportResult {
|
|
67
71
|
/** Import ID for history tracking */
|
|
@@ -237,13 +237,20 @@ export class ImportCoordinator {
|
|
|
237
237
|
enableConceptExtraction: options.enableConceptExtraction !== false,
|
|
238
238
|
confidenceThreshold: options.confidenceThreshold || 0.6,
|
|
239
239
|
onProgress: (stats) => {
|
|
240
|
+
// Enhanced progress reporting (v3.38.0) with throughput and ETA
|
|
241
|
+
const message = stats.throughput
|
|
242
|
+
? `Extracting entities from ${format} (${stats.throughput} rows/sec, ETA: ${Math.round(stats.eta / 1000)}s)...`
|
|
243
|
+
: `Extracting entities from ${format}...`;
|
|
240
244
|
options.onProgress?.({
|
|
241
245
|
stage: 'extracting',
|
|
242
|
-
message
|
|
246
|
+
message,
|
|
243
247
|
processed: stats.processed,
|
|
244
248
|
total: stats.total,
|
|
245
249
|
entities: stats.entities,
|
|
246
|
-
relationships: stats.relationships
|
|
250
|
+
relationships: stats.relationships,
|
|
251
|
+
// Pass through enhanced metrics if available
|
|
252
|
+
throughput: stats.throughput,
|
|
253
|
+
eta: stats.eta
|
|
247
254
|
});
|
|
248
255
|
}
|
|
249
256
|
};
|
|
@@ -30,12 +30,18 @@ export interface SmartCSVOptions extends FormatHandlerOptions {
|
|
|
30
30
|
/** CSV-specific options */
|
|
31
31
|
csvDelimiter?: string;
|
|
32
32
|
csvHeaders?: boolean;
|
|
33
|
-
/** Progress callback */
|
|
33
|
+
/** Progress callback (v3.39.0: Enhanced with performance metrics) */
|
|
34
34
|
onProgress?: (stats: {
|
|
35
35
|
processed: number;
|
|
36
36
|
total: number;
|
|
37
37
|
entities: number;
|
|
38
38
|
relationships: number;
|
|
39
|
+
/** Rows per second (v3.39.0) */
|
|
40
|
+
throughput?: number;
|
|
41
|
+
/** Estimated time remaining in ms (v3.39.0) */
|
|
42
|
+
eta?: number;
|
|
43
|
+
/** Current phase (v3.39.0) */
|
|
44
|
+
phase?: string;
|
|
39
45
|
}) => void;
|
|
40
46
|
}
|
|
41
47
|
export interface ExtractedRow {
|
|
@@ -62,113 +62,141 @@ export class SmartCSVImporter {
|
|
|
62
62
|
}
|
|
63
63
|
// Detect column names
|
|
64
64
|
const columns = this.detectColumns(rows[0], opts);
|
|
65
|
-
// Process each row
|
|
65
|
+
// Process each row with BATCHED PARALLEL PROCESSING (v3.39.0)
|
|
66
66
|
const extractedRows = [];
|
|
67
67
|
const entityMap = new Map();
|
|
68
68
|
const stats = {
|
|
69
69
|
byType: {},
|
|
70
70
|
byConfidence: { high: 0, medium: 0, low: 0 }
|
|
71
71
|
};
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
const
|
|
79
|
-
//
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
//
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
72
|
+
// Batch processing configuration
|
|
73
|
+
const CHUNK_SIZE = 10; // Process 10 rows at a time for optimal performance
|
|
74
|
+
let totalProcessed = 0;
|
|
75
|
+
const performanceStartTime = Date.now();
|
|
76
|
+
// Process rows in chunks
|
|
77
|
+
for (let chunkStart = 0; chunkStart < rows.length; chunkStart += CHUNK_SIZE) {
|
|
78
|
+
const chunk = rows.slice(chunkStart, Math.min(chunkStart + CHUNK_SIZE, rows.length));
|
|
79
|
+
// Process chunk in parallel for massive speedup
|
|
80
|
+
const chunkResults = await Promise.all(chunk.map(async (row, chunkIndex) => {
|
|
81
|
+
const i = chunkStart + chunkIndex;
|
|
82
|
+
// Extract data from row
|
|
83
|
+
const term = this.getColumnValue(row, columns.term) || `Entity_${i}`;
|
|
84
|
+
const definition = this.getColumnValue(row, columns.definition) || '';
|
|
85
|
+
const type = this.getColumnValue(row, columns.type);
|
|
86
|
+
const relatedTerms = this.getColumnValue(row, columns.related);
|
|
87
|
+
// Parallel extraction: entities AND concepts at the same time
|
|
88
|
+
const [relatedEntities, concepts] = await Promise.all([
|
|
89
|
+
// Extract entities from definition
|
|
90
|
+
opts.enableNeuralExtraction && definition
|
|
91
|
+
? this.extractor.extract(definition, {
|
|
92
|
+
confidence: opts.confidenceThreshold * 0.8,
|
|
93
|
+
neuralMatching: true,
|
|
94
|
+
cache: { enabled: true }
|
|
95
|
+
}).then(entities =>
|
|
96
|
+
// Filter out the main term from related entities
|
|
97
|
+
entities.filter(e => e.text.toLowerCase() !== term.toLowerCase()))
|
|
98
|
+
: Promise.resolve([]),
|
|
99
|
+
// Extract concepts (in parallel with entity extraction)
|
|
100
|
+
opts.enableConceptExtraction && definition
|
|
101
|
+
? this.brain.extractConcepts(definition, { limit: 10 }).catch(() => [])
|
|
102
|
+
: Promise.resolve([])
|
|
103
|
+
]);
|
|
104
|
+
// Determine main entity type
|
|
105
|
+
const mainEntityType = type ?
|
|
106
|
+
this.mapTypeString(type) :
|
|
107
|
+
(relatedEntities.length > 0 ? relatedEntities[0].type : NounType.Thing);
|
|
108
|
+
// Generate entity ID
|
|
109
|
+
const entityId = this.generateEntityId(term);
|
|
110
|
+
// Create main entity
|
|
111
|
+
const mainEntity = {
|
|
112
|
+
id: entityId,
|
|
113
|
+
name: term,
|
|
114
|
+
type: mainEntityType,
|
|
115
|
+
description: definition,
|
|
116
|
+
confidence: 0.95,
|
|
117
|
+
metadata: {
|
|
118
|
+
source: 'csv',
|
|
119
|
+
row: i + 1,
|
|
120
|
+
originalData: row,
|
|
121
|
+
concepts,
|
|
122
|
+
extractedAt: Date.now()
|
|
123
|
+
}
|
|
124
|
+
};
|
|
125
|
+
// Infer relationships
|
|
126
|
+
const relationships = [];
|
|
127
|
+
if (opts.enableRelationshipInference) {
|
|
128
|
+
// Extract relationships from definition text
|
|
129
|
+
for (const relEntity of relatedEntities) {
|
|
130
|
+
const verbType = await this.inferRelationship(term, relEntity.text, definition);
|
|
131
|
+
relationships.push({
|
|
132
|
+
from: entityId,
|
|
133
|
+
to: relEntity.text,
|
|
134
|
+
type: verbType,
|
|
135
|
+
confidence: relEntity.confidence,
|
|
136
|
+
evidence: `Extracted from: "${definition.substring(0, 100)}..."`
|
|
137
|
+
});
|
|
138
|
+
}
|
|
139
|
+
// Parse explicit "Related" column
|
|
140
|
+
if (relatedTerms) {
|
|
141
|
+
const terms = relatedTerms.split(/[,;|]/).map(t => t.trim()).filter(Boolean);
|
|
142
|
+
for (const relTerm of terms) {
|
|
143
|
+
if (relTerm.toLowerCase() !== term.toLowerCase()) {
|
|
144
|
+
relationships.push({
|
|
145
|
+
from: entityId,
|
|
146
|
+
to: relTerm,
|
|
147
|
+
type: VerbType.RelatedTo,
|
|
148
|
+
confidence: 0.9,
|
|
149
|
+
evidence: `Explicitly listed in "Related" column`
|
|
150
|
+
});
|
|
151
|
+
}
|
|
151
152
|
}
|
|
152
153
|
}
|
|
153
154
|
}
|
|
155
|
+
return {
|
|
156
|
+
term,
|
|
157
|
+
entityId,
|
|
158
|
+
mainEntity,
|
|
159
|
+
mainEntityType,
|
|
160
|
+
relatedEntities,
|
|
161
|
+
relationships,
|
|
162
|
+
concepts
|
|
163
|
+
};
|
|
164
|
+
}));
|
|
165
|
+
// Process chunk results sequentially to maintain order
|
|
166
|
+
for (const result of chunkResults) {
|
|
167
|
+
// Store entity ID mapping
|
|
168
|
+
entityMap.set(result.term.toLowerCase(), result.entityId);
|
|
169
|
+
// Track statistics
|
|
170
|
+
this.updateStats(stats, result.mainEntityType, result.mainEntity.confidence);
|
|
171
|
+
// Add extracted row
|
|
172
|
+
extractedRows.push({
|
|
173
|
+
entity: result.mainEntity,
|
|
174
|
+
relatedEntities: result.relatedEntities.map(e => ({
|
|
175
|
+
name: e.text,
|
|
176
|
+
type: e.type,
|
|
177
|
+
confidence: e.confidence
|
|
178
|
+
})),
|
|
179
|
+
relationships: result.relationships,
|
|
180
|
+
concepts: result.concepts
|
|
181
|
+
});
|
|
154
182
|
}
|
|
155
|
-
//
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
relationships,
|
|
164
|
-
concepts
|
|
165
|
-
});
|
|
166
|
-
// Report progress
|
|
183
|
+
// Update progress tracking
|
|
184
|
+
totalProcessed += chunk.length;
|
|
185
|
+
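// Calculate performance metrics (NOTE(review): remainingRows / rowsPerSecond yields seconds, but eta is documented as ms and the coordinator divides it by 1000 — confirm units)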
|
|
186
|
+
const elapsed = Date.now() - performanceStartTime;
|
|
187
|
+
const rowsPerSecond = totalProcessed / (elapsed / 1000);
|
|
188
|
+
const remainingRows = rows.length - totalProcessed;
|
|
189
|
+
const estimatedTimeRemaining = remainingRows / rowsPerSecond;
|
|
190
|
+
// Report progress with enhanced metrics
|
|
167
191
|
opts.onProgress({
|
|
168
|
-
processed:
|
|
192
|
+
processed: totalProcessed,
|
|
169
193
|
total: rows.length,
|
|
170
|
-
entities: extractedRows.
|
|
171
|
-
relationships: relationships.length
|
|
194
|
+
entities: extractedRows.reduce((sum, row) => sum + 1 + row.relatedEntities.length, 0),
|
|
195
|
+
relationships: extractedRows.reduce((sum, row) => sum + row.relationships.length, 0),
|
|
196
|
+
// Additional performance metrics (v3.39.0)
|
|
197
|
+
throughput: Math.round(rowsPerSecond * 10) / 10,
|
|
198
|
+
eta: Math.round(estimatedTimeRemaining),
|
|
199
|
+
phase: 'extracting'
|
|
172
200
|
});
|
|
173
201
|
}
|
|
174
202
|
return {
|
|
@@ -25,12 +25,18 @@ export interface SmartExcelOptions extends FormatHandlerOptions {
|
|
|
25
25
|
definitionColumn?: string;
|
|
26
26
|
typeColumn?: string;
|
|
27
27
|
relatedColumn?: string;
|
|
28
|
-
/** Progress callback */
|
|
28
|
+
/** Progress callback (v3.38.0: Enhanced with performance metrics) */
|
|
29
29
|
onProgress?: (stats: {
|
|
30
30
|
processed: number;
|
|
31
31
|
total: number;
|
|
32
32
|
entities: number;
|
|
33
33
|
relationships: number;
|
|
34
|
+
/** Rows per second (v3.38.0) */
|
|
35
|
+
throughput?: number;
|
|
36
|
+
/** Estimated time remaining in ms (v3.38.0) */
|
|
37
|
+
eta?: number;
|
|
38
|
+
/** Current phase (v3.38.0) */
|
|
39
|
+
phase?: string;
|
|
34
40
|
}) => void;
|
|
35
41
|
}
|
|
36
42
|
export interface ExtractedRow {
|
|
@@ -66,114 +66,141 @@ export class SmartExcelImporter {
|
|
|
66
66
|
}
|
|
67
67
|
// Detect column names
|
|
68
68
|
const columns = this.detectColumns(rows[0], opts);
|
|
69
|
-
// Process each row
|
|
69
|
+
// Process each row with BATCHED PARALLEL PROCESSING (v3.38.0)
|
|
70
70
|
const extractedRows = [];
|
|
71
71
|
const entityMap = new Map();
|
|
72
72
|
const stats = {
|
|
73
73
|
byType: {},
|
|
74
74
|
byConfidence: { high: 0, medium: 0, low: 0 }
|
|
75
75
|
};
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
const
|
|
83
|
-
//
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
//
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
}
|
|
76
|
+
// Batch processing configuration
|
|
77
|
+
const CHUNK_SIZE = 10; // Process 10 rows at a time for optimal performance
|
|
78
|
+
let totalProcessed = 0;
|
|
79
|
+
const performanceStartTime = Date.now();
|
|
80
|
+
// Process rows in chunks
|
|
81
|
+
for (let chunkStart = 0; chunkStart < rows.length; chunkStart += CHUNK_SIZE) {
|
|
82
|
+
const chunk = rows.slice(chunkStart, Math.min(chunkStart + CHUNK_SIZE, rows.length));
|
|
83
|
+
// Process chunk in parallel for massive speedup
|
|
84
|
+
const chunkResults = await Promise.all(chunk.map(async (row, chunkIndex) => {
|
|
85
|
+
const i = chunkStart + chunkIndex;
|
|
86
|
+
// Extract data from row
|
|
87
|
+
const term = this.getColumnValue(row, columns.term) || `Entity_${i}`;
|
|
88
|
+
const definition = this.getColumnValue(row, columns.definition) || '';
|
|
89
|
+
const type = this.getColumnValue(row, columns.type);
|
|
90
|
+
const relatedTerms = this.getColumnValue(row, columns.related);
|
|
91
|
+
// Parallel extraction: entities AND concepts at the same time
|
|
92
|
+
const [relatedEntities, concepts] = await Promise.all([
|
|
93
|
+
// Extract entities from definition
|
|
94
|
+
opts.enableNeuralExtraction && definition
|
|
95
|
+
? this.extractor.extract(definition, {
|
|
96
|
+
confidence: opts.confidenceThreshold * 0.8,
|
|
97
|
+
neuralMatching: true,
|
|
98
|
+
cache: { enabled: true }
|
|
99
|
+
}).then(entities =>
|
|
100
|
+
// Filter out the main term from related entities
|
|
101
|
+
entities.filter(e => e.text.toLowerCase() !== term.toLowerCase()))
|
|
102
|
+
: Promise.resolve([]),
|
|
103
|
+
// Extract concepts (in parallel with entity extraction)
|
|
104
|
+
opts.enableConceptExtraction && definition
|
|
105
|
+
? this.brain.extractConcepts(definition, { limit: 10 }).catch(() => [])
|
|
106
|
+
: Promise.resolve([])
|
|
107
|
+
]);
|
|
108
|
+
// Determine main entity type
|
|
109
|
+
const mainEntityType = type ?
|
|
110
|
+
this.mapTypeString(type) :
|
|
111
|
+
(relatedEntities.length > 0 ? relatedEntities[0].type : NounType.Thing);
|
|
112
|
+
// Generate entity ID
|
|
113
|
+
const entityId = this.generateEntityId(term);
|
|
114
|
+
// Create main entity
|
|
115
|
+
const mainEntity = {
|
|
116
|
+
id: entityId,
|
|
117
|
+
name: term,
|
|
118
|
+
type: mainEntityType,
|
|
119
|
+
description: definition,
|
|
120
|
+
confidence: 0.95,
|
|
121
|
+
metadata: {
|
|
122
|
+
source: 'excel',
|
|
123
|
+
row: i + 1,
|
|
124
|
+
originalData: row,
|
|
125
|
+
concepts,
|
|
126
|
+
extractedAt: Date.now()
|
|
127
|
+
}
|
|
128
|
+
};
|
|
129
|
+
// Infer relationships
|
|
130
|
+
const relationships = [];
|
|
131
|
+
if (opts.enableRelationshipInference) {
|
|
132
|
+
// Extract relationships from definition text
|
|
133
|
+
for (const relEntity of relatedEntities) {
|
|
134
|
+
const verbType = await this.inferRelationship(term, relEntity.text, definition);
|
|
135
|
+
relationships.push({
|
|
136
|
+
from: entityId,
|
|
137
|
+
to: relEntity.text,
|
|
138
|
+
type: verbType,
|
|
139
|
+
confidence: relEntity.confidence,
|
|
140
|
+
evidence: `Extracted from: "${definition.substring(0, 100)}..."`
|
|
141
|
+
});
|
|
142
|
+
}
|
|
143
|
+
// Parse explicit "Related Terms" column
|
|
144
|
+
if (relatedTerms) {
|
|
145
|
+
const terms = relatedTerms.split(/[,;]/).map(t => t.trim()).filter(Boolean);
|
|
146
|
+
for (const relTerm of terms) {
|
|
147
|
+
if (relTerm.toLowerCase() !== term.toLowerCase()) {
|
|
148
|
+
relationships.push({
|
|
149
|
+
from: entityId,
|
|
150
|
+
to: relTerm,
|
|
151
|
+
type: VerbType.RelatedTo,
|
|
152
|
+
confidence: 0.9,
|
|
153
|
+
evidence: `Explicitly listed in "Related" column`
|
|
154
|
+
});
|
|
155
|
+
}
|
|
156
156
|
}
|
|
157
157
|
}
|
|
158
158
|
}
|
|
159
|
+
return {
|
|
160
|
+
term,
|
|
161
|
+
entityId,
|
|
162
|
+
mainEntity,
|
|
163
|
+
mainEntityType,
|
|
164
|
+
relatedEntities,
|
|
165
|
+
relationships,
|
|
166
|
+
concepts
|
|
167
|
+
};
|
|
168
|
+
}));
|
|
169
|
+
// Process chunk results sequentially to maintain order
|
|
170
|
+
for (const result of chunkResults) {
|
|
171
|
+
// Store entity ID mapping
|
|
172
|
+
entityMap.set(result.term.toLowerCase(), result.entityId);
|
|
173
|
+
// Track statistics
|
|
174
|
+
this.updateStats(stats, result.mainEntityType, result.mainEntity.confidence);
|
|
175
|
+
// Add extracted row
|
|
176
|
+
extractedRows.push({
|
|
177
|
+
entity: result.mainEntity,
|
|
178
|
+
relatedEntities: result.relatedEntities.map(e => ({
|
|
179
|
+
name: e.text,
|
|
180
|
+
type: e.type,
|
|
181
|
+
confidence: e.confidence
|
|
182
|
+
})),
|
|
183
|
+
relationships: result.relationships,
|
|
184
|
+
concepts: result.concepts
|
|
185
|
+
});
|
|
159
186
|
}
|
|
160
|
-
//
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
relationships,
|
|
169
|
-
concepts
|
|
170
|
-
});
|
|
171
|
-
// Report progress
|
|
187
|
+
// Update progress tracking
|
|
188
|
+
totalProcessed += chunk.length;
|
|
189
|
+
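// Calculate performance metrics (NOTE(review): remainingRows / rowsPerSecond yields seconds, but eta is documented as ms and the coordinator divides it by 1000 — confirm units)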
|
|
190
|
+
const elapsed = Date.now() - performanceStartTime;
|
|
191
|
+
const rowsPerSecond = totalProcessed / (elapsed / 1000);
|
|
192
|
+
const remainingRows = rows.length - totalProcessed;
|
|
193
|
+
const estimatedTimeRemaining = remainingRows / rowsPerSecond;
|
|
194
|
+
// Report progress with enhanced metrics
|
|
172
195
|
opts.onProgress({
|
|
173
|
-
processed:
|
|
196
|
+
processed: totalProcessed,
|
|
174
197
|
total: rows.length,
|
|
175
|
-
entities: extractedRows.
|
|
176
|
-
relationships: relationships.length
|
|
198
|
+
entities: extractedRows.reduce((sum, row) => sum + 1 + row.relatedEntities.length, 0),
|
|
199
|
+
relationships: extractedRows.reduce((sum, row) => sum + row.relationships.length, 0),
|
|
200
|
+
// Additional performance metrics (v3.38.0)
|
|
201
|
+
throughput: Math.round(rowsPerSecond * 10) / 10,
|
|
202
|
+
eta: Math.round(estimatedTimeRemaining),
|
|
203
|
+
phase: 'extracting'
|
|
177
204
|
});
|
|
178
205
|
}
|
|
179
206
|
return {
|
|
@@ -27,12 +27,18 @@ export interface SmartPDFOptions extends FormatHandlerOptions {
|
|
|
27
27
|
extractFromTables?: boolean;
|
|
28
28
|
/** Group by page or full document */
|
|
29
29
|
groupBy?: 'page' | 'document';
|
|
30
|
-
/** Progress callback */
|
|
30
|
+
/** Progress callback (v3.39.0: Enhanced with performance metrics) */
|
|
31
31
|
onProgress?: (stats: {
|
|
32
32
|
processed: number;
|
|
33
33
|
total: number;
|
|
34
34
|
entities: number;
|
|
35
35
|
relationships: number;
|
|
36
|
+
/** Sections per second (v3.39.0) */
|
|
37
|
+
throughput?: number;
|
|
38
|
+
/** Estimated time remaining in ms (v3.39.0) */
|
|
39
|
+
eta?: number;
|
|
40
|
+
/** Current phase (v3.39.0) */
|
|
41
|
+
phase?: string;
|
|
36
42
|
}) => void;
|
|
37
43
|
}
|
|
38
44
|
export interface ExtractedSection {
|
|
@@ -55,7 +55,7 @@ export class SmartPDFImporter {
|
|
|
55
55
|
}
|
|
56
56
|
// Group data by page or combine into single document
|
|
57
57
|
const grouped = this.groupData(data, opts);
|
|
58
|
-
// Process each group
|
|
58
|
+
// Process each group with BATCHED PARALLEL PROCESSING (v3.39.0)
|
|
59
59
|
const sections = [];
|
|
60
60
|
const entityMap = new Map();
|
|
61
61
|
const stats = {
|
|
@@ -63,17 +63,35 @@ export class SmartPDFImporter {
|
|
|
63
63
|
byConfidence: { high: 0, medium: 0, low: 0 },
|
|
64
64
|
bySource: { paragraphs: 0, tables: 0 }
|
|
65
65
|
};
|
|
66
|
-
|
|
66
|
+
// Batch processing configuration
|
|
67
|
+
const CHUNK_SIZE = 5; // Process 5 sections at a time (smaller than rows due to section size)
|
|
68
|
+
let totalProcessed = 0;
|
|
69
|
+
const performanceStartTime = Date.now();
|
|
67
70
|
const totalGroups = grouped.length;
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
71
|
+
// Process sections in chunks
|
|
72
|
+
for (let chunkStart = 0; chunkStart < grouped.length; chunkStart += CHUNK_SIZE) {
|
|
73
|
+
const chunk = grouped.slice(chunkStart, Math.min(chunkStart + CHUNK_SIZE, grouped.length));
|
|
74
|
+
// Process chunk in parallel for better performance
|
|
75
|
+
const chunkResults = await Promise.all(chunk.map(group => this.processSection(group, opts, stats, entityMap)));
|
|
76
|
+
// Add results sequentially to maintain order
|
|
77
|
+
sections.push(...chunkResults);
|
|
78
|
+
// Update progress tracking
|
|
79
|
+
totalProcessed += chunk.length;
|
|
80
|
+
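// Calculate performance metrics (NOTE(review): remainingSections / sectionsPerSecond yields seconds, but eta is documented as ms and the coordinator divides it by 1000 — confirm units)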
|
|
81
|
+
const elapsed = Date.now() - performanceStartTime;
|
|
82
|
+
const sectionsPerSecond = totalProcessed / (elapsed / 1000);
|
|
83
|
+
const remainingSections = grouped.length - totalProcessed;
|
|
84
|
+
const estimatedTimeRemaining = remainingSections / sectionsPerSecond;
|
|
85
|
+
// Report progress with enhanced metrics
|
|
72
86
|
opts.onProgress({
|
|
73
|
-
processed:
|
|
87
|
+
processed: totalProcessed,
|
|
74
88
|
total: totalGroups,
|
|
75
89
|
entities: sections.reduce((sum, s) => sum + s.entities.length, 0),
|
|
76
|
-
relationships: sections.reduce((sum, s) => sum + s.relationships.length, 0)
|
|
90
|
+
relationships: sections.reduce((sum, s) => sum + s.relationships.length, 0),
|
|
91
|
+
// Additional performance metrics (v3.39.0)
|
|
92
|
+
throughput: Math.round(sectionsPerSecond * 10) / 10,
|
|
93
|
+
eta: Math.round(estimatedTimeRemaining),
|
|
94
|
+
phase: 'extracting'
|
|
77
95
|
});
|
|
78
96
|
}
|
|
79
97
|
const pagesProcessed = new Set(data.map(d => d._page)).size;
|
|
@@ -150,25 +168,21 @@ export class SmartPDFImporter {
|
|
|
150
168
|
}
|
|
151
169
|
}
|
|
152
170
|
const combinedText = texts.join('\n\n');
|
|
153
|
-
//
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
catch (error) {
|
|
169
|
-
concepts = [];
|
|
170
|
-
}
|
|
171
|
-
}
|
|
171
|
+
// Parallel extraction: entities AND concepts at the same time (v3.39.0)
|
|
172
|
+
const [extractedEntities, concepts] = await Promise.all([
|
|
173
|
+
// Extract entities if enabled
|
|
174
|
+
options.enableNeuralExtraction && combinedText.length > 0
|
|
175
|
+
? this.extractor.extract(combinedText, {
|
|
176
|
+
confidence: options.confidenceThreshold || 0.6,
|
|
177
|
+
neuralMatching: true,
|
|
178
|
+
cache: { enabled: true }
|
|
179
|
+
})
|
|
180
|
+
: Promise.resolve([]),
|
|
181
|
+
// Extract concepts (in parallel with entity extraction)
|
|
182
|
+
options.enableConceptExtraction && combinedText.length > 0
|
|
183
|
+
? this.brain.extractConcepts(combinedText, { limit: 15 }).catch(() => [])
|
|
184
|
+
: Promise.resolve([])
|
|
185
|
+
]);
|
|
172
186
|
// Create entity objects
|
|
173
187
|
const entities = extractedEntities.map(e => {
|
|
174
188
|
const entityId = this.generateEntityId(e.text, group.id);
|
|
@@ -24,6 +24,8 @@ export declare class NeuralEntityExtractor {
|
|
|
24
24
|
private typeEmbeddings;
|
|
25
25
|
private initialized;
|
|
26
26
|
private cache;
|
|
27
|
+
private embeddingCache;
|
|
28
|
+
private embeddingCacheStats;
|
|
27
29
|
constructor(brain: Brainy | Brainy<any>, cacheOptions?: EntityCacheOptions);
|
|
28
30
|
/**
|
|
29
31
|
* Initialize type embeddings for neural matching
|
|
@@ -61,7 +63,10 @@ export declare class NeuralEntityExtractor {
|
|
|
61
63
|
*/
|
|
62
64
|
private classifyByRules;
|
|
63
65
|
/**
|
|
64
|
-
* Get embedding for text
|
|
66
|
+
* Get embedding for text with caching (v3.38.0)
|
|
67
|
+
*
|
|
68
|
+
* PERFORMANCE OPTIMIZATION: Caches embeddings during extraction session
|
|
69
|
+
* to avoid redundant model calls for repeated text (common in large imports)
|
|
65
70
|
*/
|
|
66
71
|
private getEmbedding;
|
|
67
72
|
/**
|
|
@@ -96,4 +101,27 @@ export declare class NeuralEntityExtractor {
|
|
|
96
101
|
* Cleanup expired cache entries
|
|
97
102
|
*/
|
|
98
103
|
cleanupCache(): number;
|
|
104
|
+
/**
|
|
105
|
+
* Clear embedding cache (v3.38.0)
|
|
106
|
+
*
|
|
107
|
+
* Clears the runtime embedding cache. Useful for:
|
|
108
|
+
* - Freeing memory after large imports
|
|
109
|
+
* - Testing with fresh cache state
|
|
110
|
+
*/
|
|
111
|
+
clearEmbeddingCache(): void;
|
|
112
|
+
/**
|
|
113
|
+
* Get embedding cache statistics (v3.38.0)
|
|
114
|
+
*
|
|
115
|
+
* Returns performance metrics for the embedding cache:
|
|
116
|
+
* - hits: Number of cache hits (avoided model calls)
|
|
117
|
+
* - misses: Number of cache misses (required model calls)
|
|
118
|
+
* - size: Current cache size
|
|
119
|
+
* - hitRate: Fraction of requests served from cache, in the range 0–1 (hits / (hits + misses))
|
|
120
|
+
*/
|
|
121
|
+
getEmbeddingCacheStats(): {
|
|
122
|
+
hitRate: number;
|
|
123
|
+
hits: number;
|
|
124
|
+
misses: number;
|
|
125
|
+
size: number;
|
|
126
|
+
};
|
|
99
127
|
}
|
|
@@ -12,6 +12,14 @@ export class NeuralEntityExtractor {
|
|
|
12
12
|
// Type embeddings for similarity matching
|
|
13
13
|
this.typeEmbeddings = new Map();
|
|
14
14
|
this.initialized = false;
|
|
15
|
+
// Runtime embedding cache for performance (v3.38.0)
|
|
16
|
+
// Caches candidate embeddings during an extraction session to avoid redundant model calls
|
|
17
|
+
this.embeddingCache = new Map();
|
|
18
|
+
this.embeddingCacheStats = {
|
|
19
|
+
hits: 0,
|
|
20
|
+
misses: 0,
|
|
21
|
+
size: 0
|
|
22
|
+
};
|
|
15
23
|
this.brain = brain;
|
|
16
24
|
this.cache = new EntityExtractionCache(cacheOptions);
|
|
17
25
|
}
|
|
@@ -253,20 +261,46 @@ export class NeuralEntityExtractor {
|
|
|
253
261
|
return { type: NounType.Thing, confidence: 0.3 };
|
|
254
262
|
}
|
|
255
263
|
/**
|
|
256
|
-
* Get embedding for text
|
|
264
|
+
* Get embedding for text with caching (v3.38.0)
|
|
265
|
+
*
|
|
266
|
+
* PERFORMANCE OPTIMIZATION: Caches embeddings during extraction session
|
|
267
|
+
* to avoid redundant model calls for repeated text (common in large imports)
|
|
257
268
|
*/
|
|
258
269
|
async getEmbedding(text) {
|
|
270
|
+
// Normalize text for cache key
|
|
271
|
+
const normalizedText = text.trim().toLowerCase();
|
|
272
|
+
// Check cache first
|
|
273
|
+
const cached = this.embeddingCache.get(normalizedText);
|
|
274
|
+
if (cached) {
|
|
275
|
+
this.embeddingCacheStats.hits++;
|
|
276
|
+
return cached;
|
|
277
|
+
}
|
|
278
|
+
// Cache miss - generate embedding
|
|
279
|
+
this.embeddingCacheStats.misses++;
|
|
280
|
+
let vector;
|
|
259
281
|
if ('embed' in this.brain && typeof this.brain.embed === 'function') {
|
|
260
|
-
|
|
282
|
+
vector = await this.brain.embed(text);
|
|
261
283
|
}
|
|
262
284
|
else {
|
|
263
285
|
// Fallback - create simple hash-based vector
|
|
264
|
-
|
|
286
|
+
vector = new Array(384).fill(0);
|
|
265
287
|
for (let i = 0; i < text.length; i++) {
|
|
266
288
|
vector[i % 384] += text.charCodeAt(i) / 255;
|
|
267
289
|
}
|
|
268
|
-
|
|
290
|
+
vector = vector.map(v => v / text.length);
|
|
291
|
+
}
|
|
292
|
+
// Store in cache
|
|
293
|
+
this.embeddingCache.set(normalizedText, vector);
|
|
294
|
+
this.embeddingCacheStats.size = this.embeddingCache.size;
|
|
295
|
+
// Memory management: Clear cache if it grows too large (>10000 entries)
|
|
296
|
+
if (this.embeddingCache.size > 10000) {
|
|
297
|
+
// Keep the 5000 most recently inserted entries (insertion-order eviction; cache hits do not refresh an entry's position, so this is not true LRU)
|
|
298
|
+
const entries = Array.from(this.embeddingCache.entries());
|
|
299
|
+
this.embeddingCache.clear();
|
|
300
|
+
entries.slice(-5000).forEach(([k, v]) => this.embeddingCache.set(k, v));
|
|
301
|
+
this.embeddingCacheStats.size = this.embeddingCache.size;
|
|
269
302
|
}
|
|
303
|
+
return vector;
|
|
270
304
|
}
|
|
271
305
|
/**
|
|
272
306
|
* Calculate cosine similarity between vectors
|
|
@@ -355,5 +389,36 @@ export class NeuralEntityExtractor {
|
|
|
355
389
|
cleanupCache() {
|
|
356
390
|
return this.cache.cleanup();
|
|
357
391
|
}
|
|
392
|
+
/**
|
|
393
|
+
* Clear embedding cache (v3.38.0)
|
|
394
|
+
*
|
|
395
|
+
* Clears the runtime embedding cache. Useful for:
|
|
396
|
+
* - Freeing memory after large imports
|
|
397
|
+
* - Testing with fresh cache state
|
|
398
|
+
*/
|
|
399
|
+
clearEmbeddingCache() {
|
|
400
|
+
this.embeddingCache.clear();
|
|
401
|
+
this.embeddingCacheStats = {
|
|
402
|
+
hits: 0,
|
|
403
|
+
misses: 0,
|
|
404
|
+
size: 0
|
|
405
|
+
};
|
|
406
|
+
}
|
|
407
|
+
/**
|
|
408
|
+
* Get embedding cache statistics (v3.38.0)
|
|
409
|
+
*
|
|
410
|
+
* Returns performance metrics for the embedding cache:
|
|
411
|
+
* - hits: Number of cache hits (avoided model calls)
|
|
412
|
+
* - misses: Number of cache misses (required model calls)
|
|
413
|
+
* - size: Current cache size
|
|
414
|
+
* - hitRate: Fraction of requests served from cache, in the range 0–1 (hits / (hits + misses))
|
|
415
|
+
*/
|
|
416
|
+
getEmbeddingCacheStats() {
|
|
417
|
+
const total = this.embeddingCacheStats.hits + this.embeddingCacheStats.misses;
|
|
418
|
+
return {
|
|
419
|
+
...this.embeddingCacheStats,
|
|
420
|
+
hitRate: total > 0 ? this.embeddingCacheStats.hits / total : 0
|
|
421
|
+
};
|
|
422
|
+
}
|
|
358
423
|
}
|
|
359
424
|
//# sourceMappingURL=entityExtractor.js.map
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@soulcraft/brainy",
|
|
3
|
-
"version": "3.
|
|
3
|
+
"version": "3.40.0",
|
|
4
4
|
"description": "Universal Knowledge Protocol™ - World's first Triple Intelligence database unifying vector, graph, and document search in one API. 31 nouns × 40 verbs for infinite expressiveness.",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"module": "dist/index.js",
|