@soulcraft/brainy 0.40.0 → 0.43.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +605 -194
- package/dist/augmentations/conduitAugmentations.js +1158 -0
- package/dist/augmentations/conduitAugmentations.js.map +1 -0
- package/dist/augmentations/memoryAugmentations.d.ts +2 -0
- package/dist/augmentations/memoryAugmentations.d.ts.map +1 -1
- package/dist/augmentations/memoryAugmentations.js +270 -0
- package/dist/augmentations/memoryAugmentations.js.map +1 -0
- package/dist/augmentations/serverSearchAugmentations.js +531 -0
- package/dist/augmentations/serverSearchAugmentations.js.map +1 -0
- package/dist/browserFramework.d.ts +15 -0
- package/dist/demo.d.ts +106 -0
- package/dist/examples/basicUsage.js +118 -0
- package/dist/examples/basicUsage.js.map +1 -0
- package/dist/hnsw/distributedSearch.js +452 -0
- package/dist/hnsw/distributedSearch.js.map +1 -0
- package/dist/hnsw/hnswIndex.js +602 -0
- package/dist/hnsw/hnswIndex.js.map +1 -0
- package/dist/hnsw/hnswIndexOptimized.js +471 -0
- package/dist/hnsw/hnswIndexOptimized.js.map +1 -0
- package/dist/hnsw/optimizedHNSWIndex.js +313 -0
- package/dist/hnsw/optimizedHNSWIndex.js.map +1 -0
- package/dist/hnsw/partitionedHNSWIndex.js +304 -0
- package/dist/hnsw/partitionedHNSWIndex.js.map +1 -0
- package/dist/hnsw/scaledHNSWSystem.js +559 -0
- package/dist/hnsw/scaledHNSWSystem.js.map +1 -0
- package/dist/index.d.ts +5 -3
- package/dist/index.js +81 -0
- package/dist/mcp/brainyMCPAdapter.js +142 -0
- package/dist/mcp/brainyMCPAdapter.js.map +1 -0
- package/dist/mcp/brainyMCPService.js +248 -0
- package/dist/mcp/brainyMCPService.js.map +1 -0
- package/dist/mcp/index.js +17 -0
- package/dist/mcp/index.js.map +1 -0
- package/dist/mcp/mcpAugmentationToolset.js +180 -0
- package/dist/mcp/mcpAugmentationToolset.js.map +1 -0
- package/dist/storage/adapters/baseStorageAdapter.js +349 -0
- package/dist/storage/adapters/baseStorageAdapter.js.map +1 -0
- package/dist/storage/adapters/batchS3Operations.js +287 -0
- package/dist/storage/adapters/batchS3Operations.js.map +1 -0
- package/dist/storage/adapters/fileSystemStorage.js +846 -0
- package/dist/storage/adapters/fileSystemStorage.js.map +1 -0
- package/dist/storage/adapters/memoryStorage.js +532 -0
- package/dist/storage/adapters/memoryStorage.js.map +1 -0
- package/dist/storage/adapters/opfsStorage.d.ts.map +1 -1
- package/dist/storage/adapters/opfsStorage.js +1118 -0
- package/dist/storage/adapters/opfsStorage.js.map +1 -0
- package/dist/storage/adapters/optimizedS3Search.d.ts +79 -0
- package/dist/storage/adapters/optimizedS3Search.d.ts.map +1 -0
- package/dist/storage/adapters/optimizedS3Search.js +248 -0
- package/dist/storage/adapters/optimizedS3Search.js.map +1 -0
- package/dist/storage/adapters/s3CompatibleStorage.d.ts +21 -0
- package/dist/storage/adapters/s3CompatibleStorage.d.ts.map +1 -1
- package/dist/storage/adapters/s3CompatibleStorage.js +2026 -0
- package/dist/storage/adapters/s3CompatibleStorage.js.map +1 -0
- package/dist/storage/baseStorage.d.ts +1 -0
- package/dist/storage/baseStorage.d.ts.map +1 -1
- package/dist/storage/baseStorage.js +603 -0
- package/dist/storage/baseStorage.js.map +1 -0
- package/dist/storage/cacheManager.js +1306 -0
- package/dist/storage/cacheManager.js.map +1 -0
- package/dist/storage/enhancedCacheManager.js +520 -0
- package/dist/storage/enhancedCacheManager.js.map +1 -0
- package/dist/storage/readOnlyOptimizations.js +425 -0
- package/dist/storage/readOnlyOptimizations.js.map +1 -0
- package/dist/storage/storageFactory.d.ts +0 -1
- package/dist/storage/storageFactory.d.ts.map +1 -1
- package/dist/storage/storageFactory.js +227 -0
- package/dist/storage/storageFactory.js.map +1 -0
- package/dist/types/augmentations.js +16 -0
- package/dist/types/augmentations.js.map +1 -0
- package/dist/types/brainyDataInterface.js +8 -0
- package/dist/types/brainyDataInterface.js.map +1 -0
- package/dist/types/distributedTypes.js +6 -0
- package/dist/types/distributedTypes.js.map +1 -0
- package/dist/types/fileSystemTypes.js +8 -0
- package/dist/types/fileSystemTypes.js.map +1 -0
- package/dist/types/graphTypes.js +247 -0
- package/dist/types/graphTypes.js.map +1 -0
- package/dist/types/mcpTypes.js +22 -0
- package/dist/types/mcpTypes.js.map +1 -0
- package/dist/types/paginationTypes.js +5 -0
- package/dist/types/paginationTypes.js.map +1 -0
- package/dist/types/pipelineTypes.js +7 -0
- package/dist/types/pipelineTypes.js.map +1 -0
- package/dist/types/tensorflowTypes.js +6 -0
- package/dist/types/tensorflowTypes.js.map +1 -0
- package/dist/unified.js +52 -128048
- package/dist/utils/autoConfiguration.js +341 -0
- package/dist/utils/autoConfiguration.js.map +1 -0
- package/dist/utils/cacheAutoConfig.js +261 -0
- package/dist/utils/cacheAutoConfig.js.map +1 -0
- package/dist/utils/crypto.js +45 -0
- package/dist/utils/crypto.js.map +1 -0
- package/dist/utils/distance.js +239 -0
- package/dist/utils/distance.js.map +1 -0
- package/dist/utils/embedding.d.ts.map +1 -1
- package/dist/utils/embedding.js +702 -0
- package/dist/utils/embedding.js.map +1 -0
- package/dist/utils/environment.js +75 -0
- package/dist/utils/environment.js.map +1 -0
- package/dist/utils/fieldNameTracking.js +90 -0
- package/dist/utils/fieldNameTracking.js.map +1 -0
- package/dist/utils/index.d.ts +1 -0
- package/dist/utils/index.d.ts.map +1 -1
- package/dist/utils/index.js +8 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/jsonProcessing.js +179 -0
- package/dist/utils/jsonProcessing.js.map +1 -0
- package/dist/utils/logger.d.ts +45 -92
- package/dist/utils/logger.d.ts.map +1 -1
- package/dist/utils/logger.js +129 -0
- package/dist/utils/logger.js.map +1 -0
- package/dist/utils/operationUtils.js +126 -0
- package/dist/utils/operationUtils.js.map +1 -0
- package/dist/utils/robustModelLoader.d.ts +14 -0
- package/dist/utils/robustModelLoader.d.ts.map +1 -1
- package/dist/utils/robustModelLoader.js +537 -0
- package/dist/utils/robustModelLoader.js.map +1 -0
- package/dist/utils/searchCache.js +248 -0
- package/dist/utils/searchCache.js.map +1 -0
- package/dist/utils/statistics.js +25 -0
- package/dist/utils/statistics.js.map +1 -0
- package/dist/utils/statisticsCollector.js +224 -0
- package/dist/utils/statisticsCollector.js.map +1 -0
- package/dist/utils/textEncoding.js +309 -0
- package/dist/utils/textEncoding.js.map +1 -0
- package/dist/utils/typeUtils.js +40 -0
- package/dist/utils/typeUtils.js.map +1 -0
- package/dist/utils/version.d.ts +15 -3
- package/dist/utils/version.d.ts.map +1 -1
- package/dist/utils/version.js +24 -0
- package/dist/utils/version.js.map +1 -0
- package/dist/utils/workerUtils.js +458 -0
- package/dist/utils/workerUtils.js.map +1 -0
- package/package.json +23 -15
- package/dist/brainy.js +0 -90220
- package/dist/brainy.min.js +0 -12511
- package/dist/patched-platform-node.d.ts +0 -17
- package/dist/statistics/statisticsManager.d.ts +0 -121
- package/dist/storage/fileSystemStorage.d.ts +0 -73
- package/dist/storage/fileSystemStorage.d.ts.map +0 -1
- package/dist/storage/opfsStorage.d.ts +0 -236
- package/dist/storage/opfsStorage.d.ts.map +0 -1
- package/dist/storage/s3CompatibleStorage.d.ts +0 -157
- package/dist/storage/s3CompatibleStorage.d.ts.map +0 -1
- package/dist/testing/prettyReporter.d.ts +0 -23
- package/dist/testing/prettySummaryReporter.d.ts +0 -22
- package/dist/unified.min.js +0 -16153
- package/dist/utils/environmentDetection.d.ts +0 -47
- package/dist/utils/environmentDetection.d.ts.map +0 -1
- package/dist/utils/tensorflowUtils.d.ts +0 -17
- package/dist/utils/tensorflowUtils.d.ts.map +0 -1
|
@@ -0,0 +1,702 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Embedding functions for converting data to vectors
|
|
3
|
+
*/
|
|
4
|
+
import { executeInThread } from './workerUtils.js';
|
|
5
|
+
import { isBrowser } from './environment.js';
|
|
6
|
+
import { createRobustModelLoader, getUniversalSentenceEncoderFallbacks } from './robustModelLoader.js';
|
|
7
|
+
/**
 * TensorFlow.js-backed text embedder producing 512-dimension vectors.
 * Model loading is delegated entirely to the robust loader (retry/timeout/
 * fallback URLs); in test environments a deterministic mock model is used.
 */
export class UniversalSentenceEncoder {
    /**
     * Create a new UniversalSentenceEncoder instance
     * @param options Configuration options including reliability settings
     */
    constructor(options = {}) {
        this.model = null;
        this.initialized = false;
        this.tf = null;
        this.use = null; // retained for backward compatibility; not used in this file
        this.backend = 'cpu'; // Default to CPU; upgraded to WebGL in browsers when available
        // Whether to log non-essential messages (errors are always logged).
        // Note: must be set before building the loader so it inherits the setting.
        this.verbose = options.verbose !== undefined ? options.verbose : true;
        // Robust model loader encapsulates retries, timeouts and fallback URLs
        this.robustLoader = createRobustModelLoader({
            maxRetries: options.maxRetries ?? 3,
            initialRetryDelay: options.initialRetryDelay ?? 1000,
            maxRetryDelay: options.maxRetryDelay ?? 30000,
            timeout: options.timeout ?? 60000,
            useExponentialBackoff: options.useExponentialBackoff ?? true,
            fallbackUrls: options.fallbackUrls ?? getUniversalSentenceEncoderFallbacks(),
            verbose: this.verbose,
            preferLocalModel: options.preferLocalModel ?? true
        });
    }
    /**
     * Resolve the global object for the current JS environment
     * (globalThis, Node's global, or a worker's self).
     * @returns The global object, or null when none can be found
     */
    getGlobalObject() {
        if (typeof globalThis !== 'undefined')
            return globalThis;
        if (typeof global !== 'undefined')
            return global;
        if (typeof self !== 'undefined')
            return self;
        return null;
    }
    /**
     * Add polyfills and patches for TensorFlow.js compatibility
     * This addresses issues with TensorFlow.js across all server environments
     * (Node.js, serverless, and other server environments)
     *
     * Note: The main TensorFlow.js patching is now centralized in textEncoding.ts
     * and applied through setup.ts. This method only adds additional utility functions
     * that might be needed by TensorFlow.js.
     */
    addServerCompatibilityPolyfills() {
        // Apply in all non-browser environments (Node.js, serverless, server environments)
        if (isBrowser()) {
            return; // Browser environments don't need these polyfills
        }
        // Fall back to a throwaway object so property writes below cannot throw
        const globalObj = this.getGlobalObject() ?? {};
        // Add polyfill for utility functions across all server environments.
        // This fixes issues like "Cannot read properties of undefined (reading 'isFloat32Array')"
        try {
            // Ensure the util object exists
            if (!globalObj.util) {
                globalObj.util = {};
            }
            // Add isFloat32Array method if it doesn't exist
            if (!globalObj.util.isFloat32Array) {
                globalObj.util.isFloat32Array = (obj) => {
                    return !!(obj instanceof Float32Array ||
                        (obj &&
                            Object.prototype.toString.call(obj) === '[object Float32Array]'));
                };
            }
            // Add isTypedArray method if it doesn't exist
            if (!globalObj.util.isTypedArray) {
                globalObj.util.isTypedArray = (obj) => {
                    return !!(ArrayBuffer.isView(obj) && !(obj instanceof DataView));
                };
            }
        }
        catch (error) {
            console.warn('Failed to add utility polyfills:', error);
        }
    }
    /**
     * Check if we're running in a test environment
     * (NODE_ENV/VITEST env vars, vitest global, or vitest on the command line)
     */
    isTestEnvironment() {
        // Safely check for Node.js environment first
        if (typeof process === 'undefined') {
            return false;
        }
        return (process.env.NODE_ENV === 'test' ||
            process.env.VITEST === 'true' ||
            (typeof global !== 'undefined' && global.__vitest__) ||
            process.argv.some((arg) => arg.includes('vitest')));
    }
    /**
     * Log message only if verbose mode is enabled or if it's an error
     * This helps suppress non-essential log messages
     * @param level Console method name ('log' | 'warn' | 'error')
     * @param message Message to log
     * @param args Additional arguments forwarded to the console method
     */
    logger(level, message, ...args) {
        // Always log errors, but only log other messages if verbose mode is enabled
        if (level === 'error' || this.verbose) {
            console[level](message, ...args);
        }
    }
    /**
     * Pad or truncate an embedding so it is exactly 512 dimensions
     * (the standard width for the Universal Sentence Encoder).
     * @param embedding Raw embedding vector
     * @param label Prefix used in the mismatch warning message
     * @returns The same array when already 512-wide, otherwise a corrected copy
     */
    standardizeEmbedding(embedding, label = 'Embedding') {
        if (embedding.length === 512) {
            return embedding;
        }
        this.logger('warn', `${label} dimension mismatch: expected 512, got ${embedding.length}. Standardizing to 512 dimensions.`);
        if (embedding.length < 512) {
            // Too short: pad with zeros
            return [...embedding, ...new Array(512 - embedding.length).fill(0)];
        }
        // Too long: truncate
        return embedding.slice(0, 512);
    }
    /**
     * Load the Universal Sentence Encoder model with robust retry and fallback mechanisms
     * @param loadFunction The function to load the model from TensorFlow Hub
     */
    async loadModelFromLocal(loadFunction) {
        this.logger('log', 'Loading Universal Sentence Encoder model with robust loader...');
        try {
            // Use the robust model loader to handle all retry logic, timeouts, and fallbacks
            const model = await this.robustLoader.loadModel(loadFunction, 'universal-sentence-encoder');
            this.logger('log', 'Successfully loaded Universal Sentence Encoder model');
            return model;
        }
        catch (error) {
            const errorMessage = error instanceof Error ? error.message : String(error);
            this.logger('error', `Failed to load Universal Sentence Encoder model: ${errorMessage}`);
            // Log loading statistics for debugging
            const stats = this.robustLoader.getLoadingStats();
            if (Object.keys(stats).length > 0) {
                this.logger('log', 'Loading attempt statistics:', stats);
            }
            throw error;
        }
    }
    /**
     * Initialize the embedding model.
     * In test environments a deterministic mock model is installed instead.
     * @throws Error when TensorFlow.js or the model cannot be loaded
     */
    async init() {
        // Use a mock implementation in test environments
        if (this.isTestEnvironment()) {
            this.logger('log', 'Using mock Universal Sentence Encoder for tests');
            // Create a mock model that returns fixed embeddings
            this.model = {
                embed: async (sentences) => {
                    // Create a tensor-like object with a mock array method
                    return {
                        array: async () => {
                            // Return fixed embeddings for each input sentence
                            const inputArray = Array.isArray(sentences)
                                ? sentences
                                : [sentences];
                            return inputArray.map(() => new Array(512).fill(0).map((_, i) => (i % 2 === 0 ? 0.1 : -0.1)));
                        },
                        dispose: () => { }
                    };
                }
            };
            this.initialized = true;
            return;
        }
        // Override console.warn to suppress the TensorFlow.js Node.js backend banner.
        // FIX: restoration now happens in `finally` — previously a failure during
        // initialization rethrew before the restore line, leaving console.warn
        // patched for the rest of the process.
        const originalWarn = console.warn;
        console.warn = function (message, ...optionalParams) {
            if (message &&
                typeof message === 'string' &&
                message.includes('Hi, looks like you are running TensorFlow.js in Node.js')) {
                return; // Suppress the specific warning
            }
            originalWarn(message, ...optionalParams);
        };
        try {
            // Add polyfills for TensorFlow.js compatibility
            this.addServerCompatibilityPolyfills();
            try {
                // CRITICAL: Ensure TextEncoder/TextDecoder are available before TensorFlow.js loads
                const globalObj = this.getGlobalObject();
                if (globalObj) {
                    // Try to use Node.js util module if available (Node.js environments)
                    try {
                        if (typeof process !== 'undefined' &&
                            process.versions &&
                            process.versions.node) {
                            const util = await import('util');
                            if (!globalObj.TextEncoder) {
                                globalObj.TextEncoder = util.TextEncoder;
                            }
                            if (!globalObj.TextDecoder) {
                                globalObj.TextDecoder = util.TextDecoder;
                            }
                        }
                    }
                    catch (utilError) {
                        // Fallback to standard TextEncoder/TextDecoder for non-Node.js server environments
                        if (!globalObj.TextEncoder) {
                            globalObj.TextEncoder = TextEncoder;
                        }
                        if (!globalObj.TextDecoder) {
                            globalObj.TextDecoder = TextDecoder;
                        }
                    }
                }
                // Apply the TensorFlow.js patch
                const { applyTensorFlowPatch } = await import('./textEncoding.js');
                await applyTensorFlowPatch();
                // Now load TensorFlow.js core module using dynamic imports
                this.tf = await import('@tensorflow/tfjs-core');
                // Import CPU backend (always needed as fallback)
                await import('@tensorflow/tfjs-backend-cpu');
                // Try to import WebGL backend for GPU acceleration in browser environments
                try {
                    if (isBrowser()) {
                        await import('@tensorflow/tfjs-backend-webgl');
                        // Check if WebGL is available
                        try {
                            if (this.tf.setBackend) {
                                await this.tf.setBackend('webgl');
                                this.backend = 'webgl';
                                console.log('Using WebGL backend for TensorFlow.js');
                            }
                            else {
                                console.warn('tf.setBackend is not available, falling back to CPU');
                            }
                        }
                        catch (e) {
                            console.warn('WebGL backend not available, falling back to CPU:', e);
                            this.backend = 'cpu';
                        }
                    }
                }
                catch (error) {
                    console.warn('WebGL backend not available, falling back to CPU:', error);
                    this.backend = 'cpu';
                }
                // Note: @tensorflow-models/universal-sentence-encoder is no longer used
                // Model loading is handled entirely by robustLoader
            }
            catch (error) {
                this.logger('error', 'Failed to initialize TensorFlow.js:', error);
                // No fallback allowed - throw error
                throw new Error(`Universal Sentence Encoder initialization failed: ${error}`);
            }
            // Set the backend
            if (this.tf && this.tf.setBackend) {
                await this.tf.setBackend(this.backend);
            }
            // Load model using robustLoader which handles all loading strategies:
            // 1. @soulcraft/brainy-models package if available (offline mode)
            // 2. Direct TensorFlow.js URL loading as fallback
            try {
                this.model = await this.robustLoader.loadModelWithFallbacks();
                this.initialized = true;
                // If the model doesn't have an embed method but has embedToArrays, wrap it
                if (!this.model.embed && this.model.embedToArrays) {
                    const originalModel = this.model;
                    this.model = {
                        embed: async (sentences) => {
                            const input = Array.isArray(sentences) ? sentences : [sentences];
                            const embeddings = await originalModel.embedToArrays(input);
                            // Return TensorFlow tensor-like object
                            return {
                                array: async () => embeddings,
                                arraySync: () => embeddings
                            };
                        },
                        dispose: () => originalModel.dispose ? originalModel.dispose() : undefined
                    };
                }
            }
            catch (modelError) {
                this.logger('error', 'Failed to load Universal Sentence Encoder model:', modelError);
                throw new Error(`Universal Sentence Encoder model loading failed: ${modelError}`);
            }
        }
        catch (error) {
            this.logger('error', 'Failed to initialize Universal Sentence Encoder:', error);
            // No fallback allowed - throw error
            throw new Error(`Universal Sentence Encoder initialization failed: ${error}`);
        }
        finally {
            // Always restore original console.warn, even on failure
            console.warn = originalWarn;
        }
    }
    /**
     * Embed text into a vector using Universal Sentence Encoder
     * @param data Text to embed (string or string[]; empty input yields a zero vector)
     * @returns A 512-dimension embedding vector
     * @throws Error for unsupported input types or embedding failures
     */
    async embed(data) {
        if (!this.initialized) {
            await this.init();
        }
        try {
            // Handle different input types
            let textToEmbed;
            if (typeof data === 'string') {
                // Handle empty string case
                if (data.trim() === '') {
                    // Return a zero vector of 512 dimensions (standard for Universal Sentence Encoder)
                    return new Array(512).fill(0);
                }
                textToEmbed = [data];
            }
            else if (Array.isArray(data) &&
                data.every((item) => typeof item === 'string')) {
                // Handle empty array or array with empty strings
                if (data.length === 0 || data.every((item) => item.trim() === '')) {
                    return new Array(512).fill(0);
                }
                // Filter out empty strings
                textToEmbed = data.filter((item) => item.trim() !== '');
                if (textToEmbed.length === 0) {
                    return new Array(512).fill(0);
                }
            }
            else {
                throw new Error('UniversalSentenceEncoder only supports string or string[] data');
            }
            // Ensure the model is available
            if (!this.model) {
                throw new Error('Universal Sentence Encoder model is not available');
            }
            // Get embeddings
            const embeddings = await this.model.embed(textToEmbed);
            // Convert to array and return the first embedding
            const embeddingArray = await embeddings.array();
            // Dispose of the tensor to free memory
            embeddings.dispose();
            // Return the first embedding, normalized to exactly 512 dimensions
            return this.standardizeEmbedding(embeddingArray[0], 'Embedding');
        }
        catch (error) {
            this.logger('error', 'Failed to embed text with Universal Sentence Encoder:', error);
            throw new Error(`Universal Sentence Encoder embedding failed: ${error}`);
        }
    }
    /**
     * Embed multiple texts into vectors using Universal Sentence Encoder
     * This is more efficient than calling embed() multiple times
     * @param dataArray Array of texts to embed (empty strings map to zero vectors)
     * @returns Array of 512-dimension embedding vectors, in the input order
     * @throws Error when batch embedding fails
     */
    async embedBatch(dataArray) {
        if (!this.initialized) {
            await this.init();
        }
        try {
            // Handle empty array case
            if (dataArray.length === 0) {
                return [];
            }
            // Filter out empty strings and handle edge cases
            const textToEmbed = dataArray.filter((text) => typeof text === 'string' && text.trim() !== '');
            // If all strings were empty, return appropriate zero vectors
            if (textToEmbed.length === 0) {
                return dataArray.map(() => new Array(512).fill(0));
            }
            // Ensure the model is available
            if (!this.model) {
                throw new Error('Universal Sentence Encoder model is not available');
            }
            // Get embeddings for all texts in a single batch operation
            const embeddings = await this.model.embed(textToEmbed);
            // Convert to array
            const embeddingArray = await embeddings.array();
            // Dispose of the tensor to free memory
            embeddings.dispose();
            // Standardize embeddings to ensure they're all 512 dimensions
            const standardizedEmbeddings = embeddingArray.map((embedding) => this.standardizeEmbedding(embedding, 'Batch embedding'));
            // Map the results back to the original array order:
            // non-empty inputs consume the next embedding, empty inputs get zero vectors
            const results = [];
            let embeddingIndex = 0;
            for (let i = 0; i < dataArray.length; i++) {
                const text = dataArray[i];
                if (typeof text === 'string' && text.trim() !== '') {
                    results.push(standardizedEmbeddings[embeddingIndex]);
                    embeddingIndex++;
                }
                else {
                    results.push(new Array(512).fill(0));
                }
            }
            return results;
        }
        catch (error) {
            this.logger('error', 'Failed to batch embed text with Universal Sentence Encoder:', error);
            throw new Error(`Universal Sentence Encoder batch embedding failed: ${error}`);
        }
    }
    /**
     * Dispose of the model resources
     */
    async dispose() {
        if (this.model && this.tf) {
            try {
                // Dispose of the model and tensors
                this.model.dispose();
                this.tf.disposeVariables();
                this.initialized = false;
            }
            catch (error) {
                this.logger('error', 'Failed to dispose Universal Sentence Encoder:', error);
            }
        }
        return Promise.resolve();
    }
}
|
|
446
|
+
/**
 * Helper function - NO LONGER USED
 * Kept for compatibility but will be removed in next major version
 * @deprecated Since we removed @tensorflow-models/universal-sentence-encoder dependency
 * @param sentenceEncoderModule Imported module (or function) to probe for a load entry point
 * @returns An async zero-arg wrapper around the discovered load function, or null if none found
 */
function findUSELoadFunction(sentenceEncoderModule) {
    // Wrap the discovered function so callers always receive an async, zero-arg
    // loader that uses the model bundled with the package.
    const asLoader = (fn) => async () => await fn();
    const defaultExport = sentenceEncoderModule.default;
    // Probe the well-known export shapes, in priority order.
    if (typeof sentenceEncoderModule.load === 'function') {
        return asLoader(sentenceEncoderModule.load);
    }
    if (defaultExport && typeof defaultExport.load === 'function') {
        return asLoader(defaultExport.load);
    }
    if (typeof defaultExport === 'function') {
        return asLoader(defaultExport);
    }
    if (typeof sentenceEncoderModule === 'function') {
        return asLoader(sentenceEncoderModule);
    }
    if (sentenceEncoderModule.UniversalSentenceEncoder &&
        typeof sentenceEncoderModule.UniversalSentenceEncoder.load === 'function') {
        return asLoader(sentenceEncoderModule.UniversalSentenceEncoder.load);
    }
    if (defaultExport &&
        defaultExport.UniversalSentenceEncoder &&
        typeof defaultExport.UniversalSentenceEncoder.load === 'function') {
        return asLoader(defaultExport.UniversalSentenceEncoder.load);
    }
    // Last resort: scan the module's properties (one level deep) for any
    // function whose name or key mentions "load".
    const looksLikeLoad = (fn, key) => (fn.name || key).toLowerCase().includes('load');
    for (const key in sentenceEncoderModule) {
        const value = sentenceEncoderModule[key];
        if (typeof value === 'function') {
            if (looksLikeLoad(value, key)) {
                return asLoader(value);
            }
        }
        else if (typeof value === 'object' && value !== null) {
            for (const nestedKey in value) {
                const nested = value[nestedKey];
                if (typeof nested === 'function' && looksLikeLoad(nested, nestedKey)) {
                    return asLoader(nested);
                }
            }
        }
    }
    return null;
}
|
|
522
|
+
/**
 * Check if we're running in a test environment (standalone version)
 * Uses the same logic as the class method to avoid duplication
 * @returns Truthy when a test runner (NODE_ENV/VITEST env vars, vitest global,
 *          or vitest on the command line) is detected
 */
function isTestEnvironment() {
    // Non-Node-like runtimes have no `process`; treat them as non-test
    if (typeof process === 'undefined') {
        return false;
    }
    const envSaysTest = process.env.NODE_ENV === 'test' || process.env.VITEST === 'true';
    return (envSaysTest ||
        (typeof global !== 'undefined' && global.__vitest__) ||
        process.argv.some((arg) => arg.includes('vitest')));
}
|
|
537
|
+
/**
 * Log message only if not in test environment and verbose mode is enabled (standalone version)
 * @param level Log level ('log', 'warn', 'error')
 * @param message Message to log
 * @param args Additional arguments to log
 * @param verbose Whether to log non-essential messages (default: true)
 */
function logIfNotTest(level, message, args = [], verbose = true) {
    // Errors always qualify; everything else requires verbose mode
    const shouldEmit = level === 'error' || verbose;
    if (!shouldEmit || isTestEnvironment()) {
        return;
    }
    console[level](message, ...args);
}
|
|
550
|
+
/**
 * Create an embedding function from an embedding model
 * @param model Embedding model to use (optional, defaults to UniversalSentenceEncoder)
 * @returns An async function mapping input data to its embedding vector
 */
export function createEmbeddingFunction(model) {
    // Without an explicit model, fall back to the default TensorFlow-based encoder
    if (!model) {
        return createTensorFlowEmbeddingFunction();
    }
    // Delegate every call straight to the supplied model's embed() method
    return async (data) => model.embed(data);
}
/**
 * Creates a TensorFlow-based Universal Sentence Encoder embedding function
 * This is the required embedding function for all text embeddings
 * Uses a shared model instance for better performance across multiple calls
 * @param options Configuration options
 * @param options.verbose Whether to log non-essential messages (default: true)
 */
// Singleton encoder state shared by every function this factory returns
let sharedModel = null;
let sharedModelInitialized = false;
let sharedModelVerbose = true;
/**
 * Initialize the shared encoder at most once.
 * On failure the flag stays false so a later call can retry.
 */
async function ensureSharedModelReady() {
    if (sharedModelInitialized) {
        return;
    }
    try {
        await sharedModel.init();
        sharedModelInitialized = true;
    }
    catch (initError) {
        // Reset the flag so we can retry initialization on the next call
        sharedModelInitialized = false;
        throw initError;
    }
}
export function createTensorFlowEmbeddingFunction(options = {}) {
    // An explicit option overrides the shared verbosity setting
    if (options.verbose !== undefined) {
        sharedModelVerbose = options.verbose;
    }
    // Lazily construct the singleton model on first use
    if (sharedModel === null) {
        sharedModel = new UniversalSentenceEncoder({ verbose: sharedModelVerbose });
    }
    return async (data) => {
        try {
            await ensureSharedModelReady();
            return await sharedModel.embed(data);
        }
        catch (error) {
            logIfNotTest('error', 'Failed to use Universal Sentence Encoder:', [error], sharedModelVerbose);
            // No fallback - Universal Sentence Encoder is required
            throw new Error(`Universal Sentence Encoder is required and no fallbacks are allowed: ${error}`);
        }
    };
}
/**
 * Default embedding function
 * Uses UniversalSentenceEncoder for all text embeddings
 * TensorFlow.js is required for this to work
 * Uses CPU for compatibility
 * @param options Configuration options
 * @param options.verbose Whether to log non-essential messages (default: true)
 * @returns An async function mapping input data to its embedding vector
 */
export function getDefaultEmbeddingFunction(options = {}) {
    // Thin alias over the TensorFlow factory; kept as a separate export so
    // callers can request "the default" without naming the implementation.
    return createTensorFlowEmbeddingFunction(options);
}
/**
 * Default embedding function with default options
 * Uses UniversalSentenceEncoder for all text embeddings
 * TensorFlow.js is required for this to work
 * Uses CPU for compatibility
 * NOTE: created eagerly at module load, but the underlying encoder's
 * init() still runs lazily on the first embed call.
 */
export const defaultEmbeddingFunction = getDefaultEmbeddingFunction();
/**
 * Creates a batch embedding function that uses UniversalSentenceEncoder
 * TensorFlow.js is required for this to work
 * Processes all items in a single batch operation
 * Uses a shared model instance for better performance across multiple calls
 * @param options Configuration options
 * @param options.verbose Whether to log non-essential messages (default: true)
 */
// Singleton batch-encoder state shared across every factory invocation
let sharedBatchModel = null;
let sharedBatchModelInitialized = false;
let sharedBatchModelVerbose = true;
/**
 * Initialize the shared batch encoder at most once.
 * On failure the flag stays false so a later call can retry.
 */
async function ensureSharedBatchModelReady() {
    if (sharedBatchModelInitialized) {
        return;
    }
    try {
        await sharedBatchModel.init();
        sharedBatchModelInitialized = true;
    }
    catch (initError) {
        // Reset the flag so we can retry initialization on the next call
        sharedBatchModelInitialized = false;
        throw initError;
    }
}
export function createBatchEmbeddingFunction(options = {}) {
    // An explicit option overrides the shared verbosity setting
    if (options.verbose !== undefined) {
        sharedBatchModelVerbose = options.verbose;
    }
    // Lazily construct the singleton model on first use
    if (sharedBatchModel === null) {
        sharedBatchModel = new UniversalSentenceEncoder({
            verbose: sharedBatchModelVerbose
        });
    }
    return async (dataArray) => {
        try {
            await ensureSharedBatchModelReady();
            return await sharedBatchModel.embedBatch(dataArray);
        }
        catch (error) {
            logIfNotTest('error', 'Failed to use Universal Sentence Encoder batch embedding:', [error], sharedBatchModelVerbose);
            // No fallback - Universal Sentence Encoder is required
            throw new Error(`Universal Sentence Encoder is required for batch embedding and no fallbacks are allowed: ${error}`);
        }
    };
}
/**
 * Get a batch embedding function with custom options
 * Uses UniversalSentenceEncoder for all text embeddings
 * TensorFlow.js is required for this to work
 * Processes all items in a single batch operation
 * @param options Configuration options
 * @param options.verbose Whether to log non-essential messages (default: true)
 * @returns An async function mapping an array of inputs to their embeddings
 */
export function getDefaultBatchEmbeddingFunction(options = {}) {
    // Thin alias over the batch factory; kept for API symmetry with
    // getDefaultEmbeddingFunction.
    return createBatchEmbeddingFunction(options);
}
/**
 * Default batch embedding function with default options
 * Uses UniversalSentenceEncoder for all text embeddings
 * TensorFlow.js is required for this to work
 * Processes all items in a single batch operation
 * NOTE: created eagerly at module load, but the underlying encoder's
 * init() still runs lazily on the first batch call.
 */
export const defaultBatchEmbeddingFunction = getDefaultBatchEmbeddingFunction();
/**
 * Creates an embedding function that runs in a separate thread
 * This is a wrapper around createEmbeddingFunction that uses executeInThread
 * @param model Embedding model to use
 * @returns An async function that forwards data to executeInThread along with
 *   the stringified embedding function
 */
export function createThreadedEmbeddingFunction(model) {
    // Build the plain (same-thread) embedding function up front
    const embeddingFunction = createEmbeddingFunction(model);
    return async (data) => {
        // Convert the embedding function to a string
        // NOTE(review): Function.prototype.toString() captures only source
        // text, not the closure scope (e.g. `model` or the shared encoder
        // singletons). If executeInThread ever re-evaluates this string in a
        // real worker context those bindings would be lost -- confirm that
        // executeInThread keeps running on the main thread as the comment
        // below claims.
        const fnString = embeddingFunction.toString();
        // Execute the embedding function in a "thread" (main thread in this implementation)
        return await executeInThread(fnString, data);
    };
}
//# sourceMappingURL=embedding.js.map
|