s3db.js 11.2.4 → 11.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/s3db-cli.js +588 -74
- package/dist/s3db.cjs.js +1361 -23
- package/dist/s3db.cjs.js.map +1 -1
- package/dist/s3db.d.ts +3 -1
- package/dist/s3db.es.js +1359 -24
- package/dist/s3db.es.js.map +1 -1
- package/package.json +2 -1
- package/src/concerns/base62.js +70 -0
- package/src/plugins/index.js +1 -0
- package/src/plugins/vector/distances.js +173 -0
- package/src/plugins/vector/kmeans.js +367 -0
- package/src/plugins/vector/metrics.js +369 -0
- package/src/plugins/vector/vector-error.js +43 -0
- package/src/plugins/vector.plugin.js +687 -0
- package/src/resource.class.js +79 -0
- package/src/s3db.d.ts +3 -1
- package/src/schema.class.js +232 -41
- package/src/validator.class.js +8 -0
|
@@ -0,0 +1,687 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Vector Plugin
|
|
3
|
+
*
|
|
4
|
+
* Provides vector storage, similarity search, and clustering capabilities.
|
|
5
|
+
* Supports multiple distance metrics and automatic K determination.
|
|
6
|
+
*
|
|
7
|
+
* Features:
|
|
8
|
+
* - Vector similarity search (KNN)
|
|
9
|
+
* - K-means clustering
|
|
10
|
+
* - Multiple distance metrics (cosine, euclidean, manhattan)
|
|
11
|
+
* - Optimal K analysis with 5 evaluation metrics
|
|
12
|
+
* - Automatic storage validation for large vectors
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import { Plugin } from './plugin.class.js';
|
|
16
|
+
import { cosineDistance, euclideanDistance, manhattanDistance, dotProduct, normalize } from './vector/distances.js';
|
|
17
|
+
import { kmeans, findOptimalK } from './vector/kmeans.js';
|
|
18
|
+
import { VectorError } from './vector/vector-error.js';
|
|
19
|
+
|
|
20
|
+
export class VectorPlugin extends Plugin {
|
|
21
|
+
constructor(options = {}) {
|
|
22
|
+
super(options);
|
|
23
|
+
|
|
24
|
+
this.config = {
|
|
25
|
+
dimensions: 1536, // Default to OpenAI text-embedding-3-small/3-large
|
|
26
|
+
distanceMetric: 'cosine', // Default metric
|
|
27
|
+
storageThreshold: 1500, // Bytes - warn if vectors exceed this
|
|
28
|
+
autoFixBehavior: false, // Automatically set body-overflow
|
|
29
|
+
autoDetectVectorField: true, // Auto-detect embedding:XXX fields
|
|
30
|
+
emitEvents: true, // Emit events for monitoring
|
|
31
|
+
verboseEvents: false, // Emit detailed progress events
|
|
32
|
+
eventThrottle: 100, // Throttle progress events (ms)
|
|
33
|
+
...options
|
|
34
|
+
};
|
|
35
|
+
|
|
36
|
+
this.distanceFunctions = {
|
|
37
|
+
cosine: cosineDistance,
|
|
38
|
+
euclidean: euclideanDistance,
|
|
39
|
+
manhattan: manhattanDistance
|
|
40
|
+
};
|
|
41
|
+
|
|
42
|
+
// Cache for auto-detected vector fields per resource
|
|
43
|
+
this._vectorFieldCache = new Map();
|
|
44
|
+
|
|
45
|
+
// Throttle state for progress events
|
|
46
|
+
this._throttleState = new Map();
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
async onInstall() {
|
|
50
|
+
this.emit('installed', { plugin: 'VectorPlugin' });
|
|
51
|
+
|
|
52
|
+
// Validate vector storage for all resources
|
|
53
|
+
this.validateVectorStorage();
|
|
54
|
+
|
|
55
|
+
// Add vector methods to all resources
|
|
56
|
+
this.installResourceMethods();
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
async onStart() {
|
|
60
|
+
this.emit('started', { plugin: 'VectorPlugin' });
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
async onStop() {
|
|
64
|
+
this.emit('stopped', { plugin: 'VectorPlugin' });
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
async onUninstall(options) {
|
|
68
|
+
// Remove vector methods from resources
|
|
69
|
+
for (const resource of Object.values(this.database.resources)) {
|
|
70
|
+
// Remove technical methods
|
|
71
|
+
delete resource.vectorSearch;
|
|
72
|
+
delete resource.cluster;
|
|
73
|
+
delete resource.vectorDistance;
|
|
74
|
+
|
|
75
|
+
// Remove intuitive aliases
|
|
76
|
+
delete resource.similarTo;
|
|
77
|
+
delete resource.findSimilar;
|
|
78
|
+
delete resource.distance;
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
this.emit('uninstalled', { plugin: 'VectorPlugin' });
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
/**
|
|
85
|
+
* Validate vector storage configuration for all resources
|
|
86
|
+
*
|
|
87
|
+
* Detects large vector fields and warns if proper behavior is not set.
|
|
88
|
+
* Can optionally auto-fix by setting body-overflow behavior.
|
|
89
|
+
*/
|
|
90
|
+
validateVectorStorage() {
|
|
91
|
+
for (const resource of Object.values(this.database.resources)) {
|
|
92
|
+
const vectorFields = this.findVectorFields(resource.schema.attributes);
|
|
93
|
+
|
|
94
|
+
if (vectorFields.length === 0) continue;
|
|
95
|
+
|
|
96
|
+
const totalVectorSize = vectorFields.reduce((sum, f) => sum + f.estimatedBytes, 0);
|
|
97
|
+
|
|
98
|
+
// If exceeds threshold AND doesn't have correct behavior
|
|
99
|
+
if (totalVectorSize > this.config.storageThreshold) {
|
|
100
|
+
const hasCorrectBehavior = ['body-overflow', 'body-only'].includes(resource.behavior);
|
|
101
|
+
|
|
102
|
+
if (!hasCorrectBehavior) {
|
|
103
|
+
const warning = {
|
|
104
|
+
resource: resource.name,
|
|
105
|
+
vectorFields: vectorFields.map(f => ({
|
|
106
|
+
field: f.name,
|
|
107
|
+
dimensions: f.length,
|
|
108
|
+
estimatedBytes: f.estimatedBytes
|
|
109
|
+
})),
|
|
110
|
+
totalEstimatedBytes: totalVectorSize,
|
|
111
|
+
metadataLimit: 2047,
|
|
112
|
+
currentBehavior: resource.behavior || 'default',
|
|
113
|
+
recommendation: 'body-overflow'
|
|
114
|
+
};
|
|
115
|
+
|
|
116
|
+
this.emit('vector:storage-warning', warning);
|
|
117
|
+
|
|
118
|
+
// Auto-fix if configured
|
|
119
|
+
if (this.config.autoFixBehavior) {
|
|
120
|
+
resource.behavior = 'body-overflow';
|
|
121
|
+
this.emit('vector:behavior-fixed', {
|
|
122
|
+
resource: resource.name,
|
|
123
|
+
newBehavior: 'body-overflow'
|
|
124
|
+
});
|
|
125
|
+
} else {
|
|
126
|
+
// Just warn
|
|
127
|
+
console.warn(`⚠️ VectorPlugin: Resource '${resource.name}' has large vector fields (${totalVectorSize} bytes estimated)`);
|
|
128
|
+
console.warn(` Current behavior: '${resource.behavior || 'default'}'`);
|
|
129
|
+
console.warn(` Recommendation: Add behavior: 'body-overflow' or 'body-only' to resource configuration`);
|
|
130
|
+
console.warn(` Large vectors will exceed S3 metadata limit (2047 bytes) and cause errors.`);
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
/**
|
|
138
|
+
* Auto-detect vector field from resource schema
|
|
139
|
+
*
|
|
140
|
+
* Looks for fields with type 'embedding:XXX' pattern.
|
|
141
|
+
* Caches result per resource for performance.
|
|
142
|
+
*
|
|
143
|
+
* @param {Resource} resource - Resource instance
|
|
144
|
+
* @returns {string|null} Detected vector field name or null
|
|
145
|
+
*/
|
|
146
|
+
detectVectorField(resource) {
|
|
147
|
+
// Check cache first
|
|
148
|
+
if (this._vectorFieldCache.has(resource.name)) {
|
|
149
|
+
return this._vectorFieldCache.get(resource.name);
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
// Search for embedding:XXX fields
|
|
153
|
+
const vectorField = this._findEmbeddingField(resource.schema.attributes);
|
|
154
|
+
|
|
155
|
+
// Cache the result
|
|
156
|
+
this._vectorFieldCache.set(resource.name, vectorField);
|
|
157
|
+
|
|
158
|
+
// Emit event if field detected
|
|
159
|
+
if (vectorField && this.config.emitEvents) {
|
|
160
|
+
this.emit('vector:field-detected', {
|
|
161
|
+
resource: resource.name,
|
|
162
|
+
vectorField,
|
|
163
|
+
timestamp: Date.now()
|
|
164
|
+
});
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
return vectorField;
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
/**
|
|
171
|
+
* Recursively find embedding:XXX field in attributes
|
|
172
|
+
*
|
|
173
|
+
* @param {Object} attributes - Resource attributes
|
|
174
|
+
* @param {string} path - Current path (for nested objects)
|
|
175
|
+
* @returns {string|null} Field path or null
|
|
176
|
+
*/
|
|
177
|
+
_findEmbeddingField(attributes, path = '') {
|
|
178
|
+
for (const [key, attr] of Object.entries(attributes)) {
|
|
179
|
+
const fullPath = path ? `${path}.${key}` : key;
|
|
180
|
+
|
|
181
|
+
// Check for embedding:XXX shorthand
|
|
182
|
+
if (typeof attr === 'string' && attr.startsWith('embedding:')) {
|
|
183
|
+
return fullPath;
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
// Check for expanded embedding definition
|
|
187
|
+
if (attr.type === 'array' && attr.items === 'number' && attr.length) {
|
|
188
|
+
return fullPath;
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
// Check nested objects
|
|
192
|
+
if (attr.type === 'object' && attr.props) {
|
|
193
|
+
const nested = this._findEmbeddingField(attr.props, fullPath);
|
|
194
|
+
if (nested) return nested;
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
return null;
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
/**
|
|
202
|
+
* Emit event with throttling support
|
|
203
|
+
*
|
|
204
|
+
* @param {string} eventName - Event name
|
|
205
|
+
* @param {Object} data - Event data
|
|
206
|
+
* @param {string} throttleKey - Unique key for throttling (optional)
|
|
207
|
+
*/
|
|
208
|
+
_emitEvent(eventName, data, throttleKey = null) {
|
|
209
|
+
if (!this.config.emitEvents) return;
|
|
210
|
+
|
|
211
|
+
// If throttleKey provided, check throttle state
|
|
212
|
+
if (throttleKey) {
|
|
213
|
+
const now = Date.now();
|
|
214
|
+
const lastEmit = this._throttleState.get(throttleKey);
|
|
215
|
+
|
|
216
|
+
if (lastEmit && (now - lastEmit) < this.config.eventThrottle) {
|
|
217
|
+
return; // Skip emission
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
this._throttleState.set(throttleKey, now);
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
this.emit(eventName, data);
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
/**
|
|
227
|
+
* Find vector fields in resource attributes
|
|
228
|
+
*
|
|
229
|
+
* @param {Object} attributes - Resource attributes
|
|
230
|
+
* @param {string} path - Current path (for nested objects)
|
|
231
|
+
* @returns {Array} Array of vector field info
|
|
232
|
+
*/
|
|
233
|
+
findVectorFields(attributes, path = '') {
|
|
234
|
+
const vectors = [];
|
|
235
|
+
|
|
236
|
+
for (const [key, attr] of Object.entries(attributes)) {
|
|
237
|
+
const fullPath = path ? `${path}.${key}` : key;
|
|
238
|
+
|
|
239
|
+
// Check if it's a vector field (array of numbers with length)
|
|
240
|
+
if (attr.type === 'array' && attr.items === 'number' && attr.length) {
|
|
241
|
+
vectors.push({
|
|
242
|
+
name: fullPath,
|
|
243
|
+
length: attr.length,
|
|
244
|
+
estimatedBytes: this.estimateVectorBytes(attr.length)
|
|
245
|
+
});
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
// Check nested objects
|
|
249
|
+
if (attr.type === 'object' && attr.props) {
|
|
250
|
+
vectors.push(...this.findVectorFields(attr.props, fullPath));
|
|
251
|
+
}
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
return vectors;
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
/**
|
|
258
|
+
* Estimate bytes required to store a vector in JSON format
|
|
259
|
+
*
|
|
260
|
+
* Conservative estimate: ~7 bytes per number + array overhead
|
|
261
|
+
*
|
|
262
|
+
* @param {number} dimensions - Number of dimensions
|
|
263
|
+
* @returns {number} Estimated bytes
|
|
264
|
+
*/
|
|
265
|
+
estimateVectorBytes(dimensions) {
|
|
266
|
+
// Each float: ~6-8 bytes in JSON (e.g., "0.1234")
|
|
267
|
+
// Array overhead: brackets, commas
|
|
268
|
+
return dimensions * 7 + 50;
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
/**
|
|
272
|
+
* Install vector methods on all resources
|
|
273
|
+
*/
|
|
274
|
+
installResourceMethods() {
|
|
275
|
+
for (const resource of Object.values(this.database.resources)) {
|
|
276
|
+
// Core methods
|
|
277
|
+
const searchMethod = this.createVectorSearchMethod(resource);
|
|
278
|
+
const clusterMethod = this.createClusteringMethod(resource);
|
|
279
|
+
const distanceMethod = this.createDistanceMethod();
|
|
280
|
+
|
|
281
|
+
// Add technical methods (original names for compatibility)
|
|
282
|
+
resource.vectorSearch = searchMethod;
|
|
283
|
+
resource.cluster = clusterMethod;
|
|
284
|
+
resource.vectorDistance = distanceMethod;
|
|
285
|
+
|
|
286
|
+
// Add intuitive aliases for better DX
|
|
287
|
+
resource.similarTo = searchMethod; // More natural: "find products similar to X"
|
|
288
|
+
resource.findSimilar = searchMethod; // Descriptive alternative
|
|
289
|
+
resource.distance = distanceMethod; // Simpler than vectorDistance
|
|
290
|
+
}
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
/**
|
|
294
|
+
* Create vector search method for a resource
|
|
295
|
+
*
|
|
296
|
+
* Performs K-nearest neighbors search to find similar vectors.
|
|
297
|
+
*
|
|
298
|
+
* @param {Resource} resource - Resource instance
|
|
299
|
+
* @returns {Function} Vector search method
|
|
300
|
+
*/
|
|
301
|
+
createVectorSearchMethod(resource) {
|
|
302
|
+
return async (queryVector, options = {}) => {
|
|
303
|
+
const startTime = Date.now();
|
|
304
|
+
|
|
305
|
+
// Auto-detect vectorField if not provided
|
|
306
|
+
let vectorField = options.vectorField;
|
|
307
|
+
if (!vectorField && this.config.autoDetectVectorField) {
|
|
308
|
+
vectorField = this.detectVectorField(resource);
|
|
309
|
+
if (!vectorField) {
|
|
310
|
+
vectorField = 'vector'; // Fallback to default
|
|
311
|
+
}
|
|
312
|
+
} else if (!vectorField) {
|
|
313
|
+
vectorField = 'vector'; // Fallback to default
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
const {
|
|
317
|
+
limit = 10,
|
|
318
|
+
distanceMetric = this.config.distanceMetric,
|
|
319
|
+
threshold = null,
|
|
320
|
+
partition = null
|
|
321
|
+
} = options;
|
|
322
|
+
|
|
323
|
+
const distanceFn = this.distanceFunctions[distanceMetric];
|
|
324
|
+
if (!distanceFn) {
|
|
325
|
+
const error = new VectorError(`Invalid distance metric: ${distanceMetric}`, {
|
|
326
|
+
operation: 'vectorSearch',
|
|
327
|
+
availableMetrics: Object.keys(this.distanceFunctions),
|
|
328
|
+
providedMetric: distanceMetric
|
|
329
|
+
});
|
|
330
|
+
|
|
331
|
+
this._emitEvent('vector:search-error', {
|
|
332
|
+
resource: resource.name,
|
|
333
|
+
error: error.message,
|
|
334
|
+
timestamp: Date.now()
|
|
335
|
+
});
|
|
336
|
+
|
|
337
|
+
throw error;
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
// Emit start event
|
|
341
|
+
this._emitEvent('vector:search-start', {
|
|
342
|
+
resource: resource.name,
|
|
343
|
+
vectorField,
|
|
344
|
+
limit,
|
|
345
|
+
distanceMetric,
|
|
346
|
+
partition,
|
|
347
|
+
threshold,
|
|
348
|
+
queryDimensions: queryVector.length,
|
|
349
|
+
timestamp: startTime
|
|
350
|
+
});
|
|
351
|
+
|
|
352
|
+
try {
|
|
353
|
+
// Get all records (with optional partition filter)
|
|
354
|
+
let allRecords;
|
|
355
|
+
if (partition) {
|
|
356
|
+
this._emitEvent('vector:partition-filter', {
|
|
357
|
+
resource: resource.name,
|
|
358
|
+
partition,
|
|
359
|
+
timestamp: Date.now()
|
|
360
|
+
});
|
|
361
|
+
allRecords = await resource.list({ partition, partitionValues: partition });
|
|
362
|
+
} else {
|
|
363
|
+
allRecords = await resource.getAll();
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
const totalRecords = allRecords.length;
|
|
367
|
+
let processedRecords = 0;
|
|
368
|
+
let dimensionMismatches = 0;
|
|
369
|
+
|
|
370
|
+
// Calculate distances
|
|
371
|
+
const results = allRecords
|
|
372
|
+
.filter(record => record[vectorField] && Array.isArray(record[vectorField]))
|
|
373
|
+
.map((record, index) => {
|
|
374
|
+
try {
|
|
375
|
+
const distance = distanceFn(queryVector, record[vectorField]);
|
|
376
|
+
processedRecords++;
|
|
377
|
+
|
|
378
|
+
// Emit progress event (throttled)
|
|
379
|
+
if (this.config.verboseEvents && processedRecords % 100 === 0) {
|
|
380
|
+
this._emitEvent('vector:search-progress', {
|
|
381
|
+
resource: resource.name,
|
|
382
|
+
processed: processedRecords,
|
|
383
|
+
total: totalRecords,
|
|
384
|
+
progress: (processedRecords / totalRecords) * 100,
|
|
385
|
+
timestamp: Date.now()
|
|
386
|
+
}, `search-${resource.name}`);
|
|
387
|
+
}
|
|
388
|
+
|
|
389
|
+
return { record, distance };
|
|
390
|
+
} catch (err) {
|
|
391
|
+
// Skip records with dimension mismatch
|
|
392
|
+
dimensionMismatches++;
|
|
393
|
+
|
|
394
|
+
if (this.config.verboseEvents) {
|
|
395
|
+
this._emitEvent('vector:dimension-mismatch', {
|
|
396
|
+
resource: resource.name,
|
|
397
|
+
recordIndex: index,
|
|
398
|
+
expected: queryVector.length,
|
|
399
|
+
got: record[vectorField]?.length,
|
|
400
|
+
timestamp: Date.now()
|
|
401
|
+
});
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
return null;
|
|
405
|
+
}
|
|
406
|
+
})
|
|
407
|
+
.filter(result => result !== null)
|
|
408
|
+
.filter(result => threshold === null || result.distance <= threshold)
|
|
409
|
+
.sort((a, b) => a.distance - b.distance)
|
|
410
|
+
.slice(0, limit);
|
|
411
|
+
|
|
412
|
+
const duration = Date.now() - startTime;
|
|
413
|
+
const throughput = totalRecords / (duration / 1000);
|
|
414
|
+
|
|
415
|
+
// Emit complete event
|
|
416
|
+
this._emitEvent('vector:search-complete', {
|
|
417
|
+
resource: resource.name,
|
|
418
|
+
vectorField,
|
|
419
|
+
resultsCount: results.length,
|
|
420
|
+
totalRecords,
|
|
421
|
+
processedRecords,
|
|
422
|
+
dimensionMismatches,
|
|
423
|
+
duration,
|
|
424
|
+
throughput: throughput.toFixed(2),
|
|
425
|
+
timestamp: Date.now()
|
|
426
|
+
});
|
|
427
|
+
|
|
428
|
+
// Emit performance metrics
|
|
429
|
+
if (this.config.verboseEvents) {
|
|
430
|
+
this._emitEvent('vector:performance', {
|
|
431
|
+
operation: 'search',
|
|
432
|
+
resource: resource.name,
|
|
433
|
+
duration,
|
|
434
|
+
throughput: throughput.toFixed(2),
|
|
435
|
+
recordsPerSecond: (processedRecords / (duration / 1000)).toFixed(2),
|
|
436
|
+
timestamp: Date.now()
|
|
437
|
+
});
|
|
438
|
+
}
|
|
439
|
+
|
|
440
|
+
return results;
|
|
441
|
+
} catch (error) {
|
|
442
|
+
this._emitEvent('vector:search-error', {
|
|
443
|
+
resource: resource.name,
|
|
444
|
+
error: error.message,
|
|
445
|
+
stack: error.stack,
|
|
446
|
+
timestamp: Date.now()
|
|
447
|
+
});
|
|
448
|
+
throw error;
|
|
449
|
+
}
|
|
450
|
+
};
|
|
451
|
+
}
|
|
452
|
+
|
|
453
|
+
/**
|
|
454
|
+
* Create clustering method for a resource
|
|
455
|
+
*
|
|
456
|
+
* Performs k-means clustering on resource vectors.
|
|
457
|
+
*
|
|
458
|
+
* @param {Resource} resource - Resource instance
|
|
459
|
+
* @returns {Function} Clustering method
|
|
460
|
+
*/
|
|
461
|
+
createClusteringMethod(resource) {
|
|
462
|
+
return async (options = {}) => {
|
|
463
|
+
const startTime = Date.now();
|
|
464
|
+
|
|
465
|
+
// Auto-detect vectorField if not provided
|
|
466
|
+
let vectorField = options.vectorField;
|
|
467
|
+
if (!vectorField && this.config.autoDetectVectorField) {
|
|
468
|
+
vectorField = this.detectVectorField(resource);
|
|
469
|
+
if (!vectorField) {
|
|
470
|
+
vectorField = 'vector'; // Fallback to default
|
|
471
|
+
}
|
|
472
|
+
} else if (!vectorField) {
|
|
473
|
+
vectorField = 'vector'; // Fallback to default
|
|
474
|
+
}
|
|
475
|
+
|
|
476
|
+
const {
|
|
477
|
+
k = 5,
|
|
478
|
+
distanceMetric = this.config.distanceMetric,
|
|
479
|
+
partition = null,
|
|
480
|
+
...kmeansOptions
|
|
481
|
+
} = options;
|
|
482
|
+
|
|
483
|
+
const distanceFn = this.distanceFunctions[distanceMetric];
|
|
484
|
+
if (!distanceFn) {
|
|
485
|
+
const error = new VectorError(`Invalid distance metric: ${distanceMetric}`, {
|
|
486
|
+
operation: 'cluster',
|
|
487
|
+
availableMetrics: Object.keys(this.distanceFunctions),
|
|
488
|
+
providedMetric: distanceMetric
|
|
489
|
+
});
|
|
490
|
+
|
|
491
|
+
this._emitEvent('vector:cluster-error', {
|
|
492
|
+
resource: resource.name,
|
|
493
|
+
error: error.message,
|
|
494
|
+
timestamp: Date.now()
|
|
495
|
+
});
|
|
496
|
+
|
|
497
|
+
throw error;
|
|
498
|
+
}
|
|
499
|
+
|
|
500
|
+
// Emit start event
|
|
501
|
+
this._emitEvent('vector:cluster-start', {
|
|
502
|
+
resource: resource.name,
|
|
503
|
+
vectorField,
|
|
504
|
+
k,
|
|
505
|
+
distanceMetric,
|
|
506
|
+
partition,
|
|
507
|
+
maxIterations: kmeansOptions.maxIterations || 100,
|
|
508
|
+
timestamp: startTime
|
|
509
|
+
});
|
|
510
|
+
|
|
511
|
+
try {
|
|
512
|
+
// Get all records (with optional partition filter)
|
|
513
|
+
let allRecords;
|
|
514
|
+
if (partition) {
|
|
515
|
+
this._emitEvent('vector:partition-filter', {
|
|
516
|
+
resource: resource.name,
|
|
517
|
+
partition,
|
|
518
|
+
timestamp: Date.now()
|
|
519
|
+
});
|
|
520
|
+
allRecords = await resource.list({ partition, partitionValues: partition });
|
|
521
|
+
} else {
|
|
522
|
+
allRecords = await resource.getAll();
|
|
523
|
+
}
|
|
524
|
+
|
|
525
|
+
// Extract vectors
|
|
526
|
+
const recordsWithVectors = allRecords.filter(
|
|
527
|
+
record => record[vectorField] && Array.isArray(record[vectorField])
|
|
528
|
+
);
|
|
529
|
+
|
|
530
|
+
if (recordsWithVectors.length === 0) {
|
|
531
|
+
const error = new VectorError('No vectors found in resource', {
|
|
532
|
+
operation: 'cluster',
|
|
533
|
+
resourceName: resource.name,
|
|
534
|
+
vectorField
|
|
535
|
+
});
|
|
536
|
+
|
|
537
|
+
this._emitEvent('vector:empty-dataset', {
|
|
538
|
+
resource: resource.name,
|
|
539
|
+
vectorField,
|
|
540
|
+
totalRecords: allRecords.length,
|
|
541
|
+
timestamp: Date.now()
|
|
542
|
+
});
|
|
543
|
+
|
|
544
|
+
throw error;
|
|
545
|
+
}
|
|
546
|
+
|
|
547
|
+
const vectors = recordsWithVectors.map(record => record[vectorField]);
|
|
548
|
+
|
|
549
|
+
// Run k-means with progress callback
|
|
550
|
+
const result = kmeans(vectors, k, {
|
|
551
|
+
...kmeansOptions,
|
|
552
|
+
distanceFn,
|
|
553
|
+
onIteration: this.config.verboseEvents ? (iteration, inertia, converged) => {
|
|
554
|
+
this._emitEvent('vector:cluster-iteration', {
|
|
555
|
+
resource: resource.name,
|
|
556
|
+
k,
|
|
557
|
+
iteration,
|
|
558
|
+
inertia,
|
|
559
|
+
converged,
|
|
560
|
+
timestamp: Date.now()
|
|
561
|
+
}, `cluster-${resource.name}`);
|
|
562
|
+
} : undefined
|
|
563
|
+
});
|
|
564
|
+
|
|
565
|
+
// Emit convergence event
|
|
566
|
+
if (result.converged) {
|
|
567
|
+
this._emitEvent('vector:cluster-converged', {
|
|
568
|
+
resource: resource.name,
|
|
569
|
+
k,
|
|
570
|
+
iterations: result.iterations,
|
|
571
|
+
inertia: result.inertia,
|
|
572
|
+
timestamp: Date.now()
|
|
573
|
+
});
|
|
574
|
+
}
|
|
575
|
+
|
|
576
|
+
// Map results back to records
|
|
577
|
+
const clusters = Array(k).fill(null).map(() => []);
|
|
578
|
+
recordsWithVectors.forEach((record, i) => {
|
|
579
|
+
const clusterIndex = result.assignments[i];
|
|
580
|
+
clusters[clusterIndex].push(record);
|
|
581
|
+
});
|
|
582
|
+
|
|
583
|
+
const duration = Date.now() - startTime;
|
|
584
|
+
const clusterSizes = clusters.map(c => c.length);
|
|
585
|
+
|
|
586
|
+
// Emit complete event
|
|
587
|
+
this._emitEvent('vector:cluster-complete', {
|
|
588
|
+
resource: resource.name,
|
|
589
|
+
vectorField,
|
|
590
|
+
k,
|
|
591
|
+
vectorCount: vectors.length,
|
|
592
|
+
iterations: result.iterations,
|
|
593
|
+
converged: result.converged,
|
|
594
|
+
inertia: result.inertia,
|
|
595
|
+
clusterSizes,
|
|
596
|
+
duration,
|
|
597
|
+
timestamp: Date.now()
|
|
598
|
+
});
|
|
599
|
+
|
|
600
|
+
// Emit performance metrics
|
|
601
|
+
if (this.config.verboseEvents) {
|
|
602
|
+
this._emitEvent('vector:performance', {
|
|
603
|
+
operation: 'clustering',
|
|
604
|
+
resource: resource.name,
|
|
605
|
+
k,
|
|
606
|
+
duration,
|
|
607
|
+
iterationsPerSecond: (result.iterations / (duration / 1000)).toFixed(2),
|
|
608
|
+
vectorsPerSecond: (vectors.length / (duration / 1000)).toFixed(2),
|
|
609
|
+
timestamp: Date.now()
|
|
610
|
+
});
|
|
611
|
+
}
|
|
612
|
+
|
|
613
|
+
return {
|
|
614
|
+
clusters,
|
|
615
|
+
centroids: result.centroids,
|
|
616
|
+
inertia: result.inertia,
|
|
617
|
+
iterations: result.iterations,
|
|
618
|
+
converged: result.converged
|
|
619
|
+
};
|
|
620
|
+
} catch (error) {
|
|
621
|
+
this._emitEvent('vector:cluster-error', {
|
|
622
|
+
resource: resource.name,
|
|
623
|
+
error: error.message,
|
|
624
|
+
stack: error.stack,
|
|
625
|
+
timestamp: Date.now()
|
|
626
|
+
});
|
|
627
|
+
throw error;
|
|
628
|
+
}
|
|
629
|
+
};
|
|
630
|
+
}
|
|
631
|
+
|
|
632
|
+
/**
|
|
633
|
+
* Create distance calculation method
|
|
634
|
+
*
|
|
635
|
+
* @returns {Function} Distance method
|
|
636
|
+
*/
|
|
637
|
+
createDistanceMethod() {
|
|
638
|
+
return (vector1, vector2, metric = this.config.distanceMetric) => {
|
|
639
|
+
const distanceFn = this.distanceFunctions[metric];
|
|
640
|
+
if (!distanceFn) {
|
|
641
|
+
throw new VectorError(`Invalid distance metric: ${metric}`, {
|
|
642
|
+
operation: 'vectorDistance',
|
|
643
|
+
availableMetrics: Object.keys(this.distanceFunctions),
|
|
644
|
+
providedMetric: metric
|
|
645
|
+
});
|
|
646
|
+
}
|
|
647
|
+
return distanceFn(vector1, vector2);
|
|
648
|
+
};
|
|
649
|
+
}
|
|
650
|
+
|
|
651
|
+
/**
|
|
652
|
+
* Static utility: Normalize vector
|
|
653
|
+
*
|
|
654
|
+
* @param {number[]} vector - Input vector
|
|
655
|
+
* @returns {number[]} Normalized vector
|
|
656
|
+
*/
|
|
657
|
+
static normalize(vector) {
|
|
658
|
+
return normalize(vector);
|
|
659
|
+
}
|
|
660
|
+
|
|
661
|
+
/**
|
|
662
|
+
* Static utility: Calculate dot product
|
|
663
|
+
*
|
|
664
|
+
* @param {number[]} vector1 - First vector
|
|
665
|
+
* @param {number[]} vector2 - Second vector
|
|
666
|
+
* @returns {number} Dot product
|
|
667
|
+
*/
|
|
668
|
+
static dotProduct(vector1, vector2) {
|
|
669
|
+
return dotProduct(vector1, vector2);
|
|
670
|
+
}
|
|
671
|
+
|
|
672
|
+
/**
|
|
673
|
+
* Static utility: Find optimal K for clustering
|
|
674
|
+
*
|
|
675
|
+
* Analyzes clustering quality across a range of K values using
|
|
676
|
+
* multiple evaluation metrics.
|
|
677
|
+
*
|
|
678
|
+
* @param {number[][]} vectors - Vectors to analyze
|
|
679
|
+
* @param {Object} options - Configuration options
|
|
680
|
+
* @returns {Promise<Object>} Analysis results with recommendations
|
|
681
|
+
*/
|
|
682
|
+
static async findOptimalK(vectors, options) {
|
|
683
|
+
return findOptimalK(vectors, options);
|
|
684
|
+
}
|
|
685
|
+
}
|
|
686
|
+
|
|
687
|
+
export default VectorPlugin;
|