s3db.js 11.2.3 → 11.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/s3db-cli.js +588 -74
- package/dist/s3db.cjs.js +2472 -150
- package/dist/s3db.cjs.js.map +1 -1
- package/dist/s3db.es.js +2464 -151
- package/dist/s3db.es.js.map +1 -1
- package/package.json +2 -1
- package/src/behaviors/enforce-limits.js +28 -4
- package/src/behaviors/index.js +6 -1
- package/src/client.class.js +11 -1
- package/src/concerns/base62.js +70 -0
- package/src/concerns/partition-queue.js +7 -1
- package/src/concerns/plugin-storage.js +75 -13
- package/src/database.class.js +19 -4
- package/src/errors.js +306 -27
- package/src/partition-drivers/base-partition-driver.js +12 -2
- package/src/partition-drivers/index.js +7 -1
- package/src/partition-drivers/memory-partition-driver.js +20 -5
- package/src/partition-drivers/sqs-partition-driver.js +6 -1
- package/src/plugins/audit.errors.js +46 -0
- package/src/plugins/backup/base-backup-driver.class.js +36 -6
- package/src/plugins/backup/filesystem-backup-driver.class.js +55 -7
- package/src/plugins/backup/index.js +40 -9
- package/src/plugins/backup/multi-backup-driver.class.js +69 -9
- package/src/plugins/backup/s3-backup-driver.class.js +48 -6
- package/src/plugins/backup.errors.js +45 -0
- package/src/plugins/cache/cache.class.js +8 -1
- package/src/plugins/cache.errors.js +47 -0
- package/src/plugins/cache.plugin.js +8 -1
- package/src/plugins/fulltext.errors.js +46 -0
- package/src/plugins/fulltext.plugin.js +15 -3
- package/src/plugins/index.js +1 -0
- package/src/plugins/metrics.errors.js +46 -0
- package/src/plugins/queue-consumer.plugin.js +31 -4
- package/src/plugins/queue.errors.js +46 -0
- package/src/plugins/replicator.errors.js +46 -0
- package/src/plugins/replicator.plugin.js +40 -5
- package/src/plugins/replicators/base-replicator.class.js +19 -3
- package/src/plugins/replicators/index.js +9 -3
- package/src/plugins/replicators/s3db-replicator.class.js +38 -8
- package/src/plugins/scheduler.errors.js +46 -0
- package/src/plugins/scheduler.plugin.js +79 -19
- package/src/plugins/state-machine.errors.js +47 -0
- package/src/plugins/state-machine.plugin.js +86 -17
- package/src/plugins/vector/distances.js +173 -0
- package/src/plugins/vector/kmeans.js +367 -0
- package/src/plugins/vector/metrics.js +369 -0
- package/src/plugins/vector/vector-error.js +43 -0
- package/src/plugins/vector.plugin.js +687 -0
- package/src/schema.class.js +232 -41
- package/src/stream/index.js +6 -1
- package/src/stream/resource-reader.class.js +6 -1
- package/src/validator.class.js +8 -0
|
@@ -0,0 +1,367 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* K-Means Clustering Implementation
|
|
3
|
+
*
|
|
4
|
+
* Provides k-means clustering with k-means++ initialization
|
|
5
|
+
* and comprehensive optimal K analysis using multiple metrics.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { euclideanDistance } from './distances.js';
|
|
9
|
+
|
|
10
|
+
/**
 * K-means clustering algorithm
 *
 * Runs Lloyd-style iterations (assign every vector to its nearest centroid,
 * then move each centroid to the mean of its members) starting from a
 * k-means++ initialization. Iteration stops when the change in inertia
 * between two consecutive assignment passes drops below `tolerance`, or
 * when `maxIterations` passes have been executed.
 *
 * Inertia is computed as the sum of `distanceFn(vector, centroid) ** 2`
 * over all vectors.
 *
 * When `seed` is provided, every random decision (initial centroids and the
 * re-seeding of empty clusters) is drawn from a deterministic generator, so
 * two calls with the same inputs and seed return the same result.
 *
 * @param {number[][]} vectors - Array of vectors to cluster
 * @param {number} k - Number of clusters (integer, 1 <= k <= vectors.length)
 * @param {Object} options - Configuration options
 * @param {number} options.maxIterations - Maximum iterations (default: 100)
 * @param {number} options.tolerance - Convergence tolerance (default: 0.0001)
 * @param {Function} options.distanceFn - Distance function (default: euclideanDistance)
 * @param {number|null} options.seed - Random seed for reproducibility (default: null)
 * @param {Function} options.onIteration - Callback for each iteration (iteration, inertia, converged) (default: null)
 * @returns {{centroids: number[][], assignments: number[], iterations: number, converged: boolean, inertia: number}} Clustering results
 * @throws {Error} If `vectors` is empty, `k` is not an integer in [1, vectors.length],
 *   or the vectors do not all share the same dimensions
 */
export function kmeans(vectors, k, options = {}) {
  const {
    maxIterations = 100,
    tolerance = 0.0001,
    distanceFn = euclideanDistance,
    seed = null,
    onIteration = null
  } = options;

  if (vectors.length === 0) {
    throw new Error('Cannot cluster empty vector array');
  }

  if (k < 1) {
    throw new Error(`k must be at least 1, got ${k}`);
  }

  // A fractional or NaN k would otherwise surface later as an opaque
  // "Invalid array length" RangeError.
  if (!Number.isInteger(k)) {
    throw new Error(`k must be an integer, got ${k}`);
  }

  if (k > vectors.length) {
    throw new Error(`k (${k}) cannot be greater than number of vectors (${vectors.length})`);
  }

  const dimensions = vectors[0].length;

  // Validate all vectors have same dimensions
  for (let i = 1; i < vectors.length; i++) {
    if (vectors[i].length !== dimensions) {
      throw new Error(`All vectors must have same dimensions. Expected ${dimensions}, got ${vectors[i].length} at index ${i}`);
    }
  }

  // One random source for the whole run: deterministic when seeded.
  const random = seed != null ? createSeededRandom(seed) : Math.random;

  // Initialize centroids using k-means++
  const centroids = initializeCentroidsKMeansPlusPlus(vectors, k, distanceFn, seed, random);

  let assignments = new Array(vectors.length);
  let iterations = 0;
  let converged = false;
  let previousInertia = Infinity;

  while (!converged && iterations < maxIterations) {
    const newAssignments = assignToNearestCentroid(vectors, centroids, distanceFn);
    const inertia = sumSquaredDistances(vectors, centroids, newAssignments, distanceFn);

    // Check for convergence
    converged = Math.abs(previousInertia - inertia) < tolerance;

    assignments = newAssignments;
    previousInertia = inertia;

    if (onIteration) {
      onIteration(iterations + 1, inertia, converged);
    }

    if (!converged) {
      // Recalculate centroids as the mean of their assigned vectors
      const clusterSums = Array.from({ length: k }, () => new Array(dimensions).fill(0));
      const clusterCounts = new Array(k).fill(0);

      vectors.forEach((vector, i) => {
        const cluster = assignments[i];
        clusterCounts[cluster]++;
        for (let j = 0; j < dimensions; j++) {
          clusterSums[cluster][j] += vector[j];
        }
      });

      for (let i = 0; i < k; i++) {
        if (clusterCounts[i] > 0) {
          centroids[i] = clusterSums[i].map(sum => sum / clusterCounts[i]);
        } else {
          // Empty cluster: re-seed it from a randomly chosen input vector
          const randomIdx = Math.floor(random() * vectors.length);
          centroids[i] = [...vectors[randomIdx]];
        }
      }
    }

    iterations++;
  }

  // With maxIterations <= 0 the loop never runs; still return a usable
  // assignment against the initial centroids instead of a sparse array.
  if (iterations === 0) {
    assignments = assignToNearestCentroid(vectors, centroids, distanceFn);
  }

  // Final inertia calculation (centroids may have moved after the last assignment pass)
  const inertia = sumSquaredDistances(vectors, centroids, assignments, distanceFn);

  return {
    centroids,
    assignments,
    iterations,
    converged,
    inertia
  };
}

/**
 * Assign every vector to the index of its nearest centroid.
 * Ties keep the lowest centroid index.
 *
 * @param {number[][]} vectors - Input vectors
 * @param {number[][]} centroids - Current centroids
 * @param {Function} distanceFn - Distance function
 * @returns {number[]} Centroid index for each vector
 */
function assignToNearestCentroid(vectors, centroids, distanceFn) {
  return vectors.map(vector => {
    let minDist = Infinity;
    let nearestCluster = 0;

    for (let i = 0; i < centroids.length; i++) {
      const dist = distanceFn(vector, centroids[i]);
      if (dist < minDist) {
        minDist = dist;
        nearestCluster = i;
      }
    }

    return nearestCluster;
  });
}

/**
 * Sum of squared distances between each vector and its assigned centroid.
 *
 * @param {number[][]} vectors - Input vectors
 * @param {number[][]} centroids - Current centroids
 * @param {number[]} assignments - Centroid index for each vector
 * @param {Function} distanceFn - Distance function
 * @returns {number} Inertia
 */
function sumSquaredDistances(vectors, centroids, assignments, distanceFn) {
  let inertia = 0;
  vectors.forEach((vector, i) => {
    const dist = distanceFn(vector, centroids[assignments[i]]);
    inertia += dist * dist;
  });
  return inertia;
}

/**
 * Create a deterministic pseudo-random generator (mulberry32) returning
 * floats in [0, 1). Not suitable for security-sensitive use.
 *
 * @param {number} seed - Seed value; truncated to an unsigned 32-bit integer
 *   (non-finite seeds are treated as 0)
 * @returns {Function} Function returning the next pseudo-random number
 */
function createSeededRandom(seed) {
  let state = (Number.isFinite(seed) ? Math.trunc(seed) : 0) >>> 0;
  return () => {
    state = (state + 0x6d2b79f5) >>> 0;
    let t = state;
    t = Math.imul(t ^ (t >>> 15), t | 1);
    t ^= t + Math.imul(t ^ (t >>> 7), t | 61);
    return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
  };
}

/**
 * Initialize centroids using k-means++ algorithm
 *
 * The first centroid is `vectors[seed mod n]` when a seed is given
 * (otherwise a random vector). Each further centroid is drawn with
 * probability proportional to the squared distance between a vector and
 * its nearest already-chosen centroid.
 *
 * @param {number[][]} vectors - Input vectors
 * @param {number} k - Number of centroids
 * @param {Function} distanceFn - Distance function
 * @param {number|null} seed - Random seed
 * @param {Function} [random] - Source of numbers in [0, 1); defaults to a
 *   generator derived from `seed`, or `Math.random` when no seed is given
 * @returns {number[][]} Initial centroids (copies of input vectors)
 */
function initializeCentroidsKMeansPlusPlus(
  vectors,
  k,
  distanceFn,
  seed,
  random = seed != null ? createSeededRandom(seed) : Math.random
) {
  const centroids = [];
  const n = vectors.length;

  // Choose first centroid. The double modulo keeps the index inside
  // [0, n) for negative seeds; truncation handles fractional seeds.
  let firstIndex;
  if (seed != null) {
    const base = Number.isFinite(seed) ? Math.trunc(seed) : 0;
    firstIndex = ((base % n) + n) % n;
  } else {
    firstIndex = Math.floor(random() * n);
  }
  centroids.push([...vectors[firstIndex]]);

  // Choose remaining centroids
  for (let i = 1; i < k; i++) {
    // Squared distance to nearest existing centroid = sampling weight
    const squaredDistances = vectors.map(vector => {
      const nearest = Math.min(...centroids.map(c => distanceFn(vector, c)));
      return nearest * nearest;
    });
    const totalSquared = squaredDistances.reduce((a, b) => a + b, 0);

    // Zero total: every vector coincides with an existing centroid.
    // NaN / infinite total: weights are unusable. Either way fall back
    // to a uniform pick so that exactly k centroids are always returned.
    if (!(totalSquared > 0) || !Number.isFinite(totalSquared)) {
      const randomIdx = Math.floor(random() * n);
      centroids.push([...vectors[randomIdx]]);
      continue;
    }

    // Choose next centroid with probability proportional to squared distance.
    // Strict '>' guarantees a zero-weight vector is never selected.
    const threshold = random() * totalSquared;
    let cumulativeSum = 0;
    let chosen = n - 1; // safety net against floating-point rounding

    for (let j = 0; j < n; j++) {
      cumulativeSum += squaredDistances[j];
      if (cumulativeSum > threshold) {
        chosen = j;
        break;
      }
    }

    centroids.push([...vectors[chosen]]);
  }

  return centroids;
}
|
|
197
|
+
|
|
198
|
+
/**
 * Find optimal K using multiple evaluation metrics
 *
 * Runs k-means for every K in [minK, maxK] and scores each clustering with:
 * - Elbow method (inertia)
 * - Silhouette score
 * - Davies-Bouldin index
 * - Calinski-Harabasz index
 * - Gap statistic
 * - Clustering stability
 *
 * Each metric votes for its preferred K; the most frequent vote becomes the
 * consensus recommendation (ties resolve to the smallest K).
 *
 * @param {number[][]} vectors - Vectors to analyze
 * @param {Object} options - Configuration options; any option not listed
 *   below is forwarded to kmeans()
 * @param {number} options.minK - Minimum K to test (default: 2)
 * @param {number} options.maxK - Maximum K to test (default: min(10, floor(sqrt(n/2))))
 * @param {Function} options.distanceFn - Distance function (default: euclideanDistance)
 * @param {number} options.nReferences - Number of reference datasets for Gap statistic (default: 10)
 * @param {number} options.stabilityRuns - Number of runs for stability analysis (default: 5)
 * @returns {Promise<Object>} Analysis results with recommendations:
 *   `{ results, recommendations, consensus, summary }`
 * @throws {Error} If `vectors` is empty, `minK` is not a positive integer, or
 *   the [minK, maxK] range contains no value (with the default `maxK` this
 *   happens for fewer than 8 vectors)
 */
export async function findOptimalK(vectors, options = {}) {
  if (!vectors || vectors.length === 0) {
    throw new Error('Cannot find optimal K for an empty vector array');
  }

  const {
    minK = 2,
    maxK = Math.min(10, Math.floor(Math.sqrt(vectors.length / 2))),
    distanceFn = euclideanDistance,
    nReferences = 10,
    stabilityRuns = 5,
    ...kmeansOptions
  } = options;

  if (!Number.isInteger(minK) || minK < 1) {
    throw new Error(`minK must be a positive integer, got ${minK}`);
  }

  // Fail early with an actionable message instead of the
  // "Reduce of empty array with no initial value" TypeError that an empty
  // result list would trigger further down.
  if (!(maxK >= minK)) {
    throw new Error(
      `No K values to evaluate: minK (${minK}) is greater than maxK (${maxK}). ` +
      'Provide more vectors or set options.maxK explicitly.'
    );
  }

  // Dynamic import to avoid circular dependency
  const metricsModule = await import('./metrics.js');
  const {
    silhouetteScore,
    daviesBouldinIndex,
    calinskiHarabaszIndex,
    gapStatistic,
    clusteringStability
  } = metricsModule;

  const results = [];

  for (let k = minK; k <= maxK; k++) {
    // Run k-means
    const kmeansResult = kmeans(vectors, k, { ...kmeansOptions, distanceFn });

    // Calculate all metrics
    const silhouette = silhouetteScore(
      vectors,
      kmeansResult.assignments,
      kmeansResult.centroids,
      distanceFn
    );

    const daviesBouldin = daviesBouldinIndex(
      vectors,
      kmeansResult.assignments,
      kmeansResult.centroids,
      distanceFn
    );

    const calinskiHarabasz = calinskiHarabaszIndex(
      vectors,
      kmeansResult.assignments,
      kmeansResult.centroids,
      distanceFn
    );

    const gap = await gapStatistic(
      vectors,
      kmeansResult.assignments,
      kmeansResult.centroids,
      distanceFn,
      nReferences
    );

    const stability = clusteringStability(
      vectors,
      k,
      { ...kmeansOptions, distanceFn, nRuns: stabilityRuns }
    );

    results.push({
      k,
      inertia: kmeansResult.inertia,
      silhouette,
      daviesBouldin,
      calinskiHarabasz,
      gap: gap.gap,
      gapSk: gap.sk,
      stability: stability.stability,
      cvInertia: stability.cvInertia,
      iterations: kmeansResult.iterations,
      converged: kmeansResult.converged
    });
  }

  // Calculate elbow point (index into results, hence the minK offset below)
  const elbowK = findElbowPoint(results.map(r => r.inertia));

  // Pick the K whose metric wins under `isBetter`; on ties the first
  // (smallest) K is kept.
  const bestKBy = (metric, isBetter) =>
    results.reduce((best, curr) => (isBetter(curr[metric], best[metric]) ? curr : best)).k;
  const higher = (a, b) => a > b;
  const lower = (a, b) => a < b;

  // Find best K by each metric
  const recommendations = {
    elbow: minK + elbowK,
    silhouette: bestKBy('silhouette', higher),
    daviesBouldin: bestKBy('daviesBouldin', lower),
    calinskiHarabasz: bestKBy('calinskiHarabasz', higher),
    gap: bestKBy('gap', higher),
    stability: bestKBy('stability', higher)
  };

  // Calculate consensus (most common recommendation)
  const votes = Object.values(recommendations);
  const consensus = votes.reduce((acc, k) => {
    acc[k] = (acc[k] || 0) + 1;
    return acc;
  }, {});

  const [consensusKey] = Object.entries(consensus).reduce((a, b) => (b[1] > a[1] ? b : a));
  const consensusK = Number.parseInt(consensusKey, 10);

  return {
    results,
    recommendations,
    consensus: consensusK,
    summary: {
      analysisRange: `${minK}-${maxK}`,
      totalVectors: vectors.length,
      dimensions: vectors[0].length,
      recommendation: consensusK,
      // Fraction of metrics that voted for the consensus K
      confidence: consensus[consensusK] / votes.length
    }
  };
}
|
|
342
|
+
|
|
343
|
+
/**
 * Locate the elbow of an inertia curve.
 *
 * For every interior position the discrete second difference
 * (previous - 2 * current + next) is evaluated; the position with the
 * largest value wins, the earliest one on ties. Curves with fewer than
 * three points have no interior position and yield 0.
 *
 * @param {number[]} inertias - Array of inertia values
 * @returns {number} Index of elbow point
 */
function findElbowPoint(inertias) {
  const total = inertias.length;

  if (total < 3) {
    return 0;
  }

  let winner = 0;
  let peak = -Infinity;

  for (let mid = 1; mid + 1 < total; mid += 1) {
    const before = inertias[mid - 1];
    const here = inertias[mid];
    const after = inertias[mid + 1];

    // Second difference as a curvature approximation
    const secondDifference = before - 2 * here + after;

    // Written as a negated '>' so that NaN values are skipped
    if (!(secondDifference > peak)) {
      continue;
    }

    peak = secondDifference;
    winner = mid;
  }

  return winner;
}
|