s3db.js 11.2.4 → 11.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/s3db-cli.js +588 -74
- package/dist/s3db.cjs.js +1361 -23
- package/dist/s3db.cjs.js.map +1 -1
- package/dist/s3db.d.ts +3 -1
- package/dist/s3db.es.js +1359 -24
- package/dist/s3db.es.js.map +1 -1
- package/package.json +2 -1
- package/src/concerns/base62.js +70 -0
- package/src/plugins/index.js +1 -0
- package/src/plugins/vector/distances.js +173 -0
- package/src/plugins/vector/kmeans.js +367 -0
- package/src/plugins/vector/metrics.js +369 -0
- package/src/plugins/vector/vector-error.js +43 -0
- package/src/plugins/vector.plugin.js +687 -0
- package/src/resource.class.js +79 -0
- package/src/s3db.d.ts +3 -1
- package/src/schema.class.js +232 -41
- package/src/validator.class.js +8 -0
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Clustering Evaluation Metrics
|
|
3
|
+
*
|
|
4
|
+
* Provides multiple metrics for evaluating clustering quality
|
|
5
|
+
* and determining optimal number of clusters (K).
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { euclideanDistance } from './distances.js';
|
|
9
|
+
import { kmeans } from './kmeans.js';
|
|
10
|
+
|
|
11
|
+
/**
 * Silhouette score: how well each point fits its assigned cluster
 * versus the nearest neighbouring cluster.
 *
 * Range [-1, 1]; values near 1 indicate tight, well-separated clusters,
 * values near 0 indicate boundary points, and negative values suggest
 * misassignment. Returns 0 when every cluster is a singleton or when no
 * point has another non-empty cluster to compare against (k = 1).
 *
 * @param {number[][]} vectors - Input vectors
 * @param {number[]} assignments - Cluster index per vector
 * @param {number[][]} centroids - Cluster centroids (only the count is used here)
 * @param {Function} [distanceFn=euclideanDistance] - Distance function
 * @returns {number} Mean silhouette coefficient over evaluable points
 */
export function silhouetteScore(vectors, assignments, centroids, distanceFn = euclideanDistance) {
  const k = centroids.length;
  const n = vectors.length;

  // Bucket point indices by their assigned cluster.
  const members = Array.from({ length: k }, () => []);
  for (let i = 0; i < n; i++) {
    members[assignments[i]].push(i);
  }

  // Degenerate case: every cluster holds at most one point.
  if (members.every((bucket) => bucket.length <= 1)) {
    return 0;
  }

  let sum = 0;
  let counted = 0;

  for (let i = 0; i < n; i++) {
    const own = members[assignments[i]];
    if (own.length === 1) continue; // singleton cluster: silhouette undefined

    // a(i): mean distance to the other members of the same cluster.
    let intra = 0;
    for (const idx of own) {
      if (idx !== i) {
        intra += distanceFn(vectors[i], vectors[idx]);
      }
    }
    intra /= own.length - 1;

    // b(i): smallest mean distance to any other non-empty cluster.
    let nearest = Infinity;
    for (let c = 0; c < k; c++) {
      if (c === assignments[i]) continue;
      const others = members[c];
      if (others.length === 0) continue;

      let acc = 0;
      for (const idx of others) {
        acc += distanceFn(vectors[i], vectors[idx]);
      }
      nearest = Math.min(nearest, acc / others.length);
    }

    // k === 1 (or all other clusters empty): nothing to compare against.
    if (nearest === Infinity) continue;

    const denom = Math.max(intra, nearest);
    sum += denom === 0 ? 0 : (nearest - intra) / denom;
    counted++;
  }

  return counted > 0 ? sum / counted : 0;
}
|
|
90
|
+
|
|
91
|
+
/**
 * Davies-Bouldin index: mean, over clusters, of the worst-case
 * (scatter_i + scatter_j) / d(centroid_i, centroid_j) ratio.
 *
 * Lower is better; 0 is the ideal. Empty clusters and coincident
 * centroids are skipped rather than producing NaN/Infinity.
 *
 * @param {number[][]} vectors - Input vectors
 * @param {number[]} assignments - Cluster index per vector
 * @param {number[][]} centroids - Cluster centroids
 * @param {Function} [distanceFn=euclideanDistance] - Distance function
 * @returns {number} Davies-Bouldin index (0 when no cluster is evaluable)
 */
export function daviesBouldinIndex(vectors, assignments, centroids, distanceFn = euclideanDistance) {
  const k = centroids.length;

  // Per-cluster mean distance from members to their centroid ("scatter").
  const scatter = new Array(k).fill(0);
  const size = new Array(k).fill(0);

  for (let i = 0; i < vectors.length; i++) {
    const c = assignments[i];
    scatter[c] += distanceFn(vectors[i], centroids[c]);
    size[c] += 1;
  }
  for (let c = 0; c < k; c++) {
    if (size[c] > 0) {
      scatter[c] /= size[c];
    }
  }

  let total = 0;
  let evaluated = 0;

  for (let i = 0; i < k; i++) {
    if (size[i] === 0) continue;

    // Worst similarity ratio between cluster i and any other cluster.
    let worst = 0;
    for (let j = 0; j < k; j++) {
      if (j === i || size[j] === 0) continue;

      const separation = distanceFn(centroids[i], centroids[j]);
      if (separation === 0) continue; // coincident centroids: ratio undefined

      worst = Math.max(worst, (scatter[i] + scatter[j]) / separation);
    }

    total += worst;
    evaluated += 1;
  }

  return evaluated > 0 ? total / evaluated : 0;
}
|
|
146
|
+
|
|
147
|
+
/**
 * Calinski-Harabasz index (variance ratio criterion): ratio of
 * between-cluster dispersion to within-cluster dispersion, each
 * normalized by its degrees of freedom. Higher is better.
 *
 * Returns 0 for the degenerate cases k = 1, k = n, or zero
 * within-cluster dispersion.
 *
 * @param {number[][]} vectors - Input vectors
 * @param {number[]} assignments - Cluster index per vector
 * @param {number[][]} centroids - Cluster centroids
 * @param {Function} [distanceFn=euclideanDistance] - Distance function
 * @returns {number} Calinski-Harabasz index
 */
export function calinskiHarabaszIndex(vectors, assignments, centroids, distanceFn = euclideanDistance) {
  const n = vectors.length;
  const k = centroids.length;

  // Undefined for a single cluster or one point per cluster.
  if (k === 1 || k === n) return 0;

  // Grand centroid: mean of all vectors.
  const dims = vectors[0].length;
  const grand = new Array(dims).fill(0);
  for (const vec of vectors) {
    for (let d = 0; d < dims; d++) {
      grand[d] += vec[d];
    }
  }
  for (let d = 0; d < dims; d++) {
    grand[d] /= n;
  }

  // Cluster sizes.
  const size = new Array(k).fill(0);
  for (let i = 0; i < n; i++) {
    size[assignments[i]] += 1;
  }

  // Between-group sum of squares: size-weighted squared centroid offsets.
  let between = 0;
  for (let c = 0; c < k; c++) {
    if (size[c] === 0) continue;
    const offset = distanceFn(centroids[c], grand);
    between += size[c] * offset * offset;
  }

  // Within-group sum of squares: squared point-to-centroid distances.
  let within = 0;
  for (let i = 0; i < n; i++) {
    const dist = distanceFn(vectors[i], centroids[assignments[i]]);
    within += dist * dist;
  }

  if (within === 0) return 0;

  return (between / (k - 1)) / (within / (n - k));
}
|
|
205
|
+
|
|
206
|
+
/**
 * Gap statistic: compares the log within-cluster dispersion of the
 * actual clustering against the expected dispersion of uniformly
 * distributed reference data drawn from the data's bounding box.
 * A larger gap means the clustering captures more structure than chance.
 *
 * Nondeterministic: reference datasets are sampled with Math.random().
 *
 * @param {number[][]} vectors - Input vectors
 * @param {number[]} assignments - Cluster index per vector
 * @param {number[][]} centroids - Cluster centroids
 * @param {Function} [distanceFn=euclideanDistance] - Distance function
 * @param {number} [nReferences=10] - Number of uniform reference datasets
 * @returns {Promise<Object>} { gap, sk, expectedWk, actualWk } where the Wk
 *   values are log-dispersions and sk is the corrected standard deviation
 */
export async function gapStatistic(vectors, assignments, centroids, distanceFn = euclideanDistance, nReferences = 10) {
  const n = vectors.length;
  const k = centroids.length;
  const dims = vectors[0].length;

  // log(W_k) for the real data; epsilon guards against log(0).
  let actualLogWk = 0;
  for (let i = 0; i < n; i++) {
    const dist = distanceFn(vectors[i], centroids[assignments[i]]);
    actualLogWk += dist * dist;
  }
  actualLogWk = Math.log(actualLogWk + 1e-10);

  // Per-dimension bounding box of the data, for uniform sampling.
  const lo = new Array(dims).fill(Infinity);
  const hi = new Array(dims).fill(-Infinity);
  for (const vec of vectors) {
    for (let d = 0; d < dims; d++) {
      lo[d] = Math.min(lo[d], vec[d]);
      hi[d] = Math.max(hi[d], vec[d]);
    }
  }

  // log(W_k) for each clustered uniform reference dataset.
  const refLogWks = [];
  for (let r = 0; r < nReferences; r++) {
    const sample = [];
    for (let i = 0; i < n; i++) {
      const point = new Array(dims);
      for (let d = 0; d < dims; d++) {
        point[d] = lo[d] + Math.random() * (hi[d] - lo[d]);
      }
      sample.push(point);
    }

    const clustered = kmeans(sample, k, { maxIterations: 50, distanceFn });

    let refWk = 0;
    for (let i = 0; i < n; i++) {
      const dist = distanceFn(sample[i], clustered.centroids[clustered.assignments[i]]);
      refWk += dist * dist;
    }
    refLogWks.push(Math.log(refWk + 1e-10));
  }

  const expectedWk = refLogWks.reduce((acc, v) => acc + v, 0) / nReferences;
  const gap = expectedWk - actualLogWk;

  // Standard deviation of the reference dispersions, with the usual
  // sqrt(1 + 1/B) simulation-error correction.
  const sdk = Math.sqrt(
    refLogWks.reduce((acc, v) => acc + (v - expectedWk) ** 2, 0) / nReferences
  );
  const sk = sdk * Math.sqrt(1 + 1 / nReferences);

  return { gap, sk, expectedWk, actualWk: actualLogWk };
}
|
|
281
|
+
|
|
282
|
+
/**
 * Runs k-means nRuns times with different seeds and measures how
 * consistent the results are. Stable clusterings (similar inertia and
 * similar pairwise assignments across runs) suggest a good choice of K.
 *
 * @param {number[][]} vectors - Input vectors
 * @param {number} k - Number of clusters
 * @param {Object} [options] - { nRuns = 10, distanceFn, ...k-means options }
 * @returns {Object} { avgInertia, stdInertia, cvInertia, avgSimilarity, stability }
 */
export function clusteringStability(vectors, k, options = {}) {
  const { nRuns = 10, distanceFn = euclideanDistance, ...rest } = options;

  const inertias = [];
  const runs = [];

  // One k-means pass per seed so each run starts from a different initialization.
  for (let seed = 0; seed < nRuns; seed++) {
    const result = kmeans(vectors, k, { ...rest, distanceFn, seed });
    inertias.push(result.inertia);
    runs.push(result.assignments);
  }

  // Pairwise agreement between every unordered pair of runs.
  const similarities = [];
  for (let a = 0; a < nRuns - 1; a++) {
    for (let b = a + 1; b < nRuns; b++) {
      similarities.push(calculateAssignmentSimilarity(runs[a], runs[b]));
    }
  }

  const avgInertia = inertias.reduce((acc, v) => acc + v, 0) / nRuns;
  const stdInertia = Math.sqrt(
    inertias.reduce((acc, v) => acc + (v - avgInertia) ** 2, 0) / nRuns
  );

  const avgSimilarity =
    similarities.length > 0
      ? similarities.reduce((acc, v) => acc + v, 0) / similarities.length
      : 1;

  return {
    avgInertia,
    stdInertia,
    // Coefficient of variation; 0 when the average inertia is 0.
    cvInertia: avgInertia !== 0 ? stdInertia / avgInertia : 0,
    avgSimilarity,
    stability: avgSimilarity, // higher means more stable
  };
}
|
|
341
|
+
|
|
342
|
+
/**
 * Pairwise agreement (Rand-index style) between two cluster labelings.
 *
 * For every unordered pair of points, the labelings agree when the pair
 * is either grouped together in both or separated in both. Returns the
 * fraction of agreeing pairs in [0, 1]; 1 when there are fewer than two
 * points (no pairs to compare).
 *
 * @param {number[]} assignments1 - First labeling
 * @param {number[]} assignments2 - Second labeling (same length)
 * @returns {number} Fraction of pairs on which the labelings agree
 */
function calculateAssignmentSimilarity(assignments1, assignments2) {
  const n = assignments1.length;
  let agreeing = 0;

  // Compare each unordered pair's co-membership status under both labelings.
  for (let a = 0; a < n; a++) {
    for (let b = a + 1; b < n; b++) {
      const togetherInFirst = assignments1[a] === assignments1[b];
      const togetherInSecond = assignments2[a] === assignments2[b];
      if (togetherInFirst === togetherInSecond) {
        agreeing += 1;
      }
    }
  }

  const pairs = (n * (n - 1)) / 2;
  return pairs > 0 ? agreeing / pairs : 1;
}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
 * Vector Plugin Error Class
 */

import { PluginError } from '../../errors.js';

/**
 * VectorError class for vector-related errors
 *
 * Tags every error with pluginName 'VectorPlugin' and attaches a verbose
 * default description listing common causes, the supported distance
 * metrics, and S3 storage caveats. Callers may override the description
 * (and pass any other PluginError detail) via `details`.
 *
 * @extends PluginError
 */
export class VectorError extends PluginError {
  /**
   * @param {string} message - Human-readable error message
   * @param {Object} [details] - Detail fields forwarded to PluginError;
   *   `details.operation` is interpolated into the default description and
   *   `details.description`, when provided, replaces the default entirely.
   */
  constructor(message, details = {}) {
    super(message, {
      pluginName: 'VectorPlugin',
      ...details,
      // Placed after the spread so this computed value wins; it still
      // honors an explicit caller-supplied `details.description`.
      description: details.description || `
Vector Plugin Error

Operation: ${details.operation || 'unknown'}

Common causes:
1. Vector dimension mismatch between vectors
2. Invalid distance metric specified (must be: cosine, euclidean, manhattan)
3. Empty vector array provided for clustering
4. k value larger than number of available vectors
5. Vector field not found or invalid in resource
6. Large vectors without proper behavior (use 'body-overflow' or 'body-only')

Available distance metrics:
- cosine: Best for normalized vectors, semantic similarity. Range: [0, 2]
- euclidean: Standard L2 distance, geometric proximity. Range: [0, ∞)
- manhattan: L1 distance, faster computation. Range: [0, ∞)

Storage considerations:
- Vectors > 250 dimensions may exceed S3 metadata limit (2KB)
- Use behavior: 'body-overflow' or 'body-only' for large vectors
- OpenAI ada-002 (1536 dims): ~10KB, requires body storage
- Sentence Transformers (384 dims): ~2.7KB, requires body storage
`.trim()
    });
  }
}
|