s3db.js 11.2.4 → 11.2.5

@@ -0,0 +1,369 @@
+ /**
+  * Clustering Evaluation Metrics
+  *
+  * Provides multiple metrics for evaluating clustering quality
+  * and determining optimal number of clusters (K).
+  */
+
+ import { euclideanDistance } from './distances.js';
+ import { kmeans } from './kmeans.js';
+
+ /**
+  * Calculate Silhouette Score for clustering quality
+  *
+  * Measures how similar each point is to its own cluster compared to other clusters.
+  *
+  * Range: [-1, 1]
+  * - Close to 1: Well clustered
+  * - Close to 0: On border between clusters
+  * - Negative: Likely in wrong cluster
+  *
+  * @param {number[][]} vectors - Input vectors
+  * @param {number[]} assignments - Cluster assignments
+  * @param {number[][]} centroids - Cluster centroids
+  * @param {Function} distanceFn - Distance function
+  * @returns {number} Average silhouette score
+  */
+ export function silhouetteScore(vectors, assignments, centroids, distanceFn = euclideanDistance) {
+   const k = centroids.length;
+   const n = vectors.length;
+
+   // Group vectors by cluster
+   const clusters = Array(k).fill(null).map(() => []);
+   vectors.forEach((vector, i) => {
+     clusters[assignments[i]].push(i);
+   });
+
+   let totalScore = 0;
+   let validPoints = 0;
+
+   // Handle case where all points are in different clusters
+   if (clusters.every(c => c.length <= 1)) {
+     return 0;
+   }
+
+   for (let i = 0; i < n; i++) {
+     const clusterIdx = assignments[i];
+     const cluster = clusters[clusterIdx];
+
+     // Skip singleton clusters
+     if (cluster.length === 1) continue;
+
+     // a(i): Average distance to points in same cluster
+     let a = 0;
+     for (const j of cluster) {
+       if (i !== j) {
+         a += distanceFn(vectors[i], vectors[j]);
+       }
+     }
+     a /= (cluster.length - 1);
+
+     // b(i): Minimum average distance to points in other clusters
+     let b = Infinity;
+     for (let otherCluster = 0; otherCluster < k; otherCluster++) {
+       if (otherCluster === clusterIdx) continue;
+
+       const otherPoints = clusters[otherCluster];
+       if (otherPoints.length === 0) continue;
+
+       let avgDist = 0;
+       for (const j of otherPoints) {
+         avgDist += distanceFn(vectors[i], vectors[j]);
+       }
+       avgDist /= otherPoints.length;
+
+       b = Math.min(b, avgDist);
+     }
+
+     // If no other clusters exist (k=1), skip this point
+     if (b === Infinity) continue;
+
+     // Silhouette coefficient for point i
+     const maxAB = Math.max(a, b);
+     const s = maxAB === 0 ? 0 : (b - a) / maxAB;
+     totalScore += s;
+     validPoints++;
+   }
+
+   return validPoints > 0 ? totalScore / validPoints : 0;
+ }
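
A minimal usage sketch, not part of the diff: the module path ('./evaluation.js') is assumed, and the `{ centroids, assignments }` result shape is inferred from how this file itself consumes `kmeans` further below.

import { kmeans } from './kmeans.js';
import { silhouetteScore } from './evaluation.js'; // file name assumed

const vectors = [[0, 0], [0, 1], [10, 10], [10, 11]];
const { centroids, assignments } = kmeans(vectors, 2);

// Near 1: compact, well-separated clusters; near 0: borderline; negative: likely misassigned.
console.log(silhouetteScore(vectors, assignments, centroids));
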
+
+ /**
+  * Calculate Davies-Bouldin Index
+  *
+  * Measures average similarity between each cluster and its most similar cluster.
+  * Lower is better (minimum 0).
+  *
+  * @param {number[][]} vectors - Input vectors
+  * @param {number[]} assignments - Cluster assignments
+  * @param {number[][]} centroids - Cluster centroids
+  * @param {Function} distanceFn - Distance function
+  * @returns {number} Davies-Bouldin index
+  */
+ export function daviesBouldinIndex(vectors, assignments, centroids, distanceFn = euclideanDistance) {
+   const k = centroids.length;
+
+   // Calculate average distance from points to their centroid (cluster scatter)
+   const scatters = new Array(k).fill(0);
+   const clusterCounts = new Array(k).fill(0);
+
+   vectors.forEach((vector, i) => {
+     const cluster = assignments[i];
+     scatters[cluster] += distanceFn(vector, centroids[cluster]);
+     clusterCounts[cluster]++;
+   });
+
+   for (let i = 0; i < k; i++) {
+     if (clusterCounts[i] > 0) {
+       scatters[i] /= clusterCounts[i];
+     }
+   }
+
+   // Calculate Davies-Bouldin index
+   let dbIndex = 0;
+   let validClusters = 0;
+
+   for (let i = 0; i < k; i++) {
+     if (clusterCounts[i] === 0) continue;
+
+     let maxRatio = 0;
+     for (let j = 0; j < k; j++) {
+       if (i === j || clusterCounts[j] === 0) continue;
+
+       const centroidDist = distanceFn(centroids[i], centroids[j]);
+       if (centroidDist === 0) continue;
+
+       const ratio = (scatters[i] + scatters[j]) / centroidDist;
+       maxRatio = Math.max(maxRatio, ratio);
+     }
+
+     dbIndex += maxRatio;
+     validClusters++;
+   }
+
+   return validClusters > 0 ? dbIndex / validClusters : 0;
+ }
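
An illustrative sketch of how this index is typically used to pick K (the helper name is hypothetical, not from the package; the `kmeans` result shape is assumed as above):

// Sweep K and keep the clustering with the lowest Davies-Bouldin index (lower is better).
function pickKByDaviesBouldin(vectors, maxK = 8) {
  let best = { k: 2, index: Infinity };
  for (let k = 2; k <= Math.min(maxK, vectors.length - 1); k++) {
    const { centroids, assignments } = kmeans(vectors, k);
    const index = daviesBouldinIndex(vectors, assignments, centroids);
    if (index < best.index) best = { k, index };
  }
  return best;
}
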
+
+ /**
+  * Calculate Calinski-Harabasz Index (Variance Ratio Criterion)
+  *
+  * Ratio of between-cluster dispersion to within-cluster dispersion.
+  * Higher is better.
+  *
+  * @param {number[][]} vectors - Input vectors
+  * @param {number[]} assignments - Cluster assignments
+  * @param {number[][]} centroids - Cluster centroids
+  * @param {Function} distanceFn - Distance function
+  * @returns {number} Calinski-Harabasz index
+  */
+ export function calinskiHarabaszIndex(vectors, assignments, centroids, distanceFn = euclideanDistance) {
+   const n = vectors.length;
+   const k = centroids.length;
+
+   if (k === 1 || k === n) return 0;
+
+   // Calculate overall centroid
+   const dimensions = vectors[0].length;
+   const overallCentroid = new Array(dimensions).fill(0);
+
+   vectors.forEach(vector => {
+     vector.forEach((val, dim) => {
+       overallCentroid[dim] += val;
+     });
+   });
+
+   overallCentroid.forEach((val, dim, arr) => {
+     arr[dim] = val / n;
+   });
+
+   // Calculate between-cluster dispersion (BGSS)
+   const clusterCounts = new Array(k).fill(0);
+   vectors.forEach((vector, i) => {
+     clusterCounts[assignments[i]]++;
+   });
+
+   let bgss = 0;
+   for (let i = 0; i < k; i++) {
+     if (clusterCounts[i] === 0) continue;
+     const dist = distanceFn(centroids[i], overallCentroid);
+     bgss += clusterCounts[i] * dist * dist;
+   }
+
+   // Calculate within-cluster dispersion (WCSS)
+   let wcss = 0;
+   vectors.forEach((vector, i) => {
+     const cluster = assignments[i];
+     const dist = distanceFn(vector, centroids[cluster]);
+     wcss += dist * dist;
+   });
+
+   if (wcss === 0) return 0;
+
+   // Calinski-Harabasz index
+   return (bgss / (k - 1)) / (wcss / (n - k));
+ }
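
The three internal metrics above read in different directions, which matters when combining them. A sketch (continuing the earlier example's `vectors`; not from the package):

const { centroids, assignments } = kmeans(vectors, 3);
const report = {
  silhouette: silhouetteScore(vectors, assignments, centroids),            // [-1, 1], higher is better
  daviesBouldin: daviesBouldinIndex(vectors, assignments, centroids),      // >= 0, lower is better
  calinskiHarabasz: calinskiHarabaszIndex(vectors, assignments, centroids) // >= 0, higher is better
};
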
+
+ /**
+  * Calculate Gap Statistic
+  *
+  * Compares clustering to random uniform distribution.
+  * Higher gap indicates better clustering.
+  *
+  * @param {number[][]} vectors - Input vectors
+  * @param {number[]} assignments - Cluster assignments
+  * @param {number[][]} centroids - Cluster centroids
+  * @param {Function} distanceFn - Distance function
+  * @param {number} nReferences - Number of reference datasets
+  * @returns {Promise<Object>} Gap statistic results
+  */
+ export async function gapStatistic(vectors, assignments, centroids, distanceFn = euclideanDistance, nReferences = 10) {
+   const n = vectors.length;
+   const k = centroids.length;
+   const dimensions = vectors[0].length;
+
+   // Calculate within-cluster dispersion for actual data
+   let wk = 0;
+   vectors.forEach((vector, i) => {
+     const dist = distanceFn(vector, centroids[assignments[i]]);
+     wk += dist * dist;
+   });
+   wk = Math.log(wk + 1e-10); // Add small value to avoid log(0)
+
+   // Generate reference datasets and calculate their dispersions
+   const referenceWks = [];
+
+   // Find min/max for each dimension to create uniform distribution
+   const mins = new Array(dimensions).fill(Infinity);
+   const maxs = new Array(dimensions).fill(-Infinity);
+
+   vectors.forEach(vector => {
+     vector.forEach((val, dim) => {
+       mins[dim] = Math.min(mins[dim], val);
+       maxs[dim] = Math.max(maxs[dim], val);
+     });
+   });
+
+   // Generate reference datasets
+   for (let ref = 0; ref < nReferences; ref++) {
+     const refVectors = [];
+
+     for (let i = 0; i < n; i++) {
+       const refVector = new Array(dimensions);
+       for (let dim = 0; dim < dimensions; dim++) {
+         refVector[dim] = mins[dim] + Math.random() * (maxs[dim] - mins[dim]);
+       }
+       refVectors.push(refVector);
+     }
+
+     // Cluster reference data
+     const refResult = kmeans(refVectors, k, { maxIterations: 50, distanceFn });
+
+     let refWk = 0;
+     refVectors.forEach((vector, i) => {
+       const dist = distanceFn(vector, refResult.centroids[refResult.assignments[i]]);
+       refWk += dist * dist;
+     });
+     referenceWks.push(Math.log(refWk + 1e-10));
+   }
+
+   // Calculate gap statistic
+   const expectedWk = referenceWks.reduce((a, b) => a + b, 0) / nReferences;
+   const gap = expectedWk - wk;
+
+   // Calculate standard deviation
+   const sdk = Math.sqrt(
+     referenceWks.reduce((sum, wk) => sum + Math.pow(wk - expectedWk, 2), 0) / nReferences
+   );
+   const sk = sdk * Math.sqrt(1 + 1 / nReferences);
+
+   return { gap, sk, expectedWk, actualWk: wk };
+ }
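
The returned `sk` is what the standard 1-standard-error rule (Tibshirani et al.) uses: choose the smallest K whose gap is at least the gap at K+1 minus `sk` at K+1. A sketch with a hypothetical helper, assuming the caller has already collected `{ gap, sk }` per K:

// `results[i]` is assumed to hold { gap, sk } for k = kMin + i.
function pickKByGap(results, kMin = 1) {
  for (let i = 0; i < results.length - 1; i++) {
    if (results[i].gap >= results[i + 1].gap - results[i + 1].sk) {
      return kMin + i;
    }
  }
  return kMin + results.length - 1;
}
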
+
+ /**
+  * Analyze clustering stability across multiple runs
+  *
+  * Higher stability (lower variance) indicates better K.
+  *
+  * @param {number[][]} vectors - Input vectors
+  * @param {number} k - Number of clusters
+  * @param {Object} options - Configuration options
+  * @returns {Object} Stability metrics
+  */
+ export function clusteringStability(vectors, k, options = {}) {
+   const {
+     nRuns = 10,
+     distanceFn = euclideanDistance,
+     ...kmeansOptions
+   } = options;
+
+   const inertias = [];
+   const allAssignments = [];
+
+   // Run k-means multiple times with different initializations
+   for (let run = 0; run < nRuns; run++) {
+     const result = kmeans(vectors, k, {
+       ...kmeansOptions,
+       distanceFn,
+       seed: run // Different seed for each run
+     });
+
+     inertias.push(result.inertia);
+     allAssignments.push(result.assignments);
+   }
+
+   // Calculate pairwise assignment similarity
+   const assignmentSimilarities = [];
+   for (let i = 0; i < nRuns - 1; i++) {
+     for (let j = i + 1; j < nRuns; j++) {
+       const similarity = calculateAssignmentSimilarity(allAssignments[i], allAssignments[j]);
+       assignmentSimilarities.push(similarity);
+     }
+   }
+
+   // Calculate statistics
+   const avgInertia = inertias.reduce((a, b) => a + b, 0) / nRuns;
+   const stdInertia = Math.sqrt(
+     inertias.reduce((sum, val) => sum + Math.pow(val - avgInertia, 2), 0) / nRuns
+   );
+
+   const avgSimilarity = assignmentSimilarities.length > 0
+     ? assignmentSimilarities.reduce((a, b) => a + b, 0) / assignmentSimilarities.length
+     : 1;
+
+   return {
+     avgInertia,
+     stdInertia,
+     cvInertia: avgInertia !== 0 ? stdInertia / avgInertia : 0, // Coefficient of variation
+     avgSimilarity,
+     stability: avgSimilarity // Higher is more stable
+   };
+ }
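
Usage sketch (illustrative, continuing the earlier `vectors`; not part of the diff): run the stability check for several candidate K values and prefer the one whose repeated runs agree most.

const candidates = [2, 3, 4, 5].map(k => ({
  k,
  ...clusteringStability(vectors, k, { nRuns: 10 })
}));
candidates.sort((a, b) => b.stability - a.stability);
console.log('Most stable K:', candidates[0].k);
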
+
+ /**
+  * Calculate similarity between two assignment arrays
+  *
+  * Returns a value between 0 and 1 (the Rand index): the fraction of point
+  * pairs on which both assignments agree, i.e. the pair is placed in the same
+  * cluster in both runs, or in different clusters in both runs.
+  *
+  * @param {number[]} assignments1 - First assignment array
+  * @param {number[]} assignments2 - Second assignment array
+  * @returns {number} Similarity score [0, 1]
+  */
+ function calculateAssignmentSimilarity(assignments1, assignments2) {
+   const n = assignments1.length;
+   let matches = 0;
+
+   // Count pairs of points on which both assignments agree
+   // (grouped together in both, or separated in both)
+   for (let i = 0; i < n; i++) {
+     for (let j = i + 1; j < n; j++) {
+       const sameCluster1 = assignments1[i] === assignments1[j];
+       const sameCluster2 = assignments2[i] === assignments2[j];
+       if (sameCluster1 === sameCluster2) {
+         matches++;
+       }
+     }
+   }
+
+   const totalPairs = (n * (n - 1)) / 2;
+   return totalPairs > 0 ? matches / totalPairs : 1;
+ }
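
A worked example of the arithmetic (the helper is internal to the module; the calls below only illustrate what it computes):

// Two runs that merely relabel the clusters still agree on every pair: similarity 1.
calculateAssignmentSimilarity([0, 0, 1, 1], [1, 1, 0, 0]); // => 1
// Moving one point to another cluster flips 3 of the 6 pairs: similarity 0.5.
calculateAssignmentSimilarity([0, 0, 1, 1], [0, 0, 0, 1]); // => 0.5
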
@@ -0,0 +1,43 @@
+ /**
+  * Vector Plugin Error Class
+  */
+
+ import { PluginError } from '../../errors.js';
+
+ /**
+  * VectorError class for vector-related errors
+  *
+  * @extends PluginError
+  */
+ export class VectorError extends PluginError {
+   constructor(message, details = {}) {
+     super(message, {
+       pluginName: 'VectorPlugin',
+       ...details,
+       description: details.description || `
+ Vector Plugin Error
+
+ Operation: ${details.operation || 'unknown'}
+
+ Common causes:
+ 1. Vector dimension mismatch between vectors
+ 2. Invalid distance metric specified (must be: cosine, euclidean, manhattan)
+ 3. Empty vector array provided for clustering
+ 4. k value larger than number of available vectors
+ 5. Vector field not found or invalid in resource
+ 6. Large vectors without proper behavior (use 'body-overflow' or 'body-only')
+
+ Available distance metrics:
+ - cosine: Best for normalized vectors, semantic similarity. Range: [0, 2]
+ - euclidean: Standard L2 distance, geometric proximity. Range: [0, ∞)
+ - manhattan: L1 distance, faster computation. Range: [0, ∞)
+
+ Storage considerations:
+ - Vectors > 250 dimensions may exceed S3 metadata limit (2KB)
+ - Use behavior: 'body-overflow' or 'body-only' for large vectors
+ - OpenAI ada-002 (1536 dims): ~10KB, requires body storage
+ - Sentence Transformers (384 dims): ~2.7KB, requires body storage
+ `.trim()
+     });
+   }
+ }
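
A hypothetical throw site, not part of the diff: only the `(message, details)` constructor shape and the `operation` field come from the code above; the module path and the extra detail fields are illustrative.

import { VectorError } from './vector-error.js'; // path assumed

function assertSameDimensions(a, b) {
  if (a.length !== b.length) {
    throw new VectorError(
      `Vector dimension mismatch: ${a.length} vs ${b.length}`,
      { operation: 'distance', expected: a.length, received: b.length }
    );
  }
}
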