s3db.js 11.2.4 → 11.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "s3db.js",
- "version": "11.2.4",
+ "version": "11.2.6",
  "description": "Use AWS S3, the world's most reliable document storage, as a database with this ORM.",
  "main": "dist/s3db.cjs.js",
  "module": "dist/s3db.es.js",
@@ -103,6 +103,7 @@
  "@rollup/plugin-replace": "^6.0.2",
  "@rollup/plugin-terser": "^0.4.4",
  "@types/node": "24.7.0",
+ "@xenova/transformers": "^2.17.2",
  "babel-loader": "^10.0.0",
  "chalk": "^5.6.2",
  "cli-table3": "^0.6.5",
@@ -59,3 +59,73 @@ export const decodeDecimal = s => {
  const num = decPart ? Number(decodedInt + '.' + decPart) : decodedInt;
  return negative ? -num : num;
  };
+
+ /**
+  * Fixed-point encoding optimized for normalized values (typically -1 to 1)
+  * Common in embeddings, similarity scores, probabilities, etc.
+  *
+  * Achieves ~77% compression vs encodeDecimal for embedding vectors.
+  *
+  * @param {number} n - Number to encode (works for any range, optimized for [-1, 1])
+  * @param {number} precision - Decimal places to preserve (default: 6)
+  * @returns {string} Base62-encoded string with '^' prefix to indicate fixed-point encoding
+  *
+  * Examples:
+  *   0.123456 → "^w7f" (4 bytes vs 8 bytes with encodeDecimal)
+  *   -0.8234567 → "^-3sdz" (6 bytes vs 10 bytes)
+  *   1.5 → "^98v9" (for values outside [-1,1], still works but less optimal)
+  */
+ export const encodeFixedPoint = (n, precision = 6) => {
+   if (typeof n !== 'number' || isNaN(n)) return 'undefined';
+   if (!isFinite(n)) return 'undefined';
+
+   const scale = Math.pow(10, precision);
+   const scaled = Math.round(n * scale);
+
+   if (scaled === 0) return '^0';
+
+   const negative = scaled < 0;
+   let num = Math.abs(scaled);
+   let s = '';
+
+   while (num > 0) {
+     s = alphabet[num % base] + s;
+     num = Math.floor(num / base);
+   }
+
+   // Prefix with ^ to distinguish from regular base62
+   return '^' + (negative ? '-' : '') + s;
+ };
+
+ /**
+  * Decodes fixed-point encoded values
+  *
+  * @param {string} s - Encoded string (must start with '^')
+  * @param {number} precision - Decimal places used in encoding (default: 6)
+  * @returns {number} Decoded number
+  */
+ export const decodeFixedPoint = (s, precision = 6) => {
+   if (typeof s !== 'string') return NaN;
+   if (!s.startsWith('^')) return NaN; // Safety check
+
+   s = s.slice(1); // Remove ^ prefix
+
+   if (s === '0') return 0;
+
+   let negative = false;
+   if (s[0] === '-') {
+     negative = true;
+     s = s.slice(1);
+   }
+
+   let r = 0;
+   for (let i = 0; i < s.length; i++) {
+     const idx = charToValue[s[i]];
+     if (idx === undefined) return NaN;
+     r = r * base + idx;
+   }
+
+   const scale = Math.pow(10, precision);
+   const scaled = negative ? -r : r;
+   return scaled / scale;
+ };
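A quick round-trip sketch of the new fixed-point helpers (not part of the published diff; it assumes both functions are exported from the base62 utility module shown above, whose file name the diff viewer omits):

import { encodeFixedPoint, decodeFixedPoint } from './base62.js'; // path is a guess

const encoded = encodeFixedPoint(0.123456);       // '^...' — exact digits depend on the module's base62 alphabet
decodeFixedPoint(encoded);                        // 0.123456

encodeFixedPoint(-0.8234567);                     // sign is kept outside the base62 digits, e.g. '^-...'
decodeFixedPoint(encodeFixedPoint(-0.8234567));   // -0.823457 (rounded to the default 6 decimal places)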
@@ -15,3 +15,4 @@ export * from './replicator.plugin.js'
  export * from './s3-queue.plugin.js'
  export * from './scheduler.plugin.js'
  export * from './state-machine.plugin.js'
+ export * from './vector.plugin.js'
@@ -0,0 +1,173 @@
+ /**
+  * Vector Distance Functions
+  *
+  * Provides distance/similarity calculations for vector operations.
+  * All distance functions return lower values for more similar vectors.
+  */
+
+ /**
+  * Calculate cosine distance between two vectors
+  *
+  * Range: 0 (identical) to 2 (opposite direction)
+  * Best for: Normalized vectors, semantic similarity
+  *
+  * @param {number[]} a - First vector
+  * @param {number[]} b - Second vector
+  * @returns {number} Cosine distance
+  * @throws {Error} If vectors have different dimensions
+  */
+ export function cosineDistance(a, b) {
+   if (a.length !== b.length) {
+     throw new Error(`Dimension mismatch: ${a.length} vs ${b.length}`);
+   }
+
+   let dotProduct = 0;
+   let normA = 0;
+   let normB = 0;
+
+   for (let i = 0; i < a.length; i++) {
+     dotProduct += a[i] * b[i];
+     normA += a[i] * a[i];
+     normB += b[i] * b[i];
+   }
+
+   const denominator = Math.sqrt(normA) * Math.sqrt(normB);
+
+   // Handle zero vectors
+   if (denominator === 0) {
+     return a.every(v => v === 0) && b.every(v => v === 0) ? 0 : 1;
+   }
+
+   const similarity = dotProduct / denominator;
+
+   // Convert similarity [-1, 1] to distance [0, 2]
+   return 1 - similarity;
+ }
+
+ /**
+  * Calculate euclidean (L2) distance between two vectors
+  *
+  * Range: [0, ∞)
+  * Best for: Geometric proximity, continuous data
+  *
+  * @param {number[]} a - First vector
+  * @param {number[]} b - Second vector
+  * @returns {number} Euclidean distance
+  * @throws {Error} If vectors have different dimensions
+  */
+ export function euclideanDistance(a, b) {
+   if (a.length !== b.length) {
+     throw new Error(`Dimension mismatch: ${a.length} vs ${b.length}`);
+   }
+
+   let sum = 0;
+   for (let i = 0; i < a.length; i++) {
+     const diff = a[i] - b[i];
+     sum += diff * diff;
+   }
+
+   return Math.sqrt(sum);
+ }
+
+ /**
+  * Calculate manhattan (L1) distance between two vectors
+  *
+  * Range: [0, ∞)
+  * Best for: Grid-based movement, faster computation
+  *
+  * @param {number[]} a - First vector
+  * @param {number[]} b - Second vector
+  * @returns {number} Manhattan distance
+  * @throws {Error} If vectors have different dimensions
+  */
+ export function manhattanDistance(a, b) {
+   if (a.length !== b.length) {
+     throw new Error(`Dimension mismatch: ${a.length} vs ${b.length}`);
+   }
+
+   let sum = 0;
+   for (let i = 0; i < a.length; i++) {
+     sum += Math.abs(a[i] - b[i]);
+   }
+
+   return sum;
+ }
+
+ /**
+  * Calculate dot product of two vectors
+  *
+  * Higher values indicate more similarity (for normalized vectors)
+  *
+  * @param {number[]} a - First vector
+  * @param {number[]} b - Second vector
+  * @returns {number} Dot product
+  * @throws {Error} If vectors have different dimensions
+  */
+ export function dotProduct(a, b) {
+   if (a.length !== b.length) {
+     throw new Error(`Dimension mismatch: ${a.length} vs ${b.length}`);
+   }
+
+   let sum = 0;
+   for (let i = 0; i < a.length; i++) {
+     sum += a[i] * b[i];
+   }
+
+   return sum;
+ }
+
+ /**
+  * Normalize a vector to unit length (L2 normalization)
+  *
+  * Converts vector to unit vector pointing in same direction.
+  * Useful for cosine similarity calculations.
+  *
+  * @param {number[]} vector - Vector to normalize
+  * @returns {number[]} Normalized vector
+  */
+ export function normalize(vector) {
+   const magnitude = Math.sqrt(
+     vector.reduce((sum, val) => sum + val * val, 0)
+   );
+
+   // Handle zero vector
+   if (magnitude === 0) {
+     return vector.slice(); // Return copy of zero vector
+   }
+
+   return vector.map(val => val / magnitude);
+ }
+
+ /**
+  * Calculate the magnitude (length) of a vector
+  *
+  * @param {number[]} vector - Input vector
+  * @returns {number} Magnitude
+  */
+ export function magnitude(vector) {
+   return Math.sqrt(
+     vector.reduce((sum, val) => sum + val * val, 0)
+   );
+ }
+
+ /**
+  * Check if two vectors are equal within a tolerance
+  *
+  * @param {number[]} a - First vector
+  * @param {number[]} b - Second vector
+  * @param {number} epsilon - Tolerance for floating point comparison
+  * @returns {boolean} True if vectors are equal within tolerance
+  */
+ export function vectorsEqual(a, b, epsilon = 1e-10) {
+   if (a.length !== b.length) {
+     return false;
+   }
+
+   for (let i = 0; i < a.length; i++) {
+     if (Math.abs(a[i] - b[i]) > epsilon) {
+       return false;
+     }
+   }
+
+   return true;
+ }
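For orientation, a few worked calls against the new distance helpers (illustrative only, not part of the diff; './distances.js' is the module name used by the k-means file later in this diff):

import { cosineDistance, euclideanDistance, manhattanDistance, normalize } from './distances.js';

cosineDistance([1, 0, 0], [0, 1, 0]);  // 1 — orthogonal vectors land midway between identical (0) and opposite (2)
cosineDistance([1, 2, 3], [2, 4, 6]);  // 0 — same direction, different magnitude
euclideanDistance([0, 0], [3, 4]);     // 5
manhattanDistance([0, 0], [3, 4]);     // 7
normalize([3, 4]);                     // [0.6, 0.8] — unit length, same direction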
@@ -0,0 +1,367 @@
+ /**
+  * K-Means Clustering Implementation
+  *
+  * Provides k-means clustering with k-means++ initialization
+  * and comprehensive optimal K analysis using multiple metrics.
+  */
+
+ import { euclideanDistance } from './distances.js';
+
+ /**
+  * K-means clustering algorithm
+  *
+  * @param {number[][]} vectors - Array of vectors to cluster
+  * @param {number} k - Number of clusters
+  * @param {Object} options - Configuration options
+  * @param {number} options.maxIterations - Maximum iterations (default: 100)
+  * @param {number} options.tolerance - Convergence tolerance (default: 0.0001)
+  * @param {Function} options.distanceFn - Distance function (default: euclideanDistance)
+  * @param {number|null} options.seed - Random seed for reproducibility (default: null)
+  * @param {Function} options.onIteration - Callback for each iteration (iteration, inertia, converged) (default: null)
+  * @returns {Object} Clustering results
+  * @throws {Error} If invalid parameters
+  */
+ export function kmeans(vectors, k, options = {}) {
+   const {
+     maxIterations = 100,
+     tolerance = 0.0001,
+     distanceFn = euclideanDistance,
+     seed = null,
+     onIteration = null
+   } = options;
+
+   if (vectors.length === 0) {
+     throw new Error('Cannot cluster empty vector array');
+   }
+
+   if (k < 1) {
+     throw new Error(`k must be at least 1, got ${k}`);
+   }
+
+   if (k > vectors.length) {
+     throw new Error(`k (${k}) cannot be greater than number of vectors (${vectors.length})`);
+   }
+
+   const dimensions = vectors[0].length;
+
+   // Validate all vectors have same dimensions
+   for (let i = 1; i < vectors.length; i++) {
+     if (vectors[i].length !== dimensions) {
+       throw new Error(`All vectors must have same dimensions. Expected ${dimensions}, got ${vectors[i].length} at index ${i}`);
+     }
+   }
+
+   // Initialize centroids using k-means++
+   const centroids = initializeCentroidsKMeansPlusPlus(vectors, k, distanceFn, seed);
+
+   let assignments = new Array(vectors.length);
+   let iterations = 0;
+   let converged = false;
+   let previousInertia = Infinity;
+
+   while (!converged && iterations < maxIterations) {
+     // Assign each vector to nearest centroid
+     const newAssignments = vectors.map(vector => {
+       let minDist = Infinity;
+       let nearestCluster = 0;
+
+       for (let i = 0; i < k; i++) {
+         const dist = distanceFn(vector, centroids[i]);
+         if (dist < minDist) {
+           minDist = dist;
+           nearestCluster = i;
+         }
+       }
+
+       return nearestCluster;
+     });
+
+     // Calculate inertia (sum of squared distances to centroids)
+     let inertia = 0;
+     vectors.forEach((vector, i) => {
+       const dist = distanceFn(vector, centroids[newAssignments[i]]);
+       inertia += dist * dist;
+     });
+
+     // Check for convergence
+     const inertiaChange = Math.abs(previousInertia - inertia);
+     converged = inertiaChange < tolerance;
+
+     assignments = newAssignments;
+     previousInertia = inertia;
+
+     // Call onIteration callback if provided
+     if (onIteration) {
+       onIteration(iterations + 1, inertia, converged);
+     }
+
+     if (!converged) {
+       // Recalculate centroids
+       const clusterSums = Array(k).fill(null).map(() => new Array(dimensions).fill(0));
+       const clusterCounts = new Array(k).fill(0);
+
+       vectors.forEach((vector, i) => {
+         const cluster = assignments[i];
+         clusterCounts[cluster]++;
+         vector.forEach((val, j) => {
+           clusterSums[cluster][j] += val;
+         });
+       });
+
+       // Update centroids
+       for (let i = 0; i < k; i++) {
+         if (clusterCounts[i] > 0) {
+           centroids[i] = clusterSums[i].map(sum => sum / clusterCounts[i]);
+         }
+         // If cluster is empty, reinitialize to a random vector
+         else {
+           const randomIdx = Math.floor(Math.random() * vectors.length);
+           centroids[i] = [...vectors[randomIdx]];
+         }
+       }
+     }
+
+     iterations++;
+   }
+
+   // Final inertia calculation
+   let inertia = 0;
+   vectors.forEach((vector, i) => {
+     const dist = distanceFn(vector, centroids[assignments[i]]);
+     inertia += dist * dist;
+   });
+
+   return {
+     centroids,
+     assignments,
+     iterations,
+     converged,
+     inertia
+   };
+ }
+
+ /**
+  * Initialize centroids using k-means++ algorithm
+  *
+  * K-means++ provides better initialization than random selection,
+  * leading to faster convergence and better final clustering.
+  *
+  * @param {number[][]} vectors - Input vectors
+  * @param {number} k - Number of centroids
+  * @param {Function} distanceFn - Distance function
+  * @param {number|null} seed - Random seed
+  * @returns {number[][]} Initial centroids
+  */
+ function initializeCentroidsKMeansPlusPlus(vectors, k, distanceFn, seed) {
+   const centroids = [];
+   const n = vectors.length;
+
+   // Choose first centroid randomly
+   const firstIndex = seed !== null ? seed % n : Math.floor(Math.random() * n);
+   centroids.push([...vectors[firstIndex]]);
+
+   // Choose remaining centroids
+   for (let i = 1; i < k; i++) {
+     // Calculate distance to nearest existing centroid
+     const distances = vectors.map(vector => {
+       return Math.min(...centroids.map(c => distanceFn(vector, c)));
+     });
+
+     // Square distances for probability distribution
+     const squaredDistances = distances.map(d => d * d);
+     const totalSquared = squaredDistances.reduce((a, b) => a + b, 0);
+
+     if (totalSquared === 0) {
+       // All remaining points are identical to existing centroids
+       // Choose randomly
+       const randomIdx = Math.floor(Math.random() * n);
+       centroids.push([...vectors[randomIdx]]);
+       continue;
+     }
+
+     // Choose next centroid with probability proportional to squared distance
+     let threshold = Math.random() * totalSquared;
+     let cumulativeSum = 0;
+
+     for (let j = 0; j < n; j++) {
+       cumulativeSum += squaredDistances[j];
+       if (cumulativeSum >= threshold) {
+         centroids.push([...vectors[j]]);
+         break;
+       }
+     }
+   }
+
+   return centroids;
+ }
+
+ /**
+  * Find optimal K using multiple evaluation metrics
+  *
+  * Analyzes clustering quality across a range of K values using:
+  * - Elbow method (inertia)
+  * - Silhouette score
+  * - Davies-Bouldin index
+  * - Calinski-Harabasz index
+  * - Gap statistic
+  * - Clustering stability
+  *
+  * @param {number[][]} vectors - Vectors to analyze
+  * @param {Object} options - Configuration options
+  * @param {number} options.minK - Minimum K to test (default: 2)
+  * @param {number} options.maxK - Maximum K to test (default: min(10, floor(sqrt(n/2))))
+  * @param {Function} options.distanceFn - Distance function (default: euclideanDistance)
+  * @param {number} options.nReferences - Number of reference datasets for Gap statistic (default: 10)
+  * @param {number} options.stabilityRuns - Number of runs for stability analysis (default: 5)
+  * @returns {Promise<Object>} Analysis results with recommendations
+  */
+ export async function findOptimalK(vectors, options = {}) {
+   const {
+     minK = 2,
+     maxK = Math.min(10, Math.floor(Math.sqrt(vectors.length / 2))),
+     distanceFn = euclideanDistance,
+     nReferences = 10,
+     stabilityRuns = 5,
+     ...kmeansOptions
+   } = options;
+
+   // Dynamic import to avoid circular dependency
+   const metricsModule = await import('./metrics.js');
+   const {
+     silhouetteScore,
+     daviesBouldinIndex,
+     calinskiHarabaszIndex,
+     gapStatistic,
+     clusteringStability
+   } = metricsModule;
+
+   const results = [];
+
+   for (let k = minK; k <= maxK; k++) {
+     // Run k-means
+     const kmeansResult = kmeans(vectors, k, { ...kmeansOptions, distanceFn });
+
+     // Calculate all metrics
+     const silhouette = silhouetteScore(
+       vectors,
+       kmeansResult.assignments,
+       kmeansResult.centroids,
+       distanceFn
+     );
+
+     const daviesBouldin = daviesBouldinIndex(
+       vectors,
+       kmeansResult.assignments,
+       kmeansResult.centroids,
+       distanceFn
+     );
+
+     const calinskiHarabasz = calinskiHarabaszIndex(
+       vectors,
+       kmeansResult.assignments,
+       kmeansResult.centroids,
+       distanceFn
+     );
+
+     const gap = await gapStatistic(
+       vectors,
+       kmeansResult.assignments,
+       kmeansResult.centroids,
+       distanceFn,
+       nReferences
+     );
+
+     const stability = clusteringStability(
+       vectors,
+       k,
+       { ...kmeansOptions, distanceFn, nRuns: stabilityRuns }
+     );
+
+     results.push({
+       k,
+       inertia: kmeansResult.inertia,
+       silhouette,
+       daviesBouldin,
+       calinskiHarabasz,
+       gap: gap.gap,
+       gapSk: gap.sk,
+       stability: stability.stability,
+       cvInertia: stability.cvInertia,
+       iterations: kmeansResult.iterations,
+       converged: kmeansResult.converged
+     });
+   }
+
+   // Calculate elbow point
+   const elbowK = findElbowPoint(results.map(r => r.inertia));
+
+   // Find best K by each metric
+   const recommendations = {
+     elbow: minK + elbowK,
+     silhouette: results.reduce((best, curr) =>
+       curr.silhouette > best.silhouette ? curr : best
+     ).k,
+     daviesBouldin: results.reduce((best, curr) =>
+       curr.daviesBouldin < best.daviesBouldin ? curr : best
+     ).k,
+     calinskiHarabasz: results.reduce((best, curr) =>
+       curr.calinskiHarabasz > best.calinskiHarabasz ? curr : best
+     ).k,
+     gap: results.reduce((best, curr) =>
+       curr.gap > best.gap ? curr : best
+     ).k,
+     stability: results.reduce((best, curr) =>
+       curr.stability > best.stability ? curr : best
+     ).k
+   };
+
+   // Calculate consensus (most common recommendation)
+   const votes = Object.values(recommendations);
+   const consensus = votes.reduce((acc, k) => {
+     acc[k] = (acc[k] || 0) + 1;
+     return acc;
+   }, {});
+
+   const consensusK = parseInt(
+     Object.entries(consensus).reduce((a, b) => b[1] > a[1] ? b : a)[0]
+   );
+
+   return {
+     results,
+     recommendations,
+     consensus: consensusK,
+     summary: {
+       analysisRange: `${minK}-${maxK}`,
+       totalVectors: vectors.length,
+       dimensions: vectors[0].length,
+       recommendation: consensusK,
+       confidence: consensus[consensusK] / votes.length
+     }
+   };
+ }
+
+ /**
+  * Find elbow point using method of maximum curvature
+  *
+  * @param {number[]} inertias - Array of inertia values
+  * @returns {number} Index of elbow point
+  */
+ function findElbowPoint(inertias) {
+   const n = inertias.length;
+   if (n < 3) return 0;
+
+   let maxCurvature = -Infinity;
+   let elbowIndex = 0;
+
+   for (let i = 1; i < n - 1; i++) {
+     // Calculate second derivative (curvature approximation)
+     const curvature = inertias[i - 1] - 2 * inertias[i] + inertias[i + 1];
+
+     if (curvature > maxCurvature) {
+       maxCurvature = curvature;
+       elbowIndex = i;
+     }
+   }
+
+   return elbowIndex;
+ }
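A small end-to-end sketch of the new clustering entry points (illustrative, not part of the diff; the data is made up, and findOptimalK additionally depends on a './metrics.js' module that is not shown in this diff):

import { kmeans, findOptimalK } from './kmeans.js'; // file name assumed; the diff viewer omits it

const points = [
  [0.0, 0.0], [0.1, 0.2], [0.2, 0.1],
  [5.0, 5.0], [5.1, 4.9], [4.9, 5.2]
];

// Two well-separated groups, so k = 2 converges in a handful of iterations
const result = kmeans(points, 2, { maxIterations: 50, tolerance: 1e-4 });
result.assignments;  // e.g. [0, 0, 0, 1, 1, 1] (cluster labels may be swapped between runs)
result.centroids;    // roughly [[0.1, 0.1], [5.0, 5.03]]
result.inertia;      // sum of squared distances to the assigned centroids

// Scan k = 2..3 and take the consensus across the elbow, silhouette,
// Davies-Bouldin, Calinski-Harabasz, gap and stability recommendations
const analysis = await findOptimalK(points, { minK: 2, maxK: 3 });
analysis.consensus;        // recommended k
analysis.recommendations;  // per-metric picks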