s3db.js 11.2.4 → 11.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/s3db-cli.js +588 -74
- package/dist/s3db.cjs.js +1361 -23
- package/dist/s3db.cjs.js.map +1 -1
- package/dist/s3db.d.ts +3 -1
- package/dist/s3db.es.js +1359 -24
- package/dist/s3db.es.js.map +1 -1
- package/package.json +2 -1
- package/src/concerns/base62.js +70 -0
- package/src/plugins/index.js +1 -0
- package/src/plugins/vector/distances.js +173 -0
- package/src/plugins/vector/kmeans.js +367 -0
- package/src/plugins/vector/metrics.js +369 -0
- package/src/plugins/vector/vector-error.js +43 -0
- package/src/plugins/vector.plugin.js +687 -0
- package/src/resource.class.js +79 -0
- package/src/s3db.d.ts +3 -1
- package/src/schema.class.js +232 -41
- package/src/validator.class.js +8 -0
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "s3db.js",
-  "version": "11.2.4",
+  "version": "11.2.6",
   "description": "Use AWS S3, the world's most reliable document storage, as a database with this ORM.",
   "main": "dist/s3db.cjs.js",
   "module": "dist/s3db.es.js",
@@ -103,6 +103,7 @@
     "@rollup/plugin-replace": "^6.0.2",
     "@rollup/plugin-terser": "^0.4.4",
     "@types/node": "24.7.0",
+    "@xenova/transformers": "^2.17.2",
     "babel-loader": "^10.0.0",
    "chalk": "^5.6.2",
     "cli-table3": "^0.6.5",
package/src/concerns/base62.js
CHANGED
@@ -59,3 +59,73 @@ export const decodeDecimal = s => {
   const num = decPart ? Number(decodedInt + '.' + decPart) : decodedInt;
   return negative ? -num : num;
 };
+
+/**
+ * Fixed-point encoding optimized for normalized values (typically -1 to 1)
+ * Common in embeddings, similarity scores, probabilities, etc.
+ *
+ * Achieves ~77% compression vs encodeDecimal for embedding vectors.
+ *
+ * @param {number} n - Number to encode (works for any range, optimized for [-1, 1])
+ * @param {number} precision - Decimal places to preserve (default: 6)
+ * @returns {string} Base62-encoded string with '^' prefix to indicate fixed-point encoding
+ *
+ * Examples:
+ *   0.123456 → "^w7f" (4 bytes vs 8 bytes with encodeDecimal)
+ *   -0.8234567 → "^-3sdz" (6 bytes vs 10 bytes)
+ *   1.5 → "^98v9" (for values outside [-1,1], still works but less optimal)
+ */
+export const encodeFixedPoint = (n, precision = 6) => {
+  if (typeof n !== 'number' || isNaN(n)) return 'undefined';
+  if (!isFinite(n)) return 'undefined';
+
+  const scale = Math.pow(10, precision);
+  const scaled = Math.round(n * scale);
+
+  if (scaled === 0) return '^0';
+
+  const negative = scaled < 0;
+  let num = Math.abs(scaled);
+  let s = '';
+
+  while (num > 0) {
+    s = alphabet[num % base] + s;
+    num = Math.floor(num / base);
+  }
+
+  // Prefix with ^ to distinguish from regular base62
+  return '^' + (negative ? '-' : '') + s;
+};
+
+/**
+ * Decodes fixed-point encoded values
+ *
+ * @param {string} s - Encoded string (must start with '^')
+ * @param {number} precision - Decimal places used in encoding (default: 6)
+ * @returns {number} Decoded number
+ */
+export const decodeFixedPoint = (s, precision = 6) => {
+  if (typeof s !== 'string') return NaN;
+  if (!s.startsWith('^')) return NaN; // Safety check
+
+  s = s.slice(1); // Remove ^ prefix
+
+  if (s === '0') return 0;
+
+  let negative = false;
+  if (s[0] === '-') {
+    negative = true;
+    s = s.slice(1);
+  }
+
+  let r = 0;
+  for (let i = 0; i < s.length; i++) {
+    const idx = charToValue[s[i]];
+    if (idx === undefined) return NaN;
+    r = r * base + idx;
+  }
+
+  const scale = Math.pow(10, precision);
+  const scaled = negative ? -r : r;
+  return scaled / scale;
+};
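
Note: a minimal round-trip sketch of the two new exports, for orientation only. This is hypothetical usage, not part of the published diff; the import path is illustrative, and the exact encoded strings depend on the module's existing base-62 alphabet/charToValue tables.

// Hypothetical usage sketch of the new fixed-point helpers (not from the diff).
import { encodeFixedPoint, decodeFixedPoint } from './src/concerns/base62.js';

const value = -0.8234567;
const token = encodeFixedPoint(value, 6);     // compact string, e.g. "^-3sdz" per the JSDoc above
const roundTrip = decodeFixedPoint(token, 6); // -0.823457 (value rounded to 6 decimal places)
console.log(token, roundTrip);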
package/src/plugins/vector/distances.js
ADDED
@@ -0,0 +1,173 @@
+/**
+ * Vector Distance Functions
+ *
+ * Provides distance/similarity calculations for vector operations.
+ * All distance functions return lower values for more similar vectors.
+ */
+
+/**
+ * Calculate cosine distance between two vectors
+ *
+ * Range: 0 (identical) to 2 (opposite direction)
+ * Best for: Normalized vectors, semantic similarity
+ *
+ * @param {number[]} a - First vector
+ * @param {number[]} b - Second vector
+ * @returns {number} Cosine distance
+ * @throws {Error} If vectors have different dimensions
+ */
+export function cosineDistance(a, b) {
+  if (a.length !== b.length) {
+    throw new Error(`Dimension mismatch: ${a.length} vs ${b.length}`);
+  }
+
+  let dotProduct = 0;
+  let normA = 0;
+  let normB = 0;
+
+  for (let i = 0; i < a.length; i++) {
+    dotProduct += a[i] * b[i];
+    normA += a[i] * a[i];
+    normB += b[i] * b[i];
+  }
+
+  const denominator = Math.sqrt(normA) * Math.sqrt(normB);
+
+  // Handle zero vectors
+  if (denominator === 0) {
+    return a.every(v => v === 0) && b.every(v => v === 0) ? 0 : 1;
+  }
+
+  const similarity = dotProduct / denominator;
+
+  // Convert similarity [-1, 1] to distance [0, 2]
+  return 1 - similarity;
+}
+
+/**
+ * Calculate euclidean (L2) distance between two vectors
+ *
+ * Range: [0, ∞)
+ * Best for: Geometric proximity, continuous data
+ *
+ * @param {number[]} a - First vector
+ * @param {number[]} b - Second vector
+ * @returns {number} Euclidean distance
+ * @throws {Error} If vectors have different dimensions
+ */
+export function euclideanDistance(a, b) {
+  if (a.length !== b.length) {
+    throw new Error(`Dimension mismatch: ${a.length} vs ${b.length}`);
+  }
+
+  let sum = 0;
+  for (let i = 0; i < a.length; i++) {
+    const diff = a[i] - b[i];
+    sum += diff * diff;
+  }
+
+  return Math.sqrt(sum);
+}
+
+/**
+ * Calculate manhattan (L1) distance between two vectors
+ *
+ * Range: [0, ∞)
+ * Best for: Grid-based movement, faster computation
+ *
+ * @param {number[]} a - First vector
+ * @param {number[]} b - Second vector
+ * @returns {number} Manhattan distance
+ * @throws {Error} If vectors have different dimensions
+ */
+export function manhattanDistance(a, b) {
+  if (a.length !== b.length) {
+    throw new Error(`Dimension mismatch: ${a.length} vs ${b.length}`);
+  }
+
+  let sum = 0;
+  for (let i = 0; i < a.length; i++) {
+    sum += Math.abs(a[i] - b[i]);
+  }
+
+  return sum;
+}
+
+/**
+ * Calculate dot product of two vectors
+ *
+ * Higher values indicate more similarity (for normalized vectors)
+ *
+ * @param {number[]} a - First vector
+ * @param {number[]} b - Second vector
+ * @returns {number} Dot product
+ * @throws {Error} If vectors have different dimensions
+ */
+export function dotProduct(a, b) {
+  if (a.length !== b.length) {
+    throw new Error(`Dimension mismatch: ${a.length} vs ${b.length}`);
+  }
+
+  let sum = 0;
+  for (let i = 0; i < a.length; i++) {
+    sum += a[i] * b[i];
+  }
+
+  return sum;
+}
+
+/**
+ * Normalize a vector to unit length (L2 normalization)
+ *
+ * Converts vector to unit vector pointing in same direction.
+ * Useful for cosine similarity calculations.
+ *
+ * @param {number[]} vector - Vector to normalize
+ * @returns {number[]} Normalized vector
+ */
+export function normalize(vector) {
+  const magnitude = Math.sqrt(
+    vector.reduce((sum, val) => sum + val * val, 0)
+  );
+
+  // Handle zero vector
+  if (magnitude === 0) {
+    return vector.slice(); // Return copy of zero vector
+  }
+
+  return vector.map(val => val / magnitude);
+}
+
+/**
+ * Calculate the magnitude (length) of a vector
+ *
+ * @param {number[]} vector - Input vector
+ * @returns {number} Magnitude
+ */
+export function magnitude(vector) {
+  return Math.sqrt(
+    vector.reduce((sum, val) => sum + val * val, 0)
+  );
+}
+
+/**
+ * Check if two vectors are equal within a tolerance
+ *
+ * @param {number[]} a - First vector
+ * @param {number[]} b - Second vector
+ * @param {number} epsilon - Tolerance for floating point comparison
+ * @returns {boolean} True if vectors are equal within tolerance
+ */
+export function vectorsEqual(a, b, epsilon = 1e-10) {
+  if (a.length !== b.length) {
+    return false;
+  }
+
+  for (let i = 0; i < a.length; i++) {
+    if (Math.abs(a[i] - b[i]) > epsilon) {
+      return false;
+    }
+  }
+
+  return true;
+}
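
Note: a short usage sketch showing how these helpers compose. This is hypothetical, not part of the published diff; the import path is illustrative.

// Hypothetical usage sketch of the new distance helpers (not from the diff).
import { cosineDistance, euclideanDistance, normalize } from './src/plugins/vector/distances.js';

const a = normalize([1, 2, 3]);
const b = normalize([2, 4, 6]);               // same direction as a, different magnitude
console.log(cosineDistance(a, b));            // ~0: identical direction
console.log(euclideanDistance(a, b));         // ~0: identical after L2 normalization
console.log(cosineDistance([1, 0], [0, 1]));  // 1: orthogonal vectors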
package/src/plugins/vector/kmeans.js
ADDED

@@ -0,0 +1,367 @@
+/**
+ * K-Means Clustering Implementation
+ *
+ * Provides k-means clustering with k-means++ initialization
+ * and comprehensive optimal K analysis using multiple metrics.
+ */
+
+import { euclideanDistance } from './distances.js';
+
+/**
+ * K-means clustering algorithm
+ *
+ * @param {number[][]} vectors - Array of vectors to cluster
+ * @param {number} k - Number of clusters
+ * @param {Object} options - Configuration options
+ * @param {number} options.maxIterations - Maximum iterations (default: 100)
+ * @param {number} options.tolerance - Convergence tolerance (default: 0.0001)
+ * @param {Function} options.distanceFn - Distance function (default: euclideanDistance)
+ * @param {number|null} options.seed - Random seed for reproducibility (default: null)
+ * @param {Function} options.onIteration - Callback for each iteration (iteration, inertia, converged) (default: null)
+ * @returns {Object} Clustering results
+ * @throws {Error} If invalid parameters
+ */
+export function kmeans(vectors, k, options = {}) {
+  const {
+    maxIterations = 100,
+    tolerance = 0.0001,
+    distanceFn = euclideanDistance,
+    seed = null,
+    onIteration = null
+  } = options;
+
+  if (vectors.length === 0) {
+    throw new Error('Cannot cluster empty vector array');
+  }
+
+  if (k < 1) {
+    throw new Error(`k must be at least 1, got ${k}`);
+  }
+
+  if (k > vectors.length) {
+    throw new Error(`k (${k}) cannot be greater than number of vectors (${vectors.length})`);
+  }
+
+  const dimensions = vectors[0].length;
+
+  // Validate all vectors have same dimensions
+  for (let i = 1; i < vectors.length; i++) {
+    if (vectors[i].length !== dimensions) {
+      throw new Error(`All vectors must have same dimensions. Expected ${dimensions}, got ${vectors[i].length} at index ${i}`);
+    }
+  }
+
+  // Initialize centroids using k-means++
+  const centroids = initializeCentroidsKMeansPlusPlus(vectors, k, distanceFn, seed);
+
+  let assignments = new Array(vectors.length);
+  let iterations = 0;
+  let converged = false;
+  let previousInertia = Infinity;
+
+  while (!converged && iterations < maxIterations) {
+    // Assign each vector to nearest centroid
+    const newAssignments = vectors.map(vector => {
+      let minDist = Infinity;
+      let nearestCluster = 0;
+
+      for (let i = 0; i < k; i++) {
+        const dist = distanceFn(vector, centroids[i]);
+        if (dist < minDist) {
+          minDist = dist;
+          nearestCluster = i;
+        }
+      }
+
+      return nearestCluster;
+    });
+
+    // Calculate inertia (sum of squared distances to centroids)
+    let inertia = 0;
+    vectors.forEach((vector, i) => {
+      const dist = distanceFn(vector, centroids[newAssignments[i]]);
+      inertia += dist * dist;
+    });
+
+    // Check for convergence
+    const inertiaChange = Math.abs(previousInertia - inertia);
+    converged = inertiaChange < tolerance;
+
+    assignments = newAssignments;
+    previousInertia = inertia;
+
+    // Call onIteration callback if provided
+    if (onIteration) {
+      onIteration(iterations + 1, inertia, converged);
+    }
+
+    if (!converged) {
+      // Recalculate centroids
+      const clusterSums = Array(k).fill(null).map(() => new Array(dimensions).fill(0));
+      const clusterCounts = new Array(k).fill(0);
+
+      vectors.forEach((vector, i) => {
+        const cluster = assignments[i];
+        clusterCounts[cluster]++;
+        vector.forEach((val, j) => {
+          clusterSums[cluster][j] += val;
+        });
+      });
+
+      // Update centroids
+      for (let i = 0; i < k; i++) {
+        if (clusterCounts[i] > 0) {
+          centroids[i] = clusterSums[i].map(sum => sum / clusterCounts[i]);
+        }
+        // If cluster is empty, reinitialize to a random vector
+        else {
+          const randomIdx = Math.floor(Math.random() * vectors.length);
+          centroids[i] = [...vectors[randomIdx]];
+        }
+      }
+    }
+
+    iterations++;
+  }
+
+  // Final inertia calculation
+  let inertia = 0;
+  vectors.forEach((vector, i) => {
+    const dist = distanceFn(vector, centroids[assignments[i]]);
+    inertia += dist * dist;
+  });
+
+  return {
+    centroids,
+    assignments,
+    iterations,
+    converged,
+    inertia
+  };
+}
+
+/**
+ * Initialize centroids using k-means++ algorithm
+ *
+ * K-means++ provides better initialization than random selection,
+ * leading to faster convergence and better final clustering.
+ *
+ * @param {number[][]} vectors - Input vectors
+ * @param {number} k - Number of centroids
+ * @param {Function} distanceFn - Distance function
+ * @param {number|null} seed - Random seed
+ * @returns {number[][]} Initial centroids
+ */
+function initializeCentroidsKMeansPlusPlus(vectors, k, distanceFn, seed) {
+  const centroids = [];
+  const n = vectors.length;
+
+  // Choose first centroid randomly
+  const firstIndex = seed !== null ? seed % n : Math.floor(Math.random() * n);
+  centroids.push([...vectors[firstIndex]]);
+
+  // Choose remaining centroids
+  for (let i = 1; i < k; i++) {
+    // Calculate distance to nearest existing centroid
+    const distances = vectors.map(vector => {
+      return Math.min(...centroids.map(c => distanceFn(vector, c)));
+    });
+
+    // Square distances for probability distribution
+    const squaredDistances = distances.map(d => d * d);
+    const totalSquared = squaredDistances.reduce((a, b) => a + b, 0);
+
+    if (totalSquared === 0) {
+      // All remaining points are identical to existing centroids
+      // Choose randomly
+      const randomIdx = Math.floor(Math.random() * n);
+      centroids.push([...vectors[randomIdx]]);
+      continue;
+    }
+
+    // Choose next centroid with probability proportional to squared distance
+    let threshold = Math.random() * totalSquared;
+    let cumulativeSum = 0;
+
+    for (let j = 0; j < n; j++) {
+      cumulativeSum += squaredDistances[j];
+      if (cumulativeSum >= threshold) {
+        centroids.push([...vectors[j]]);
+        break;
+      }
+    }
+  }
+
+  return centroids;
+}
+
+/**
+ * Find optimal K using multiple evaluation metrics
+ *
+ * Analyzes clustering quality across a range of K values using:
+ * - Elbow method (inertia)
+ * - Silhouette score
+ * - Davies-Bouldin index
+ * - Calinski-Harabasz index
+ * - Gap statistic
+ * - Clustering stability
+ *
+ * @param {number[][]} vectors - Vectors to analyze
+ * @param {Object} options - Configuration options
+ * @param {number} options.minK - Minimum K to test (default: 2)
+ * @param {number} options.maxK - Maximum K to test (default: sqrt(n/2))
+ * @param {Function} options.distanceFn - Distance function (default: euclideanDistance)
+ * @param {number} options.nReferences - Number of reference datasets for Gap statistic (default: 10)
+ * @param {number} options.stabilityRuns - Number of runs for stability analysis (default: 5)
+ * @returns {Promise<Object>} Analysis results with recommendations
+ */
+export async function findOptimalK(vectors, options = {}) {
+  const {
+    minK = 2,
+    maxK = Math.min(10, Math.floor(Math.sqrt(vectors.length / 2))),
+    distanceFn = euclideanDistance,
+    nReferences = 10,
+    stabilityRuns = 5,
+    ...kmeansOptions
+  } = options;
+
+  // Dynamic import to avoid circular dependency
+  const metricsModule = await import('./metrics.js');
+  const {
+    silhouetteScore,
+    daviesBouldinIndex,
+    calinskiHarabaszIndex,
+    gapStatistic,
+    clusteringStability
+  } = metricsModule;
+
+  const results = [];
+
+  for (let k = minK; k <= maxK; k++) {
+    // Run k-means
+    const kmeansResult = kmeans(vectors, k, { ...kmeansOptions, distanceFn });
+
+    // Calculate all metrics
+    const silhouette = silhouetteScore(
+      vectors,
+      kmeansResult.assignments,
+      kmeansResult.centroids,
+      distanceFn
+    );
+
+    const daviesBouldin = daviesBouldinIndex(
+      vectors,
+      kmeansResult.assignments,
+      kmeansResult.centroids,
+      distanceFn
+    );
+
+    const calinskiHarabasz = calinskiHarabaszIndex(
+      vectors,
+      kmeansResult.assignments,
+      kmeansResult.centroids,
+      distanceFn
+    );
+
+    const gap = await gapStatistic(
+      vectors,
+      kmeansResult.assignments,
+      kmeansResult.centroids,
+      distanceFn,
+      nReferences
+    );
+
+    const stability = clusteringStability(
+      vectors,
+      k,
+      { ...kmeansOptions, distanceFn, nRuns: stabilityRuns }
+    );
+
+    results.push({
+      k,
+      inertia: kmeansResult.inertia,
+      silhouette,
+      daviesBouldin,
+      calinskiHarabasz,
+      gap: gap.gap,
+      gapSk: gap.sk,
+      stability: stability.stability,
+      cvInertia: stability.cvInertia,
+      iterations: kmeansResult.iterations,
+      converged: kmeansResult.converged
+    });
+  }
+
+  // Calculate elbow point
+  const elbowK = findElbowPoint(results.map(r => r.inertia));
+
+  // Find best K by each metric
+  const recommendations = {
+    elbow: minK + elbowK,
+    silhouette: results.reduce((best, curr) =>
+      curr.silhouette > best.silhouette ? curr : best
+    ).k,
+    daviesBouldin: results.reduce((best, curr) =>
+      curr.daviesBouldin < best.daviesBouldin ? curr : best
+    ).k,
+    calinskiHarabasz: results.reduce((best, curr) =>
+      curr.calinskiHarabasz > best.calinskiHarabasz ? curr : best
+    ).k,
+    gap: results.reduce((best, curr) =>
+      curr.gap > best.gap ? curr : best
+    ).k,
+    stability: results.reduce((best, curr) =>
+      curr.stability > best.stability ? curr : best
+    ).k
+  };
+
+  // Calculate consensus (most common recommendation)
+  const votes = Object.values(recommendations);
+  const consensus = votes.reduce((acc, k) => {
+    acc[k] = (acc[k] || 0) + 1;
+    return acc;
+  }, {});
+
+  const consensusK = parseInt(
+    Object.entries(consensus).reduce((a, b) => b[1] > a[1] ? b : a)[0]
+  );
+
+  return {
+    results,
+    recommendations,
+    consensus: consensusK,
+    summary: {
+      analysisRange: `${minK}-${maxK}`,
+      totalVectors: vectors.length,
+      dimensions: vectors[0].length,
+      recommendation: consensusK,
+      confidence: consensus[consensusK] / votes.length
+    }
+  };
+}
+
+/**
+ * Find elbow point using method of maximum curvature
+ *
+ * @param {number[]} inertias - Array of inertia values
+ * @returns {number} Index of elbow point
+ */
+function findElbowPoint(inertias) {
+  const n = inertias.length;
+  if (n < 3) return 0;
+
+  let maxCurvature = -Infinity;
+  let elbowIndex = 0;
+
+  for (let i = 1; i < n - 1; i++) {
+    // Calculate second derivative (curvature approximation)
+    const curvature = inertias[i - 1] - 2 * inertias[i] + inertias[i + 1];
+
+    if (curvature > maxCurvature) {
+      maxCurvature = curvature;
+      elbowIndex = i;
+    }
+  }
+
+  return elbowIndex;
+}
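
Note: a hypothetical usage sketch tying the two exports together (not part of the published diff; the import path is illustrative, and findOptimalK additionally loads ./metrics.js dynamically as shown above).

// Hypothetical usage sketch of kmeans() and findOptimalK() (not from the diff).
import { kmeans, findOptimalK } from './src/plugins/vector/kmeans.js';

const vectors = [
  [0, 0], [0.1, 0.2], [0.2, 0.1],   // points near the origin
  [5, 5], [5.1, 4.9], [4.8, 5.2]    // points near (5, 5)
];

// Plain clustering with a fixed k
const result = kmeans(vectors, 2, { maxIterations: 50 });
console.log(result.centroids, result.assignments, result.inertia, result.converged);

// Optimal-K analysis; async because metrics.js is imported at runtime
const analysis = await findOptimalK(vectors, { minK: 2, maxK: 3 });
console.log(analysis.recommendations, analysis.consensus, analysis.summary);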