bun-scikit 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52):
  1. package/LICENSE +21 -0
  2. package/README.md +187 -0
  3. package/binding.gyp +21 -0
  4. package/docs/README.md +7 -0
  5. package/docs/native-abi.md +53 -0
  6. package/index.ts +1 -0
  7. package/package.json +76 -0
  8. package/scripts/build-node-addon.ts +26 -0
  9. package/scripts/build-zig-kernels.ts +50 -0
  10. package/scripts/check-api-docs-coverage.ts +52 -0
  11. package/scripts/check-benchmark-health.ts +140 -0
  12. package/scripts/install-native.ts +160 -0
  13. package/scripts/package-native-artifacts.ts +62 -0
  14. package/scripts/sync-benchmark-readme.ts +181 -0
  15. package/scripts/update-benchmark-history.ts +91 -0
  16. package/src/ensemble/RandomForestClassifier.ts +136 -0
  17. package/src/ensemble/RandomForestRegressor.ts +136 -0
  18. package/src/index.ts +32 -0
  19. package/src/linear_model/LinearRegression.ts +136 -0
  20. package/src/linear_model/LogisticRegression.ts +260 -0
  21. package/src/linear_model/SGDClassifier.ts +161 -0
  22. package/src/linear_model/SGDRegressor.ts +104 -0
  23. package/src/metrics/classification.ts +294 -0
  24. package/src/metrics/regression.ts +51 -0
  25. package/src/model_selection/GridSearchCV.ts +244 -0
  26. package/src/model_selection/KFold.ts +82 -0
  27. package/src/model_selection/RepeatedKFold.ts +49 -0
  28. package/src/model_selection/RepeatedStratifiedKFold.ts +50 -0
  29. package/src/model_selection/StratifiedKFold.ts +112 -0
  30. package/src/model_selection/StratifiedShuffleSplit.ts +211 -0
  31. package/src/model_selection/crossValScore.ts +165 -0
  32. package/src/model_selection/trainTestSplit.ts +82 -0
  33. package/src/naive_bayes/GaussianNB.ts +148 -0
  34. package/src/native/node-addon/bun_scikit_addon.cpp +450 -0
  35. package/src/native/zigKernels.ts +576 -0
  36. package/src/neighbors/KNeighborsClassifier.ts +85 -0
  37. package/src/pipeline/ColumnTransformer.ts +203 -0
  38. package/src/pipeline/FeatureUnion.ts +123 -0
  39. package/src/pipeline/Pipeline.ts +168 -0
  40. package/src/preprocessing/MinMaxScaler.ts +113 -0
  41. package/src/preprocessing/OneHotEncoder.ts +91 -0
  42. package/src/preprocessing/PolynomialFeatures.ts +158 -0
  43. package/src/preprocessing/RobustScaler.ts +149 -0
  44. package/src/preprocessing/SimpleImputer.ts +150 -0
  45. package/src/preprocessing/StandardScaler.ts +92 -0
  46. package/src/svm/LinearSVC.ts +117 -0
  47. package/src/tree/DecisionTreeClassifier.ts +394 -0
  48. package/src/tree/DecisionTreeRegressor.ts +407 -0
  49. package/src/types.ts +18 -0
  50. package/src/utils/linalg.ts +209 -0
  51. package/src/utils/validation.ts +78 -0
  52. package/zig/kernels.zig +1327 -0
@@ -0,0 +1,394 @@
1
+ import type { ClassificationModel, Matrix, Vector } from "../types";
2
+ import {
3
+ assertConsistentRowSize,
4
+ assertFiniteMatrix,
5
+ assertFiniteVector,
6
+ validateClassificationInputs,
7
+ } from "../utils/validation";
8
+ import { accuracyScore } from "../metrics/classification";
9
+
10
/**
 * How many features are considered when searching for a split at each node.
 *
 * - `"sqrt"` / `"log2"`: the floored square root / base-2 logarithm of the
 *   feature count, never less than one.
 * - a number: floored and clamped to the range `[1, featureCount]`.
 * - `null`: every feature is considered.
 */
export type MaxFeaturesOption = "sqrt" | "log2" | number | null;
11
+
12
/** Hyper-parameters accepted by the `DecisionTreeClassifier` constructor. */
export interface DecisionTreeClassifierOptions {
  /** Depth at which nodes stop splitting. The constructor defaults this to 12. */
  maxDepth?: number;
  /** Nodes with fewer samples than this become leaves. Defaults to 2. */
  minSamplesSplit?: number;
  /** Minimum number of samples each child of a split must receive. Defaults to 1. */
  minSamplesLeaf?: number;
  /** Number of features examined per split. Defaults to `null` (all features). */
  maxFeatures?: MaxFeaturesOption;
  /** Seed for feature sub-sampling; when omitted `Math.random` is used. */
  randomState?: number;
}
19
+
20
/**
 * One node of the fitted tree.
 *
 * Leaves carry only `prediction`; internal nodes additionally carry the split
 * (`featureIndex`, `threshold`) and both children.
 */
interface TreeNode {
  /** Majority class of the training samples that reached this node. */
  prediction: 0 | 1;
  /** Column tested by this node (internal nodes only). */
  featureIndex?: number;
  /** Samples with `value <= threshold` descend to `left`, the rest to `right`. */
  threshold?: number;
  /** Subtree for samples at or below the threshold. */
  left?: TreeNode;
  /** Subtree for samples above the threshold. */
  right?: TreeNode;
  /** `true` when the node has no children. */
  isLeaf: boolean;
}
28
+
29
/** Best split found for a single feature. */
interface SplitEvaluation {
  /** Cut point; samples with `value <= threshold` go to the left child. */
  threshold: number;
  /** Sample-weighted Gini impurity of the two children produced by the cut. */
  impurity: number;
}
33
+
34
/** Training-row indices routed to each side of a split. */
interface SplitPartition {
  /** Rows whose feature value is at or below the threshold. */
  leftIndices: number[];
  /** Rows whose feature value is above the threshold. */
  rightIndices: number[];
}
38
+
39
/**
 * Upper bound on the number of equal-width bins used when scanning a feature
 * for candidate thresholds; it also sizes the per-tree histogram buffers.
 */
const MAX_THRESHOLD_BINS = 128;
40
+
41
/**
 * Creates a deterministic mulberry32 pseudo-random number generator.
 *
 * @param seed - Any number; it is coerced to an unsigned 32-bit integer.
 * @returns A function that yields a float in the half-open interval [0, 1)
 *   on every call. Two generators built from the same seed produce the same
 *   sequence.
 */
function mulberry32(seed: number): () => number {
  let state = seed >>> 0;
  return () => {
    // Wrap the counter to 32 bits on every step. If it is left to grow as a
    // double it passes 2^53 after a few million draws, additions start
    // dropping low-order bits, and the output sequence degrades.
    state = (state + 0x6d2b79f5) >>> 0;
    let t = Math.imul(state ^ (state >>> 15), 1 | state);
    t ^= t + Math.imul(t ^ (t >>> 7), 61 | t);
    return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
  };
}
50
+
51
/**
 * Gini impurity of a two-class sample.
 *
 * @param positiveCount - Number of samples labelled 1.
 * @param sampleCount - Total number of samples.
 * @returns `1 - p1^2 - p0^2`, where `p1` is the positive share and
 *   `p0 = 1 - p1`; an empty sample is defined to have impurity 0.
 */
function giniImpurity(positiveCount: number, sampleCount: number): number {
  if (sampleCount === 0) {
    return 0;
  }
  const p1 = positiveCount / sampleCount;
  const p0 = 1 - p1;
  return 1 - p1 * p1 - p0 * p0;
}
59
+
60
/**
 * Binary decision-tree classifier trained by greedy, histogram-based Gini
 * minimisation.
 *
 * Targets equal to `1` form the positive class; every other target value is
 * encoded as class `0`. For each candidate feature the observed value range
 * is cut into equal-width bins and only bin edges are tried as thresholds.
 */
export class DecisionTreeClassifier implements ClassificationModel {
  /** Labels the tree can predict. */
  classes_: Vector = [0, 1];
  private readonly maxDepth: number;
  private readonly minSamplesSplit: number;
  private readonly minSamplesLeaf: number;
  private readonly maxFeatures: MaxFeaturesOption;
  private readonly randomState?: number;
  private random: () => number = Math.random;
  private root: TreeNode | null = null;
  // Row-major copy of the training matrix; only populated while `fit` runs.
  private flattenedXTrain: Float64Array | null = null;
  // 0/1 encoded targets; only populated while `fit` runs.
  private yBinaryTrain: Uint8Array | null = null;
  private featureCount = 0;
  private allFeatureIndices: number[] = [];
  private featureSelectionMarks: Uint8Array | null = null;
  // Scratch buffers reused by every threshold search to avoid reallocation.
  private binTotals: Uint32Array = new Uint32Array(MAX_THRESHOLD_BINS);
  private binPositives: Uint32Array = new Uint32Array(MAX_THRESHOLD_BINS);
  private binThresholds: Float64Array = new Float64Array(MAX_THRESHOLD_BINS);

  /**
   * @param options - Hyper-parameters; omitted fields default to
   *   `maxDepth = 12`, `minSamplesSplit = 2`, `minSamplesLeaf = 1`,
   *   `maxFeatures = null` (all features) and an unseeded random source.
   * @throws Error when `maxFeatures` is `NaN`.
   */
  constructor(options: DecisionTreeClassifierOptions = {}) {
    this.maxDepth = options.maxDepth ?? 12;
    this.minSamplesSplit = options.minSamplesSplit ?? 2;
    this.minSamplesLeaf = options.minSamplesLeaf ?? 1;
    this.maxFeatures = options.maxFeatures ?? null;
    this.randomState = options.randomState;

    // NaN would otherwise surface during fit as "Invalid array length".
    if (typeof this.maxFeatures === "number" && Number.isNaN(this.maxFeatures)) {
      throw new Error("maxFeatures must not be NaN.");
    }
  }

  /**
   * Builds the tree from the training data.
   *
   * @param X - Training matrix, one row per sample.
   * @param y - Targets; values equal to 1 are positive, all others negative.
   * @param sampleIndices - Optional row indices (repeats allowed) to train
   *   on instead of every row of `X`.
   * @param skipValidation - When `true`, `validateClassificationInputs` is
   *   not called.
   * @param flattenedXTrain - Optional pre-flattened row-major copy of `X`.
   * @param yBinaryTrain - Optional pre-encoded 0/1 targets.
   * @returns This instance, to allow chaining.
   * @throws Error when `X` is empty, `sampleIndices` is empty or contains an
   *   out-of-range or non-integer index, or a training buffer is too short.
   */
  fit(
    X: Matrix,
    y: Vector,
    sampleIndices?: ArrayLike<number>,
    skipValidation = false,
    flattenedXTrain?: Float64Array,
    yBinaryTrain?: Uint8Array,
  ): this {
    if (!skipValidation) {
      validateClassificationInputs(X, y);
    }
    if (X.length === 0) {
      throw new Error("X must contain at least one sample.");
    }

    const sampleCount = X.length;
    const featureCount = X[0].length;

    // Validate everything before touching instance state so that a rejected
    // call leaves a previously fitted model untouched.
    let rootIndices: number[];
    if (sampleIndices) {
      if (sampleIndices.length === 0) {
        throw new Error("sampleIndices must not be empty.");
      }
      for (let i = 0; i < sampleIndices.length; i += 1) {
        const index = sampleIndices[i];
        if (!Number.isInteger(index) || index < 0 || index >= sampleCount) {
          throw new Error(`sampleIndices contains invalid index: ${index}.`);
        }
      }
      rootIndices = Array.from(sampleIndices);
    } else {
      rootIndices = Array.from({ length: sampleCount }, (_, index) => index);
    }

    const flattened = flattenedXTrain ?? this.flattenTrainingMatrix(X, featureCount);
    const targets = yBinaryTrain ?? this.buildBinaryTargets(y);
    if (flattened.length < sampleCount * featureCount) {
      throw new Error(
        `flattenedXTrain is too short. Expected at least ${sampleCount * featureCount} values, got ${flattened.length}.`,
      );
    }
    if (targets.length < sampleCount) {
      throw new Error(
        `Target vector is too short. Expected at least ${sampleCount} values, got ${targets.length}.`,
      );
    }

    this.featureCount = featureCount;
    this.allFeatureIndices = Array.from({ length: featureCount }, (_, index) => index);
    this.featureSelectionMarks = new Uint8Array(featureCount);
    this.random = this.randomState === undefined ? Math.random : mulberry32(this.randomState);
    this.flattenedXTrain = flattened;
    this.yBinaryTrain = targets;

    try {
      this.root = this.buildTree(rootIndices, 0);
    } finally {
      // Prediction only needs the tree; do not keep the training data alive.
      this.flattenedXTrain = null;
      this.yBinaryTrain = null;
    }
    return this;
  }

  /**
   * Predicts a class (0 or 1) for every row of `X`.
   *
   * @param X - Samples with the same number of columns as the training data.
   * @returns One prediction per row; an empty array when `X` has no rows.
   * @throws Error when the model is not fitted or the column count differs
   *   from the one seen during `fit`.
   */
  predict(X: Matrix): Vector {
    const root = this.root;
    if (!root || this.featureCount === 0) {
      throw new Error("DecisionTreeClassifier has not been fitted.");
    }

    assertConsistentRowSize(X);
    assertFiniteMatrix(X);

    if (X.length === 0) {
      return [];
    }

    if (X[0].length !== this.featureCount) {
      throw new Error(
        `Feature size mismatch. Expected ${this.featureCount}, got ${X[0].length}.`,
      );
    }

    return X.map((sample) => this.predictOne(sample, root));
  }

  /**
   * Computes the accuracy of `predict(X)` against `y`.
   *
   * @param X - Samples to classify.
   * @param y - Expected labels.
   * @returns The value returned by `accuracyScore(y, predictions)`.
   */
  score(X: Matrix, y: Vector): number {
    assertFiniteVector(y);
    return accuracyScore(y, this.predict(X));
  }

  /** Walks from `node` down to a leaf and returns that leaf's prediction. */
  private predictOne(sample: Vector, node: TreeNode): 0 | 1 {
    let current = node;
    while (!current.isLeaf) {
      const { featureIndex, threshold, left, right } = current;
      if (featureIndex === undefined || threshold === undefined || !left || !right) {
        // Malformed internal node: fall back to its own majority class.
        break;
      }
      current = sample[featureIndex] <= threshold ? left : right;
    }
    return current.prediction;
  }

  /**
   * Recursively grows the subtree for the training rows in `indices`.
   *
   * A leaf is produced when the node is pure, `maxDepth` is reached, fewer
   * than `minSamplesSplit` rows remain, or no split lowers the impurity.
   */
  private buildTree(indices: number[], depth: number): TreeNode {
    const y = this.yBinaryTrain!;
    const sampleCount = indices.length;
    let positiveCount = 0;
    for (let i = 0; i < sampleCount; i += 1) {
      positiveCount += y[indices[i]];
    }
    // Ties are resolved in favour of the positive class.
    const prediction: 0 | 1 = positiveCount * 2 >= sampleCount ? 1 : 0;

    const isPure = positiveCount === 0 || positiveCount === sampleCount;
    if (isPure || depth >= this.maxDepth || sampleCount < this.minSamplesSplit) {
      return { isLeaf: true, prediction };
    }

    const candidateFeatures = this.selectFeatureIndices(this.featureCount);
    const parentImpurity = giniImpurity(positiveCount, sampleCount);

    let bestFeature = -1;
    let bestSplit: SplitEvaluation | null = null;
    for (const featureIndex of candidateFeatures) {
      const split = this.findBestThreshold(indices, featureIndex);
      if (split && (!bestSplit || split.impurity < bestSplit.impurity)) {
        bestFeature = featureIndex;
        bestSplit = split;
      }
    }

    // Require a real improvement; the epsilon absorbs floating-point noise.
    if (!bestSplit || bestFeature === -1 || bestSplit.impurity >= parentImpurity - 1e-12) {
      return { isLeaf: true, prediction };
    }

    const partition = this.partitionIndices(indices, bestFeature, bestSplit.threshold);
    if (!partition) {
      return { isLeaf: true, prediction };
    }

    return {
      isLeaf: false,
      prediction,
      featureIndex: bestFeature,
      threshold: bestSplit.threshold,
      left: this.buildTree(partition.leftIndices, depth + 1),
      right: this.buildTree(partition.rightIndices, depth + 1),
    };
  }

  /** Translates the `maxFeatures` option into a feature count in `[1, featureCount]`. */
  private resolveMaxFeatures(featureCount: number): number {
    const option = this.maxFeatures;
    if (option === null) {
      return featureCount;
    }
    if (option === "sqrt") {
      return Math.max(1, Math.floor(Math.sqrt(featureCount)));
    }
    if (option === "log2") {
      return Math.max(1, Math.floor(Math.log2(featureCount)));
    }
    return Math.max(1, Math.min(featureCount, Math.floor(option)));
  }

  /**
   * Chooses the features examined at one node.
   *
   * Returns the shared list of all features when no sub-sampling is needed;
   * otherwise draws distinct indices by rejection sampling from `this.random`.
   */
  private selectFeatureIndices(featureCount: number): number[] {
    const k = this.resolveMaxFeatures(featureCount);
    if (k >= featureCount) {
      return this.allFeatureIndices;
    }

    const marks = this.featureSelectionMarks!;
    marks.fill(0);

    const selected = new Array<number>(k);
    let selectedCount = 0;
    while (selectedCount < k) {
      const candidate = Math.floor(this.random() * featureCount);
      if (marks[candidate] !== 0) {
        continue;
      }
      marks[candidate] = 1;
      selected[selectedCount] = candidate;
      selectedCount += 1;
    }

    return selected;
  }

  /**
   * Finds the bin-edge threshold on `featureIndex` with the lowest weighted
   * Gini impurity for the rows in `indices`.
   *
   * @returns The best threshold and its impurity, or `null` when the feature
   *   is constant over these rows or no edge satisfies `minSamplesLeaf`.
   */
  private findBestThreshold(indices: number[], featureIndex: number): SplitEvaluation | null {
    const x = this.flattenedXTrain!;
    const y = this.yBinaryTrain!;
    const stride = this.featureCount;
    const sampleCount = indices.length;

    let minValue = Number.POSITIVE_INFINITY;
    let maxValue = Number.NEGATIVE_INFINITY;
    let totalPositive = 0;
    for (let i = 0; i < sampleCount; i += 1) {
      const sampleIndex = indices[i];
      const value = x[sampleIndex * stride + featureIndex];
      if (value < minValue) {
        minValue = value;
      }
      if (value > maxValue) {
        maxValue = value;
      }
      totalPositive += y[sampleIndex];
    }

    const range = maxValue - minValue;
    if (
      !Number.isFinite(minValue) ||
      !Number.isFinite(maxValue) ||
      minValue === maxValue ||
      !Number.isFinite(range)
    ) {
      return null;
    }

    const dynamicBins = Math.floor(Math.sqrt(sampleCount));
    const binCount = Math.max(16, Math.min(MAX_THRESHOLD_BINS, dynamicBins));
    const binTotals = this.binTotals;
    const binPositives = this.binPositives;
    const thresholds = this.binThresholds;
    binTotals.fill(0, 0, binCount);
    binPositives.fill(0, 0, binCount);
    // thresholds[b] is the upper edge of bin b and the cut point tried after it.
    for (let bin = 0; bin < binCount; bin += 1) {
      thresholds[bin] = minValue + (range * (bin + 1)) / binCount;
    }

    for (let i = 0; i < sampleCount; i += 1) {
      const sampleIndex = indices[i];
      const value = x[sampleIndex * stride + featureIndex];
      let bin = Math.floor(((value - minValue) / range) * binCount);
      if (!(bin >= 0)) {
        bin = 0;
      } else if (bin >= binCount) {
        bin = binCount - 1;
      }
      // Snap to the bin whose edges satisfy thresholds[bin - 1] < value <= thresholds[bin].
      // The tree routes `value <= threshold` to the left, so a value sitting
      // exactly on an edge must be counted with the lower bin; otherwise the
      // impurity scored here would describe a different split than the one
      // later applied by partitionIndices and predictOne.
      while (bin > 0 && value <= thresholds[bin - 1]) {
        bin -= 1;
      }
      while (bin < binCount - 1 && value > thresholds[bin]) {
        bin += 1;
      }
      binTotals[bin] += 1;
      binPositives[bin] += y[sampleIndex];
    }

    let leftCount = 0;
    let leftPositive = 0;
    let bestImpurity = Number.POSITIVE_INFINITY;
    let bestThreshold = 0;

    // The last edge equals the maximum value and would leave the right side empty.
    for (let bin = 0; bin < binCount - 1; bin += 1) {
      leftCount += binTotals[bin];
      leftPositive += binPositives[bin];
      const rightCount = sampleCount - leftCount;

      if (leftCount < this.minSamplesLeaf || rightCount < this.minSamplesLeaf) {
        continue;
      }

      const rightPositive = totalPositive - leftPositive;
      const impurity =
        (leftCount / sampleCount) * giniImpurity(leftPositive, leftCount) +
        (rightCount / sampleCount) * giniImpurity(rightPositive, rightCount);

      if (impurity < bestImpurity) {
        bestImpurity = impurity;
        bestThreshold = thresholds[bin];
      }
    }

    if (!Number.isFinite(bestImpurity)) {
      return null;
    }

    return {
      threshold: bestThreshold,
      impurity: bestImpurity,
    };
  }

  /**
   * Splits `indices` into rows at or below `threshold` and rows above it.
   *
   * @returns Both index lists, or `null` when either side would hold fewer
   *   than `minSamplesLeaf` rows.
   */
  private partitionIndices(
    indices: number[],
    featureIndex: number,
    threshold: number,
  ): SplitPartition | null {
    const x = this.flattenedXTrain!;
    const stride = this.featureCount;
    const leftIndices: number[] = [];
    const rightIndices: number[] = [];
    for (const sampleIndex of indices) {
      if (x[sampleIndex * stride + featureIndex] <= threshold) {
        leftIndices.push(sampleIndex);
      } else {
        rightIndices.push(sampleIndex);
      }
    }

    if (leftIndices.length < this.minSamplesLeaf || rightIndices.length < this.minSamplesLeaf) {
      return null;
    }

    return { leftIndices, rightIndices };
  }

  /** Copies `X` into a row-major `Float64Array` with `featureCount` values per row. */
  private flattenTrainingMatrix(X: Matrix, featureCount: number): Float64Array {
    const sampleCount = X.length;
    const flattened = new Float64Array(sampleCount * featureCount);
    for (let i = 0; i < sampleCount; i += 1) {
      const row = X[i];
      const rowOffset = i * featureCount;
      for (let j = 0; j < featureCount; j += 1) {
        flattened[rowOffset + j] = row[j];
      }
    }
    return flattened;
  }

  /** Encodes targets as bytes: 1 for values strictly equal to 1, otherwise 0. */
  private buildBinaryTargets(y: Vector): Uint8Array {
    const encoded = new Uint8Array(y.length);
    for (let i = 0; i < y.length; i += 1) {
      encoded[i] = y[i] === 1 ? 1 : 0;
    }
    return encoded;
  }
}