npm - bun-scikit - Versions diffs - 0.1.3 → 0.1.4 - Mend

bun-scikit 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

package/README.md +73 -137
package/package.json +2 -2
package/scripts/check-benchmark-health.ts +62 -1
package/scripts/sync-benchmark-readme.ts +56 -0
package/src/dummy/DummyClassifier.ts +190 -0
package/src/dummy/DummyRegressor.ts +108 -0
package/src/feature_selection/VarianceThreshold.ts +88 -0
package/src/index.ts +23 -0
package/src/metrics/classification.ts +30 -0
package/src/metrics/regression.ts +40 -0
package/src/model_selection/RandomizedSearchCV.ts +269 -0
package/src/native/node-addon/bun_scikit_addon.cpp +149 -0
package/src/native/zigKernels.ts +33 -4
package/src/preprocessing/Binarizer.ts +46 -0
package/src/preprocessing/LabelEncoder.ts +62 -0
package/src/preprocessing/MaxAbsScaler.ts +77 -0
package/src/preprocessing/Normalizer.ts +66 -0
package/src/tree/DecisionTreeClassifier.ts +146 -3
package/zig/kernels.zig +63 -40

package/src/tree/DecisionTreeClassifier.ts CHANGED

@@ -6,6 +6,7 @@ import {
   validateClassificationInputs,
 } from "../utils/validation";
 import { accuracyScore } from "../metrics/classification";
+import { getZigKernels } from "../native/zigKernels";
 export type MaxFeaturesOption = "sqrt" | "log2" | number | null;
@@ -38,6 +39,11 @@ interface SplitPartition {
 const MAX_THRESHOLD_BINS = 128;
+function isZigTreeBackendEnabled(): boolean {
+  const mode = process.env.BUN_SCIKIT_TREE_BACKEND?.trim().toLowerCase();
+  return mode === "zig" || mode === "native";
+}
 function mulberry32(seed: number): () => number {
   let state = seed >>> 0;
   return () => {
@@ -59,6 +65,8 @@ function giniImpurity(positiveCount: number, sampleCount: number): number {
 export class DecisionTreeClassifier implements ClassificationModel {
   classes_: Vector = [0, 1];
+  fitBackend_: "zig" | "js" = "js";
+  fitBackendLibrary_: string | null = null;
   private readonly maxDepth: number;
   private readonly minSamplesSplit: number;
   private readonly minSamplesLeaf: number;
@@ -73,6 +81,7 @@ export class DecisionTreeClassifier implements ClassificationModel {
   private featureSelectionMarks: Uint8Array | null = null;
   private binTotals: Uint32Array = new Uint32Array(MAX_THRESHOLD_BINS);
   private binPositives: Uint32Array = new Uint32Array(MAX_THRESHOLD_BINS);
+  private zigModelHandle: bigint | null = null;
   constructor(options: DecisionTreeClassifierOptions = {}) {
     this.maxDepth = options.maxDepth ?? 12;
@@ -90,6 +99,8 @@ export class DecisionTreeClassifier implements ClassificationModel {
     flattenedXTrain?: Float64Array,
     yBinaryTrain?: Uint8Array,
   ): this {
+    this.destroyZigModel();
     if (!skipValidation) {
       validateClassificationInputs(X, y);
     }
@@ -103,18 +114,28 @@ export class DecisionTreeClassifier implements ClassificationModel {
     this.featureSelectionMarks = new Uint8Array(this.featureCount);
     this.random = this.randomState === undefined ? Math.random : mulberry32(this.randomState);
-    let rootIndices: number[];
+    let validatedSampleIndices: Uint32Array | null = null;
     if (sampleIndices) {
       if (sampleIndices.length === 0) {
         throw new Error("sampleIndices must not be empty.");
       }
+      validatedSampleIndices = new Uint32Array(sampleIndices.length);
       for (let i = 0; i < sampleIndices.length; i += 1) {
         const index = sampleIndices[i];
         if (!Number.isInteger(index) || index < 0 || index >= X.length) {
           throw new Error(`sampleIndices contains invalid index: ${index}.`);
         }
+        validatedSampleIndices[i] = index;
       }
-      rootIndices = Array.from(sampleIndices);
+    }
+    if (isZigTreeBackendEnabled() && this.tryFitWithZig(X.length, validatedSampleIndices)) {
+      return this;
+    }
+    let rootIndices: number[];
+    if (validatedSampleIndices) {
+      rootIndices = Array.from(validatedSampleIndices);
     } else {
       rootIndices = new Array<number>(X.length);
       for (let idx = 0; idx < X.length; idx += 1) {
@@ -123,11 +144,13 @@ export class DecisionTreeClassifier implements ClassificationModel {
     }
     this.root = this.buildTree(rootIndices, 0);
+    this.fitBackend_ = "js";
+    this.fitBackendLibrary_ = null;
     return this;
   }
   predict(X: Matrix): Vector {
-    if (!this.root || this.featureCount === 0) {
+    if ((this.root === null && this.zigModelHandle === null) || this.featureCount === 0) {
       throw new Error("DecisionTreeClassifier has not been fitted.");
     }
@@ -140,6 +163,28 @@ export class DecisionTreeClassifier implements ClassificationModel {
       );
     }
+    if (this.zigModelHandle !== null) {
+      const kernels = getZigKernels();
+      const nativePredict = kernels?.decisionTreeModelPredict;
+      if (nativePredict) {
+        const flattenedX = this.flattenTrainingMatrix(X);
+        const outLabels = new Uint8Array(X.length);
+        const status = nativePredict(
+          this.zigModelHandle,
+          flattenedX,
+          X.length,
+          this.featureCount,
+          outLabels,
+        );
+        if (status === 1) {
+          return Array.from(outLabels);
+        }
+      }
+      if (!this.root) {
+        throw new Error("Native DecisionTree predict failed and no JS fallback tree is available.");
+      }
+    }
     return X.map((sample) => this.predictOne(sample, this.root!));
   }
@@ -228,9 +273,107 @@ export class DecisionTreeClassifier implements ClassificationModel {
     if (this.maxFeatures === "log2") {
       return Math.max(1, Math.floor(Math.log2(featureCount)));
     }
+    if (!Number.isFinite(this.maxFeatures)) {
+      return featureCount;
+    }
     return Math.max(1, Math.min(featureCount, Math.floor(this.maxFeatures)));
   }
+  private resolveNativeMaxFeatures(featureCount: number): {
+    mode: 0 | 1 | 2 | 3;
+    value: number;
+  } {
+    if (this.maxFeatures === null || this.maxFeatures === undefined) {
+      return { mode: 0, value: 0 };
+    }
+    if (this.maxFeatures === "sqrt") {
+      return { mode: 1, value: 0 };
+    }
+    if (this.maxFeatures === "log2") {
+      return { mode: 2, value: 0 };
+    }
+    const value = Number.isFinite(this.maxFeatures)
+      ? Math.max(1, Math.min(featureCount, Math.floor(this.maxFeatures)))
+      : featureCount;
+    return { mode: 3, value };
+  }
+  private tryFitWithZig(
+    sampleCount: number,
+    sampleIndices: Uint32Array | null,
+  ): boolean {
+    const kernels = getZigKernels();
+    const create = kernels?.decisionTreeModelCreate;
+    const fit = kernels?.decisionTreeModelFit;
+    const destroy = kernels?.decisionTreeModelDestroy;
+    if (!create || !fit || !destroy) {
+      return false;
+    }
+    const { mode, value } = this.resolveNativeMaxFeatures(this.featureCount);
+    const useRandomState = this.randomState === undefined ? 0 : 1;
+    const randomState = this.randomState ?? 0;
+    const handle = create(
+      this.maxDepth,
+      this.minSamplesSplit,
+      this.minSamplesLeaf,
+      mode,
+      value,
+      randomState >>> 0,
+      useRandomState,
+      this.featureCount,
+    );
+    if (handle === 0n) {
+      return false;
+    }
+    let shouldDestroy = true;
+    try {
+      const emptySampleIndices = new Uint32Array(0);
+      const status = fit(
+        handle,
+        this.flattenedXTrain!,
+        this.yBinaryTrain!,
+        sampleCount,
+        this.featureCount,
+        sampleIndices ?? emptySampleIndices,
+        sampleIndices?.length ?? 0,
+      );
+      if (status !== 1) {
+        return false;
+      }
+      this.zigModelHandle = handle;
+      this.root = null;
+      this.fitBackend_ = "zig";
+      this.fitBackendLibrary_ = kernels.libraryPath;
+      shouldDestroy = false;
+      return true;
+    } catch {
+      return false;
+    } finally {
+      if (shouldDestroy) {
+        destroy(handle);
+      }
+    }
+  }
+  private destroyZigModel(): void {
+    if (this.zigModelHandle === null) {
+      return;
+    }
+    const kernels = getZigKernels();
+    const destroy = kernels?.decisionTreeModelDestroy;
+    if (destroy) {
+      try {
+        destroy(this.zigModelHandle);
+      } catch {
+        // no-op: cleanup best effort
+      }
+    }
+    this.zigModelHandle = null;
+  }
   private selectFeatureIndices(featureCount: number): number[] {
     const k = this.resolveMaxFeatures(featureCount);
     if (k >= featureCount) {

package/zig/kernels.zig CHANGED

@@ -84,6 +84,8 @@ const SplitResult = struct {
     right_indices: []usize,
 };
+const MAX_THRESHOLD_BINS: usize = 128;
 const Mulberry32 = struct {
     state: u32,
@@ -213,55 +215,55 @@ fn findBestSplitForFeature(
     if (sample_count < 2) {
         return null;
     }
+    var min_value = std.math.inf(f64);
+    var max_value = -std.math.inf(f64);
+    var total_positive: usize = 0;
+    for (indices) |sample_index| {
+        const value = x_ptr[sample_index * model.n_features + feature_index];
+        if (value < min_value) {
+            min_value = value;
+        }
+        if (value > max_value) {
+            max_value = value;
+        }
+        total_positive += y_ptr[sample_index];
+    }
-    const sorted_indices = try allocator.alloc(usize, sample_count);
-    defer allocator.free(sorted_indices);
-    @memcpy(sorted_indices, indices);
+    if (!std.math.isFinite(min_value) or !std.math.isFinite(max_value) or min_value == max_value) {
+        return null;
+    }
-    const SortContext = struct {
-        x_ptr: [*]const f64,
-        n_features: usize,
-        feature_index: usize,
-        fn lessThan(ctx: @This(), a: usize, b: usize) bool {
-            return ctx.x_ptr[a * ctx.n_features + ctx.feature_index] <
-                ctx.x_ptr[b * ctx.n_features + ctx.feature_index];
-        }
-    };
-    std.sort.heap(usize, sorted_indices, SortContext{
-        .x_ptr = x_ptr,
-        .n_features = model.n_features,
-        .feature_index = feature_index,
-    }, SortContext.lessThan);
+    const dynamic_bins = @as(usize, @intFromFloat(@floor(@sqrt(@as(f64, @floatFromInt(sample_count))))));
+    const bin_count = std.math.clamp(dynamic_bins, 16, MAX_THRESHOLD_BINS);
+    var bin_totals: [MAX_THRESHOLD_BINS]usize = [_]usize{0} ** MAX_THRESHOLD_BINS;
+    var bin_positives: [MAX_THRESHOLD_BINS]usize = [_]usize{0} ** MAX_THRESHOLD_BINS;
+    const value_range = max_value - min_value;
-    var total_positive: usize = 0;
-    for (sorted_indices) |sample_index| {
-        total_positive += y_ptr[sample_index];
+    for (indices) |sample_index| {
+        const value = x_ptr[sample_index * model.n_features + feature_index];
+        var bin_index = @as(usize, @intFromFloat(@floor(((value - min_value) / value_range) * @as(f64, @floatFromInt(bin_count)))));
+        if (bin_index >= bin_count) {
+            bin_index = bin_count - 1;
+        }
+        bin_totals[bin_index] += 1;
+        bin_positives[bin_index] += y_ptr[sample_index];
     }
     var left_count: usize = 0;
     var left_positive: usize = 0;
     var best_impurity = std.math.inf(f64);
     var best_threshold: f64 = 0.0;
-    var best_split_index: usize = 0;
     var found = false;
-    var i: usize = 1;
-    while (i < sample_count) : (i += 1) {
-        const previous_index = sorted_indices[i - 1];
-        left_count += 1;
-        left_positive += y_ptr[previous_index];
+    var bin: usize = 0;
+    while (bin + 1 < bin_count) : (bin += 1) {
+        left_count += bin_totals[bin];
+        left_positive += bin_positives[bin];
         const right_count = sample_count - left_count;
         if (left_count < model.min_samples_leaf or right_count < model.min_samples_leaf) {
             continue;
         }
-        const left_value = x_ptr[previous_index * model.n_features + feature_index];
-        const right_value = x_ptr[sorted_indices[i] * model.n_features + feature_index];
-        if (left_value == right_value) {
-            continue;
-        }
         const right_positive = total_positive - left_positive;
         const impurity =
             (@as(f64, @floatFromInt(left_count)) / @as(f64, @floatFromInt(sample_count))) *
@@ -271,8 +273,7 @@ fn findBestSplitForFeature(
         if (impurity < best_impurity) {
             best_impurity = impurity;
-            best_threshold = (left_value + right_value) / 2.0;
-            best_split_index = i;
+            best_threshold = min_value + (value_range * @as(f64, @floatFromInt(bin + 1))) / @as(f64, @floatFromInt(bin_count));
             found = true;
         }
     }
@@ -281,14 +282,36 @@ fn findBestSplitForFeature(
         return null;
     }
-    const left_indices = try allocator.alloc(usize, best_split_index);
+    var left_partition_count: usize = 0;
+    for (indices) |sample_index| {
+        const value = x_ptr[sample_index * model.n_features + feature_index];
+        if (value <= best_threshold) {
+            left_partition_count += 1;
+        }
+    }
+    const right_partition_count = sample_count - left_partition_count;
+    if (left_partition_count < model.min_samples_leaf or right_partition_count < model.min_samples_leaf) {
+        return null;
+    }
+    const left_indices = try allocator.alloc(usize, left_partition_count);
     errdefer allocator.free(left_indices);
-    const right_size = sample_count - best_split_index;
-    const right_indices = try allocator.alloc(usize, right_size);
+    const right_indices = try allocator.alloc(usize, right_partition_count);
     errdefer allocator.free(right_indices);
-    @memcpy(left_indices, sorted_indices[0..best_split_index]);
-    @memcpy(right_indices, sorted_indices[best_split_index..]);
+    var left_write: usize = 0;
+    var right_write: usize = 0;
+    for (indices) |sample_index| {
+        const value = x_ptr[sample_index * model.n_features + feature_index];
+        if (value <= best_threshold) {
+            left_indices[left_write] = sample_index;
+            left_write += 1;
+        } else {
+            right_indices[right_write] = sample_index;
+            right_write += 1;
+        }
+    }
     return SplitResult{
         .threshold = best_threshold,