npm - bun-scikit - Versions diffs - 0.1.3 → 0.1.5 - Mend

bun-scikit 0.1.3 → 0.1.5

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (22)

package/README.md +73 -137
package/package.json +3 -2
package/scripts/build-node-addon.ts +17 -1
package/scripts/check-benchmark-health.ts +112 -6
package/scripts/sync-benchmark-readme.ts +56 -0
package/src/dummy/DummyClassifier.ts +190 -0
package/src/dummy/DummyRegressor.ts +108 -0
package/src/ensemble/RandomForestClassifier.ts +154 -8
package/src/ensemble/RandomForestRegressor.ts +12 -8
package/src/feature_selection/VarianceThreshold.ts +88 -0
package/src/index.ts +23 -0
package/src/metrics/classification.ts +30 -0
package/src/metrics/regression.ts +40 -0
package/src/model_selection/RandomizedSearchCV.ts +269 -0
package/src/native/node-addon/bun_scikit_addon.cpp +307 -0
package/src/native/zigKernels.ts +122 -4
package/src/preprocessing/Binarizer.ts +46 -0
package/src/preprocessing/LabelEncoder.ts +62 -0
package/src/preprocessing/MaxAbsScaler.ts +77 -0
package/src/preprocessing/Normalizer.ts +66 -0
package/src/tree/DecisionTreeClassifier.ts +159 -4
package/zig/kernels.zig +333 -89

package/zig/kernels.zig (changed)

@@ -74,16 +74,37 @@ const DecisionTreeModel = struct {
     use_random_state: bool,
     root_index: usize,
     has_root: bool,
+    feature_scratch: []usize,
     nodes: std.ArrayListUnmanaged(TreeNode),
 };
-const SplitResult = struct {
+const RandomForestClassifierModel = struct {
+    n_features: usize,
+    n_estimators: usize,
+    max_depth: usize,
+    min_samples_split: usize,
+    min_samples_leaf: usize,
+    max_features_mode: u8,
+    max_features_value: usize,
+    bootstrap: bool,
+    random_state: u32,
+    use_random_state: bool,
+    tree_handles: []usize,
+    fitted_estimators: usize,
+};
+const SplitEvaluation = struct {
     threshold: f64,
     impurity: f64,
+};
+const SplitPartition = struct {
     left_indices: []usize,
     right_indices: []usize,
 };
+const MAX_THRESHOLD_BINS: usize = 128;
 const Mulberry32 = struct {
     state: u32,
@@ -165,41 +186,33 @@ fn resolveMaxFeatures(model: *const DecisionTreeModel) usize {
     }
 }
-fn freeSplit(split: SplitResult) void {
-    allocator.free(split.left_indices);
-    allocator.free(split.right_indices);
+inline fn asRandomForestClassifierModel(handle: usize) ?*RandomForestClassifierModel {
+    if (handle == 0) {
+        return null;
+    }
+    return @as(*RandomForestClassifierModel, @ptrFromInt(handle));
 }
-fn selectCandidateFeatures(model: *const DecisionTreeModel, rng: *Mulberry32) ![]usize {
-    const k = resolveMaxFeatures(model);
-    if (k >= model.n_features) {
-        const all_features = try allocator.alloc(usize, model.n_features);
-        errdefer allocator.free(all_features);
-        for (all_features, 0..) |*entry, idx| {
-            entry.* = idx;
-        }
-        return all_features;
+fn selectCandidateFeatures(model: *DecisionTreeModel, rng: *Mulberry32) []const usize {
+    for (model.feature_scratch, 0..) |*entry, idx| {
+        entry.* = idx;
     }
-    const shuffled = try allocator.alloc(usize, model.n_features);
-    errdefer allocator.free(shuffled);
-    for (shuffled, 0..) |*entry, idx| {
-        entry.* = idx;
+    const k = resolveMaxFeatures(model);
+    if (k >= model.n_features) {
+        return model.feature_scratch[0..model.n_features];
     }
-    var i = model.n_features;
-    while (i > 1) {
-        i -= 1;
-        const j = rng.nextIndex(i + 1);
-        const tmp = shuffled[i];
-        shuffled[i] = shuffled[j];
-        shuffled[j] = tmp;
+    var i: usize = 0;
+    while (i < k) : (i += 1) {
+        const remaining = model.n_features - i;
+        const j = i + rng.nextIndex(remaining);
+        const tmp = model.feature_scratch[i];
+        model.feature_scratch[i] = model.feature_scratch[j];
+        model.feature_scratch[j] = tmp;
     }
-    const selected = try allocator.alloc(usize, k);
-    @memcpy(selected, shuffled[0..k]);
-    allocator.free(shuffled);
-    return selected;
+    return model.feature_scratch[0..k];
 }
 fn findBestSplitForFeature(
@@ -208,60 +221,60 @@ fn findBestSplitForFeature(
     y_ptr: [*]const u8,
     indices: []const usize,
     feature_index: usize,
-) !?SplitResult {
+) ?SplitEvaluation {
     const sample_count = indices.len;
     if (sample_count < 2) {
         return null;
     }
+    var min_value = std.math.inf(f64);
+    var max_value = -std.math.inf(f64);
+    var total_positive: usize = 0;
+    for (indices) |sample_index| {
+        const value = x_ptr[sample_index * model.n_features + feature_index];
+        if (value < min_value) {
+            min_value = value;
+        }
+        if (value > max_value) {
+            max_value = value;
+        }
+        total_positive += y_ptr[sample_index];
+    }
-    const sorted_indices = try allocator.alloc(usize, sample_count);
-    defer allocator.free(sorted_indices);
-    @memcpy(sorted_indices, indices);
+    if (!std.math.isFinite(min_value) or !std.math.isFinite(max_value) or min_value == max_value) {
+        return null;
+    }
-    const SortContext = struct {
-        x_ptr: [*]const f64,
-        n_features: usize,
-        feature_index: usize,
-        fn lessThan(ctx: @This(), a: usize, b: usize) bool {
-            return ctx.x_ptr[a * ctx.n_features + ctx.feature_index] <
-                ctx.x_ptr[b * ctx.n_features + ctx.feature_index];
-        }
-    };
-    std.sort.heap(usize, sorted_indices, SortContext{
-        .x_ptr = x_ptr,
-        .n_features = model.n_features,
-        .feature_index = feature_index,
-    }, SortContext.lessThan);
+    const dynamic_bins = @as(usize, @intFromFloat(@floor(@sqrt(@as(f64, @floatFromInt(sample_count))))));
+    const bin_count = std.math.clamp(dynamic_bins, 16, MAX_THRESHOLD_BINS);
+    var bin_totals: [MAX_THRESHOLD_BINS]usize = [_]usize{0} ** MAX_THRESHOLD_BINS;
+    var bin_positives: [MAX_THRESHOLD_BINS]usize = [_]usize{0} ** MAX_THRESHOLD_BINS;
+    const value_range = max_value - min_value;
-    var total_positive: usize = 0;
-    for (sorted_indices) |sample_index| {
-        total_positive += y_ptr[sample_index];
+    for (indices) |sample_index| {
+        const value = x_ptr[sample_index * model.n_features + feature_index];
+        var bin_index = @as(usize, @intFromFloat(@floor(((value - min_value) / value_range) * @as(f64, @floatFromInt(bin_count)))));
+        if (bin_index >= bin_count) {
+            bin_index = bin_count - 1;
+        }
+        bin_totals[bin_index] += 1;
+        bin_positives[bin_index] += y_ptr[sample_index];
     }
     var left_count: usize = 0;
     var left_positive: usize = 0;
     var best_impurity = std.math.inf(f64);
     var best_threshold: f64 = 0.0;
-    var best_split_index: usize = 0;
     var found = false;
-    var i: usize = 1;
-    while (i < sample_count) : (i += 1) {
-        const previous_index = sorted_indices[i - 1];
-        left_count += 1;
-        left_positive += y_ptr[previous_index];
+    var bin: usize = 0;
+    while (bin + 1 < bin_count) : (bin += 1) {
+        left_count += bin_totals[bin];
+        left_positive += bin_positives[bin];
         const right_count = sample_count - left_count;
         if (left_count < model.min_samples_leaf or right_count < model.min_samples_leaf) {
             continue;
         }
-        const left_value = x_ptr[previous_index * model.n_features + feature_index];
-        const right_value = x_ptr[sorted_indices[i] * model.n_features + feature_index];
-        if (left_value == right_value) {
-            continue;
-        }
         const right_positive = total_positive - left_positive;
         const impurity =
             (@as(f64, @floatFromInt(left_count)) / @as(f64, @floatFromInt(sample_count))) *
@@ -271,8 +284,7 @@ fn findBestSplitForFeature(
         if (impurity < best_impurity) {
             best_impurity = impurity;
-            best_threshold = (left_value + right_value) / 2.0;
-            best_split_index = i;
+            best_threshold = min_value + (value_range * @as(f64, @floatFromInt(bin + 1))) / @as(f64, @floatFromInt(bin_count));
             found = true;
         }
     }
@@ -281,18 +293,50 @@ fn findBestSplitForFeature(
         return null;
     }
-    const left_indices = try allocator.alloc(usize, best_split_index);
-    errdefer allocator.free(left_indices);
-    const right_size = sample_count - best_split_index;
-    const right_indices = try allocator.alloc(usize, right_size);
-    errdefer allocator.free(right_indices);
-    @memcpy(left_indices, sorted_indices[0..best_split_index]);
-    @memcpy(right_indices, sorted_indices[best_split_index..]);
-    return SplitResult{
+    return SplitEvaluation{
         .threshold = best_threshold,
         .impurity = best_impurity,
+    };
+}
+fn partitionIndicesForThreshold(
+    model: *const DecisionTreeModel,
+    workspace: std.mem.Allocator,
+    x_ptr: [*]const f64,
+    indices: []const usize,
+    feature_index: usize,
+    threshold: f64,
+) !?SplitPartition {
+    var left_count: usize = 0;
+    for (indices) |sample_index| {
+        const value = x_ptr[sample_index * model.n_features + feature_index];
+        if (value <= threshold) {
+            left_count += 1;
+        }
+    }
+    const right_count = indices.len - left_count;
+    if (left_count < model.min_samples_leaf or right_count < model.min_samples_leaf) {
+        return null;
+    }
+    const left_indices = try workspace.alloc(usize, left_count);
+    const right_indices = try workspace.alloc(usize, right_count);
+    var left_write: usize = 0;
+    var right_write: usize = 0;
+    for (indices) |sample_index| {
+        const value = x_ptr[sample_index * model.n_features + feature_index];
+        if (value <= threshold) {
+            left_indices[left_write] = sample_index;
+            left_write += 1;
+        } else {
+            right_indices[right_write] = sample_index;
+            right_write += 1;
+        }
+    }
+    return SplitPartition{
         .left_indices = left_indices,
         .right_indices = right_indices,
     };
@@ -300,6 +344,7 @@ fn findBestSplitForFeature(
 fn buildDecisionTreeNode(
     model: *DecisionTreeModel,
+    workspace: std.mem.Allocator,
     x_ptr: [*]const f64,
     y_ptr: [*]const u8,
     indices: []const usize,
@@ -330,25 +375,19 @@ fn buildDecisionTreeNode(
     }
     const parent_impurity = giniImpurity(positive_count, sample_count);
-    const candidate_features = try selectCandidateFeatures(model, rng);
-    defer allocator.free(candidate_features);
+    const candidate_features = selectCandidateFeatures(model, rng);
     var best_feature: usize = 0;
-    var best_split: ?SplitResult = null;
+    var best_split: ?SplitEvaluation = null;
     var best_found = false;
     for (candidate_features) |feature_index| {
-        const split_opt = try findBestSplitForFeature(model, x_ptr, y_ptr, indices, feature_index);
+        const split_opt = findBestSplitForFeature(model, x_ptr, y_ptr, indices, feature_index);
         if (split_opt) |split| {
             if (!best_found or split.impurity < best_split.?.impurity) {
-                if (best_split) |previous| {
-                    freeSplit(previous);
-                }
                 best_split = split;
                 best_feature = feature_index;
                 best_found = true;
-            } else {
-                freeSplit(split);
             }
         }
     }
@@ -367,7 +406,6 @@ fn buildDecisionTreeNode(
     }
     const split = best_split.?;
-    defer freeSplit(split);
     if (split.impurity >= parent_impurity - 1e-12) {
         const node_index = model.nodes.items.len;
         try model.nodes.append(allocator, TreeNode{
@@ -381,6 +419,25 @@ fn buildDecisionTreeNode(
         return node_index;
     }
+    const partition = (try partitionIndicesForThreshold(
+        model,
+        workspace,
+        x_ptr,
+        indices,
+        best_feature,
+        split.threshold,
+    )) orelse {
+        const node_index = model.nodes.items.len;
+        try model.nodes.append(allocator, TreeNode{
+            .prediction = prediction,
+            .feature_index = 0,
+            .threshold = 0.0,
+            .left_index = 0,
+            .right_index = 0,
+            .is_leaf = true,
+        });
+        return node_index;
+    };
     const node_index = model.nodes.items.len;
     try model.nodes.append(allocator, TreeNode{
         .prediction = prediction,
@@ -393,17 +450,19 @@ fn buildDecisionTreeNode(
     const left_index = try buildDecisionTreeNode(
         model,
+        workspace,
         x_ptr,
         y_ptr,
-        split.left_indices,
+        partition.left_indices,
         depth + 1,
         rng,
     );
     const right_index = try buildDecisionTreeNode(
         model,
+        workspace,
         x_ptr,
         y_ptr,
-        split.right_indices,
+        partition.right_indices,
         depth + 1,
         rng,
     );
@@ -1113,6 +1172,11 @@ pub export fn decision_tree_model_create(
     const model = allocator.create(DecisionTreeModel) catch return 0;
     errdefer allocator.destroy(model);
+    const feature_scratch = allocator.alloc(usize, n_features) catch return 0;
+    errdefer allocator.free(feature_scratch);
+    for (feature_scratch, 0..) |*entry, idx| {
+        entry.* = idx;
+    }
     model.* = .{
         .n_features = n_features,
         .max_depth = max_depth,
@@ -1124,6 +1188,7 @@ pub export fn decision_tree_model_create(
         .use_random_state = use_random_state != 0,
         .root_index = 0,
         .has_root = false,
+        .feature_scratch = feature_scratch,
         .nodes = .empty,
     };
     return @intFromPtr(model);
@@ -1131,6 +1196,7 @@ pub export fn decision_tree_model_create(
 pub export fn decision_tree_model_destroy(handle: usize) void {
     const model = asDecisionTreeModel(handle) orelse return;
+    allocator.free(model.feature_scratch);
     model.nodes.deinit(allocator);
     allocator.destroy(model);
 }
@@ -1157,8 +1223,11 @@ pub export fn decision_tree_model_fit(
         return 0;
     }
-    const root_indices = allocator.alloc(usize, root_size) catch return 0;
-    defer allocator.free(root_indices);
+    var arena = std.heap.ArenaAllocator.init(allocator);
+    defer arena.deinit();
+    const workspace = arena.allocator();
+    const root_indices = workspace.alloc(usize, root_size) catch return 0;
     if (sample_count == 0) {
         for (root_indices, 0..) |*entry, idx| {
@@ -1179,7 +1248,7 @@ pub export fn decision_tree_model_fit(
     else
         @as(u32, @truncate(@as(u64, @bitCast(std.time.microTimestamp()))));
     var rng = Mulberry32.init(rng_seed);
-    const root_index = buildDecisionTreeNode(model, x_ptr, y_ptr, root_indices, 0, &rng) catch {
+    const root_index = buildDecisionTreeNode(model, workspace, x_ptr, y_ptr, root_indices, 0, &rng) catch {
         model.nodes.clearRetainingCapacity();
         model.has_root = false;
         return 0;
@@ -1220,6 +1289,181 @@ pub export fn decision_tree_model_predict(
     return 1;
 }
+fn resetRandomForestClassifierModel(model: *RandomForestClassifierModel) void {
+    var i: usize = 0;
+    while (i < model.fitted_estimators) : (i += 1) {
+        const tree_handle = model.tree_handles[i];
+        if (tree_handle != 0) {
+            decision_tree_model_destroy(tree_handle);
+            model.tree_handles[i] = 0;
+        }
+    }
+    model.fitted_estimators = 0;
+}
+pub export fn random_forest_classifier_model_create(
+    n_estimators: usize,
+    max_depth: usize,
+    min_samples_split: usize,
+    min_samples_leaf: usize,
+    max_features_mode: u8,
+    max_features_value: usize,
+    bootstrap: u8,
+    random_state: u32,
+    use_random_state: u8,
+    n_features: usize,
+) usize {
+    if (n_features == 0 or max_depth == 0 or n_estimators == 0) {
+        return 0;
+    }
+    const model = allocator.create(RandomForestClassifierModel) catch return 0;
+    errdefer allocator.destroy(model);
+    const tree_handles = allocator.alloc(usize, n_estimators) catch return 0;
+    errdefer allocator.free(tree_handles);
+    @memset(tree_handles, 0);
+    model.* = .{
+        .n_features = n_features,
+        .n_estimators = n_estimators,
+        .max_depth = max_depth,
+        .min_samples_split = if (min_samples_split < 2) 2 else min_samples_split,
+        .min_samples_leaf = if (min_samples_leaf < 1) 1 else min_samples_leaf,
+        .max_features_mode = max_features_mode,
+        .max_features_value = max_features_value,
+        .bootstrap = bootstrap != 0,
+        .random_state = random_state,
+        .use_random_state = use_random_state != 0,
+        .tree_handles = tree_handles,
+        .fitted_estimators = 0,
+    };
+    return @intFromPtr(model);
+}
+pub export fn random_forest_classifier_model_destroy(handle: usize) void {
+    const model = asRandomForestClassifierModel(handle) orelse return;
+    resetRandomForestClassifierModel(model);
+    allocator.free(model.tree_handles);
+    allocator.destroy(model);
+}
+pub export fn random_forest_classifier_model_fit(
+    handle: usize,
+    x_ptr: [*]const f64,
+    y_ptr: [*]const u8,
+    n_samples: usize,
+    n_features: usize,
+) u8 {
+    const model = asRandomForestClassifierModel(handle) orelse return 0;
+    if (n_samples == 0 or n_features == 0 or n_features != model.n_features) {
+        return 0;
+    }
+    resetRandomForestClassifierModel(model);
+    const sample_indices = allocator.alloc(u32, n_samples) catch return 0;
+    defer allocator.free(sample_indices);
+    const rng_seed: u32 = if (model.use_random_state)
+        model.random_state
+    else
+        @as(u32, @truncate(@as(u64, @bitCast(std.time.microTimestamp()))));
+    var rng = Mulberry32.init(rng_seed);
+    var estimator_index: usize = 0;
+    while (estimator_index < model.n_estimators) : (estimator_index += 1) {
+        const tree_seed: u32 = if (model.use_random_state)
+            model.random_state +% @as(u32, @truncate(estimator_index + 1))
+        else
+            rng.state +% @as(u32, @truncate(estimator_index + 1));
+        const tree_handle = decision_tree_model_create(
+            model.max_depth,
+            model.min_samples_split,
+            model.min_samples_leaf,
+            model.max_features_mode,
+            model.max_features_value,
+            tree_seed,
+            if (model.use_random_state) 1 else 0,
+            model.n_features,
+        );
+        if (tree_handle == 0) {
+            resetRandomForestClassifierModel(model);
+            return 0;
+        }
+        if (model.bootstrap) {
+            var i: usize = 0;
+            while (i < n_samples) : (i += 1) {
+                sample_indices[i] = @as(u32, @truncate(rng.nextIndex(n_samples)));
+            }
+        } else {
+            for (sample_indices, 0..) |*entry, idx| {
+                entry.* = @as(u32, @truncate(idx));
+            }
+        }
+        const fit_status = decision_tree_model_fit(
+            tree_handle,
+            x_ptr,
+            y_ptr,
+            n_samples,
+            n_features,
+            sample_indices.ptr,
+            n_samples,
+        );
+        if (fit_status != 1) {
+            decision_tree_model_destroy(tree_handle);
+            resetRandomForestClassifierModel(model);
+            return 0;
+        }
+        model.tree_handles[estimator_index] = tree_handle;
+        model.fitted_estimators = estimator_index + 1;
+    }
+    return 1;
+}
+pub export fn random_forest_classifier_model_predict(
+    handle: usize,
+    x_ptr: [*]const f64,
+    n_samples: usize,
+    n_features: usize,
+    out_labels_ptr: [*]u8,
+) u8 {
+    const model = asRandomForestClassifierModel(handle) orelse return 0;
+    if (model.fitted_estimators == 0 or n_samples == 0 or n_features != model.n_features) {
+        return 0;
+    }
+    var i: usize = 0;
+    while (i < n_samples) : (i += 1) {
+        const row_offset = i * model.n_features;
+        var positive_votes: usize = 0;
+        var tree_index: usize = 0;
+        while (tree_index < model.fitted_estimators) : (tree_index += 1) {
+            const tree = asDecisionTreeModel(model.tree_handles[tree_index]) orelse continue;
+            if (!tree.has_root) {
+                continue;
+            }
+            var node_index = tree.root_index;
+            while (true) {
+                const node = tree.nodes.items[node_index];
+                if (node.is_leaf) {
+                    positive_votes += if (node.prediction == 1) 1 else 0;
+                    break;
+                }
+                const value = x_ptr[row_offset + node.feature_index];
+                node_index = if (value <= node.threshold) node.left_index else node.right_index;
+            }
+        }
+        out_labels_ptr[i] = if (positive_votes * 2 >= model.fitted_estimators) 1 else 0;
+    }
+    return 1;
+}
 pub export fn logistic_train_epoch(
     x_ptr: [*]const f64,
     y_ptr: [*]const f64,