bun-scikit 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +73 -137
- package/package.json +3 -2
- package/scripts/build-node-addon.ts +17 -1
- package/scripts/check-benchmark-health.ts +112 -6
- package/scripts/sync-benchmark-readme.ts +56 -0
- package/src/dummy/DummyClassifier.ts +190 -0
- package/src/dummy/DummyRegressor.ts +108 -0
- package/src/ensemble/RandomForestClassifier.ts +154 -8
- package/src/ensemble/RandomForestRegressor.ts +12 -8
- package/src/feature_selection/VarianceThreshold.ts +88 -0
- package/src/index.ts +23 -0
- package/src/metrics/classification.ts +30 -0
- package/src/metrics/regression.ts +40 -0
- package/src/model_selection/RandomizedSearchCV.ts +269 -0
- package/src/native/node-addon/bun_scikit_addon.cpp +307 -0
- package/src/native/zigKernels.ts +122 -4
- package/src/preprocessing/Binarizer.ts +46 -0
- package/src/preprocessing/LabelEncoder.ts +62 -0
- package/src/preprocessing/MaxAbsScaler.ts +77 -0
- package/src/preprocessing/Normalizer.ts +66 -0
- package/src/tree/DecisionTreeClassifier.ts +159 -4
- package/zig/kernels.zig +333 -89
package/zig/kernels.zig
CHANGED
|
@@ -74,16 +74,37 @@ const DecisionTreeModel = struct {
|
|
|
74
74
|
use_random_state: bool,
|
|
75
75
|
root_index: usize,
|
|
76
76
|
has_root: bool,
|
|
77
|
+
feature_scratch: []usize,
|
|
77
78
|
nodes: std.ArrayListUnmanaged(TreeNode),
|
|
78
79
|
};
|
|
79
80
|
|
|
80
|
-
const
|
|
81
|
+
/// State behind a random forest classifier handle.
///
/// The hyperparameters are recorded when the forest is created and are passed
/// on to every decision tree that is built during fitting.
const RandomForestClassifierModel = struct {
    /// Number of feature columns; fit and predict reject any other width.
    n_features: usize,
    /// Number of trees to build; `tree_handles` is allocated with this length.
    n_estimators: usize,
    /// Per-tree depth limit, forwarded unchanged to each tree.
    max_depth: usize,
    /// Per-tree split threshold, forwarded unchanged to each tree.
    min_samples_split: usize,
    /// Per-tree leaf-size threshold, forwarded unchanged to each tree.
    min_samples_leaf: usize,
    /// Feature-subsampling mode code, forwarded unchanged to each tree.
    max_features_mode: u8,
    /// Feature-subsampling value, forwarded unchanged to each tree.
    max_features_value: usize,
    /// When true, every tree is fitted on indices drawn with replacement.
    bootstrap: bool,
    /// Seed used when `use_random_state` is set.
    random_state: u32,
    /// Selects between `random_state` and a clock-derived seed.
    use_random_state: bool,
    /// Handles of the owned decision trees; unused slots hold 0.
    tree_handles: []usize,
    /// Count of leading `tree_handles` entries that currently own a tree.
    fitted_estimators: usize,
};
|
|
95
|
+
|
|
96
|
+
/// Score of the best split found for a single feature.
/// Carries no sample indices; those are produced separately once a split
/// has been chosen.
const SplitEvaluation = struct {
    /// Feature value at which samples are divided.
    threshold: f64,
    /// Impurity score of the split; candidates are compared with `<`, so
    /// lower is better.
    impurity: f64,
};
|
|
100
|
+
|
|
101
|
+
/// Sample indices of a node divided by a chosen threshold.
const SplitPartition = struct {
    /// Samples whose feature value is `<=` the threshold.
    left_indices: []usize,
    /// Samples whose feature value is above the threshold.
    right_indices: []usize,
};
|
|
86
105
|
|
|
106
|
+
const MAX_THRESHOLD_BINS: usize = 128;
|
|
107
|
+
|
|
87
108
|
const Mulberry32 = struct {
|
|
88
109
|
state: u32,
|
|
89
110
|
|
|
@@ -165,41 +186,33 @@ fn resolveMaxFeatures(model: *const DecisionTreeModel) usize {
|
|
|
165
186
|
}
|
|
166
187
|
}
|
|
167
188
|
|
|
168
|
-
fn
|
|
169
|
-
|
|
170
|
-
|
|
189
|
+
/// Turns an opaque handle back into a forest pointer.
/// A handle of 0 yields null; any other value is reinterpreted as an address
/// without further validation.
inline fn asRandomForestClassifierModel(handle: usize) ?*RandomForestClassifierModel {
    return if (handle == 0)
        null
    else
        @as(*RandomForestClassifierModel, @ptrFromInt(handle));
}
|
|
172
195
|
|
|
173
|
-
fn selectCandidateFeatures(model: *
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
const all_features = try allocator.alloc(usize, model.n_features);
|
|
177
|
-
errdefer allocator.free(all_features);
|
|
178
|
-
for (all_features, 0..) |*entry, idx| {
|
|
179
|
-
entry.* = idx;
|
|
180
|
-
}
|
|
181
|
-
return all_features;
|
|
196
|
+
fn selectCandidateFeatures(model: *DecisionTreeModel, rng: *Mulberry32) []const usize {
|
|
197
|
+
for (model.feature_scratch, 0..) |*entry, idx| {
|
|
198
|
+
entry.* = idx;
|
|
182
199
|
}
|
|
183
200
|
|
|
184
|
-
const
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
entry.* = idx;
|
|
201
|
+
const k = resolveMaxFeatures(model);
|
|
202
|
+
if (k >= model.n_features) {
|
|
203
|
+
return model.feature_scratch[0..model.n_features];
|
|
188
204
|
}
|
|
189
205
|
|
|
190
|
-
var i =
|
|
191
|
-
while (i
|
|
192
|
-
|
|
193
|
-
const j = rng.nextIndex(
|
|
194
|
-
const tmp =
|
|
195
|
-
|
|
196
|
-
|
|
206
|
+
var i: usize = 0;
|
|
207
|
+
while (i < k) : (i += 1) {
|
|
208
|
+
const remaining = model.n_features - i;
|
|
209
|
+
const j = i + rng.nextIndex(remaining);
|
|
210
|
+
const tmp = model.feature_scratch[i];
|
|
211
|
+
model.feature_scratch[i] = model.feature_scratch[j];
|
|
212
|
+
model.feature_scratch[j] = tmp;
|
|
197
213
|
}
|
|
198
214
|
|
|
199
|
-
|
|
200
|
-
@memcpy(selected, shuffled[0..k]);
|
|
201
|
-
allocator.free(shuffled);
|
|
202
|
-
return selected;
|
|
215
|
+
return model.feature_scratch[0..k];
|
|
203
216
|
}
|
|
204
217
|
|
|
205
218
|
fn findBestSplitForFeature(
|
|
@@ -208,60 +221,60 @@ fn findBestSplitForFeature(
|
|
|
208
221
|
y_ptr: [*]const u8,
|
|
209
222
|
indices: []const usize,
|
|
210
223
|
feature_index: usize,
|
|
211
|
-
)
|
|
224
|
+
) ?SplitEvaluation {
|
|
212
225
|
const sample_count = indices.len;
|
|
213
226
|
if (sample_count < 2) {
|
|
214
227
|
return null;
|
|
215
228
|
}
|
|
229
|
+
var min_value = std.math.inf(f64);
|
|
230
|
+
var max_value = -std.math.inf(f64);
|
|
231
|
+
var total_positive: usize = 0;
|
|
232
|
+
for (indices) |sample_index| {
|
|
233
|
+
const value = x_ptr[sample_index * model.n_features + feature_index];
|
|
234
|
+
if (value < min_value) {
|
|
235
|
+
min_value = value;
|
|
236
|
+
}
|
|
237
|
+
if (value > max_value) {
|
|
238
|
+
max_value = value;
|
|
239
|
+
}
|
|
240
|
+
total_positive += y_ptr[sample_index];
|
|
241
|
+
}
|
|
216
242
|
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
243
|
+
if (!std.math.isFinite(min_value) or !std.math.isFinite(max_value) or min_value == max_value) {
|
|
244
|
+
return null;
|
|
245
|
+
}
|
|
220
246
|
|
|
221
|
-
const
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
return ctx.x_ptr[a * ctx.n_features + ctx.feature_index] <
|
|
227
|
-
ctx.x_ptr[b * ctx.n_features + ctx.feature_index];
|
|
228
|
-
}
|
|
229
|
-
};
|
|
230
|
-
std.sort.heap(usize, sorted_indices, SortContext{
|
|
231
|
-
.x_ptr = x_ptr,
|
|
232
|
-
.n_features = model.n_features,
|
|
233
|
-
.feature_index = feature_index,
|
|
234
|
-
}, SortContext.lessThan);
|
|
247
|
+
const dynamic_bins = @as(usize, @intFromFloat(@floor(@sqrt(@as(f64, @floatFromInt(sample_count))))));
|
|
248
|
+
const bin_count = std.math.clamp(dynamic_bins, 16, MAX_THRESHOLD_BINS);
|
|
249
|
+
var bin_totals: [MAX_THRESHOLD_BINS]usize = [_]usize{0} ** MAX_THRESHOLD_BINS;
|
|
250
|
+
var bin_positives: [MAX_THRESHOLD_BINS]usize = [_]usize{0} ** MAX_THRESHOLD_BINS;
|
|
251
|
+
const value_range = max_value - min_value;
|
|
235
252
|
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
253
|
+
for (indices) |sample_index| {
|
|
254
|
+
const value = x_ptr[sample_index * model.n_features + feature_index];
|
|
255
|
+
var bin_index = @as(usize, @intFromFloat(@floor(((value - min_value) / value_range) * @as(f64, @floatFromInt(bin_count)))));
|
|
256
|
+
if (bin_index >= bin_count) {
|
|
257
|
+
bin_index = bin_count - 1;
|
|
258
|
+
}
|
|
259
|
+
bin_totals[bin_index] += 1;
|
|
260
|
+
bin_positives[bin_index] += y_ptr[sample_index];
|
|
239
261
|
}
|
|
240
262
|
|
|
241
263
|
var left_count: usize = 0;
|
|
242
264
|
var left_positive: usize = 0;
|
|
243
265
|
var best_impurity = std.math.inf(f64);
|
|
244
266
|
var best_threshold: f64 = 0.0;
|
|
245
|
-
var best_split_index: usize = 0;
|
|
246
267
|
var found = false;
|
|
247
268
|
|
|
248
|
-
var
|
|
249
|
-
while (
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
left_positive += y_ptr[previous_index];
|
|
269
|
+
var bin: usize = 0;
|
|
270
|
+
while (bin + 1 < bin_count) : (bin += 1) {
|
|
271
|
+
left_count += bin_totals[bin];
|
|
272
|
+
left_positive += bin_positives[bin];
|
|
253
273
|
const right_count = sample_count - left_count;
|
|
254
|
-
|
|
255
274
|
if (left_count < model.min_samples_leaf or right_count < model.min_samples_leaf) {
|
|
256
275
|
continue;
|
|
257
276
|
}
|
|
258
277
|
|
|
259
|
-
const left_value = x_ptr[previous_index * model.n_features + feature_index];
|
|
260
|
-
const right_value = x_ptr[sorted_indices[i] * model.n_features + feature_index];
|
|
261
|
-
if (left_value == right_value) {
|
|
262
|
-
continue;
|
|
263
|
-
}
|
|
264
|
-
|
|
265
278
|
const right_positive = total_positive - left_positive;
|
|
266
279
|
const impurity =
|
|
267
280
|
(@as(f64, @floatFromInt(left_count)) / @as(f64, @floatFromInt(sample_count))) *
|
|
@@ -271,8 +284,7 @@ fn findBestSplitForFeature(
|
|
|
271
284
|
|
|
272
285
|
if (impurity < best_impurity) {
|
|
273
286
|
best_impurity = impurity;
|
|
274
|
-
best_threshold = (
|
|
275
|
-
best_split_index = i;
|
|
287
|
+
best_threshold = min_value + (value_range * @as(f64, @floatFromInt(bin + 1))) / @as(f64, @floatFromInt(bin_count));
|
|
276
288
|
found = true;
|
|
277
289
|
}
|
|
278
290
|
}
|
|
@@ -281,18 +293,50 @@ fn findBestSplitForFeature(
|
|
|
281
293
|
return null;
|
|
282
294
|
}
|
|
283
295
|
|
|
284
|
-
|
|
285
|
-
errdefer allocator.free(left_indices);
|
|
286
|
-
const right_size = sample_count - best_split_index;
|
|
287
|
-
const right_indices = try allocator.alloc(usize, right_size);
|
|
288
|
-
errdefer allocator.free(right_indices);
|
|
289
|
-
|
|
290
|
-
@memcpy(left_indices, sorted_indices[0..best_split_index]);
|
|
291
|
-
@memcpy(right_indices, sorted_indices[best_split_index..]);
|
|
292
|
-
|
|
293
|
-
return SplitResult{
|
|
296
|
+
return SplitEvaluation{
|
|
294
297
|
.threshold = best_threshold,
|
|
295
298
|
.impurity = best_impurity,
|
|
299
|
+
};
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
fn partitionIndicesForThreshold(
|
|
303
|
+
model: *const DecisionTreeModel,
|
|
304
|
+
workspace: std.mem.Allocator,
|
|
305
|
+
x_ptr: [*]const f64,
|
|
306
|
+
indices: []const usize,
|
|
307
|
+
feature_index: usize,
|
|
308
|
+
threshold: f64,
|
|
309
|
+
) !?SplitPartition {
|
|
310
|
+
var left_count: usize = 0;
|
|
311
|
+
for (indices) |sample_index| {
|
|
312
|
+
const value = x_ptr[sample_index * model.n_features + feature_index];
|
|
313
|
+
if (value <= threshold) {
|
|
314
|
+
left_count += 1;
|
|
315
|
+
}
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
const right_count = indices.len - left_count;
|
|
319
|
+
if (left_count < model.min_samples_leaf or right_count < model.min_samples_leaf) {
|
|
320
|
+
return null;
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
const left_indices = try workspace.alloc(usize, left_count);
|
|
324
|
+
const right_indices = try workspace.alloc(usize, right_count);
|
|
325
|
+
|
|
326
|
+
var left_write: usize = 0;
|
|
327
|
+
var right_write: usize = 0;
|
|
328
|
+
for (indices) |sample_index| {
|
|
329
|
+
const value = x_ptr[sample_index * model.n_features + feature_index];
|
|
330
|
+
if (value <= threshold) {
|
|
331
|
+
left_indices[left_write] = sample_index;
|
|
332
|
+
left_write += 1;
|
|
333
|
+
} else {
|
|
334
|
+
right_indices[right_write] = sample_index;
|
|
335
|
+
right_write += 1;
|
|
336
|
+
}
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
return SplitPartition{
|
|
296
340
|
.left_indices = left_indices,
|
|
297
341
|
.right_indices = right_indices,
|
|
298
342
|
};
|
|
@@ -300,6 +344,7 @@ fn findBestSplitForFeature(
|
|
|
300
344
|
|
|
301
345
|
fn buildDecisionTreeNode(
|
|
302
346
|
model: *DecisionTreeModel,
|
|
347
|
+
workspace: std.mem.Allocator,
|
|
303
348
|
x_ptr: [*]const f64,
|
|
304
349
|
y_ptr: [*]const u8,
|
|
305
350
|
indices: []const usize,
|
|
@@ -330,25 +375,19 @@ fn buildDecisionTreeNode(
|
|
|
330
375
|
}
|
|
331
376
|
|
|
332
377
|
const parent_impurity = giniImpurity(positive_count, sample_count);
|
|
333
|
-
const candidate_features =
|
|
334
|
-
defer allocator.free(candidate_features);
|
|
378
|
+
const candidate_features = selectCandidateFeatures(model, rng);
|
|
335
379
|
|
|
336
380
|
var best_feature: usize = 0;
|
|
337
|
-
var best_split: ?
|
|
381
|
+
var best_split: ?SplitEvaluation = null;
|
|
338
382
|
var best_found = false;
|
|
339
383
|
|
|
340
384
|
for (candidate_features) |feature_index| {
|
|
341
|
-
const split_opt =
|
|
385
|
+
const split_opt = findBestSplitForFeature(model, x_ptr, y_ptr, indices, feature_index);
|
|
342
386
|
if (split_opt) |split| {
|
|
343
387
|
if (!best_found or split.impurity < best_split.?.impurity) {
|
|
344
|
-
if (best_split) |previous| {
|
|
345
|
-
freeSplit(previous);
|
|
346
|
-
}
|
|
347
388
|
best_split = split;
|
|
348
389
|
best_feature = feature_index;
|
|
349
390
|
best_found = true;
|
|
350
|
-
} else {
|
|
351
|
-
freeSplit(split);
|
|
352
391
|
}
|
|
353
392
|
}
|
|
354
393
|
}
|
|
@@ -367,7 +406,6 @@ fn buildDecisionTreeNode(
|
|
|
367
406
|
}
|
|
368
407
|
|
|
369
408
|
const split = best_split.?;
|
|
370
|
-
defer freeSplit(split);
|
|
371
409
|
if (split.impurity >= parent_impurity - 1e-12) {
|
|
372
410
|
const node_index = model.nodes.items.len;
|
|
373
411
|
try model.nodes.append(allocator, TreeNode{
|
|
@@ -381,6 +419,25 @@ fn buildDecisionTreeNode(
|
|
|
381
419
|
return node_index;
|
|
382
420
|
}
|
|
383
421
|
|
|
422
|
+
const partition = (try partitionIndicesForThreshold(
|
|
423
|
+
model,
|
|
424
|
+
workspace,
|
|
425
|
+
x_ptr,
|
|
426
|
+
indices,
|
|
427
|
+
best_feature,
|
|
428
|
+
split.threshold,
|
|
429
|
+
)) orelse {
|
|
430
|
+
const node_index = model.nodes.items.len;
|
|
431
|
+
try model.nodes.append(allocator, TreeNode{
|
|
432
|
+
.prediction = prediction,
|
|
433
|
+
.feature_index = 0,
|
|
434
|
+
.threshold = 0.0,
|
|
435
|
+
.left_index = 0,
|
|
436
|
+
.right_index = 0,
|
|
437
|
+
.is_leaf = true,
|
|
438
|
+
});
|
|
439
|
+
return node_index;
|
|
440
|
+
};
|
|
384
441
|
const node_index = model.nodes.items.len;
|
|
385
442
|
try model.nodes.append(allocator, TreeNode{
|
|
386
443
|
.prediction = prediction,
|
|
@@ -393,17 +450,19 @@ fn buildDecisionTreeNode(
|
|
|
393
450
|
|
|
394
451
|
const left_index = try buildDecisionTreeNode(
|
|
395
452
|
model,
|
|
453
|
+
workspace,
|
|
396
454
|
x_ptr,
|
|
397
455
|
y_ptr,
|
|
398
|
-
|
|
456
|
+
partition.left_indices,
|
|
399
457
|
depth + 1,
|
|
400
458
|
rng,
|
|
401
459
|
);
|
|
402
460
|
const right_index = try buildDecisionTreeNode(
|
|
403
461
|
model,
|
|
462
|
+
workspace,
|
|
404
463
|
x_ptr,
|
|
405
464
|
y_ptr,
|
|
406
|
-
|
|
465
|
+
partition.right_indices,
|
|
407
466
|
depth + 1,
|
|
408
467
|
rng,
|
|
409
468
|
);
|
|
@@ -1113,6 +1172,11 @@ pub export fn decision_tree_model_create(
|
|
|
1113
1172
|
|
|
1114
1173
|
const model = allocator.create(DecisionTreeModel) catch return 0;
|
|
1115
1174
|
errdefer allocator.destroy(model);
|
|
1175
|
+
const feature_scratch = allocator.alloc(usize, n_features) catch return 0;
|
|
1176
|
+
errdefer allocator.free(feature_scratch);
|
|
1177
|
+
for (feature_scratch, 0..) |*entry, idx| {
|
|
1178
|
+
entry.* = idx;
|
|
1179
|
+
}
|
|
1116
1180
|
model.* = .{
|
|
1117
1181
|
.n_features = n_features,
|
|
1118
1182
|
.max_depth = max_depth,
|
|
@@ -1124,6 +1188,7 @@ pub export fn decision_tree_model_create(
|
|
|
1124
1188
|
.use_random_state = use_random_state != 0,
|
|
1125
1189
|
.root_index = 0,
|
|
1126
1190
|
.has_root = false,
|
|
1191
|
+
.feature_scratch = feature_scratch,
|
|
1127
1192
|
.nodes = .empty,
|
|
1128
1193
|
};
|
|
1129
1194
|
return @intFromPtr(model);
|
|
@@ -1131,6 +1196,7 @@ pub export fn decision_tree_model_create(
|
|
|
1131
1196
|
|
|
1132
1197
|
/// Frees a decision tree and everything it owns.
/// A handle that does not resolve to a model (such as 0) is ignored.
pub export fn decision_tree_model_destroy(handle: usize) void {
    if (asDecisionTreeModel(handle)) |tree| {
        // Owned buffers go first, then the model allocation itself.
        allocator.free(tree.feature_scratch);
        tree.nodes.deinit(allocator);
        allocator.destroy(tree);
    }
}
|
|
@@ -1157,8 +1223,11 @@ pub export fn decision_tree_model_fit(
|
|
|
1157
1223
|
return 0;
|
|
1158
1224
|
}
|
|
1159
1225
|
|
|
1160
|
-
|
|
1161
|
-
defer
|
|
1226
|
+
var arena = std.heap.ArenaAllocator.init(allocator);
|
|
1227
|
+
defer arena.deinit();
|
|
1228
|
+
const workspace = arena.allocator();
|
|
1229
|
+
|
|
1230
|
+
const root_indices = workspace.alloc(usize, root_size) catch return 0;
|
|
1162
1231
|
|
|
1163
1232
|
if (sample_count == 0) {
|
|
1164
1233
|
for (root_indices, 0..) |*entry, idx| {
|
|
@@ -1179,7 +1248,7 @@ pub export fn decision_tree_model_fit(
|
|
|
1179
1248
|
else
|
|
1180
1249
|
@as(u32, @truncate(@as(u64, @bitCast(std.time.microTimestamp()))));
|
|
1181
1250
|
var rng = Mulberry32.init(rng_seed);
|
|
1182
|
-
const root_index = buildDecisionTreeNode(model, x_ptr, y_ptr, root_indices, 0, &rng) catch {
|
|
1251
|
+
const root_index = buildDecisionTreeNode(model, workspace, x_ptr, y_ptr, root_indices, 0, &rng) catch {
|
|
1183
1252
|
model.nodes.clearRetainingCapacity();
|
|
1184
1253
|
model.has_root = false;
|
|
1185
1254
|
return 0;
|
|
@@ -1220,6 +1289,181 @@ pub export fn decision_tree_model_predict(
|
|
|
1220
1289
|
return 1;
|
|
1221
1290
|
}
|
|
1222
1291
|
|
|
1292
|
+
/// Destroys every tree the forest currently owns and marks it as unfitted.
/// The `tree_handles` buffer itself is kept so the forest can be fitted again.
fn resetRandomForestClassifierModel(model: *RandomForestClassifierModel) void {
    for (model.tree_handles[0..model.fitted_estimators]) |*slot| {
        if (slot.* == 0) continue;
        decision_tree_model_destroy(slot.*);
        // Clear the slot so a released handle is never destroyed twice.
        slot.* = 0;
    }
    model.fitted_estimators = 0;
}
|
|
1303
|
+
|
|
1304
|
+
/// Allocates a random forest classifier and returns it as an opaque handle.
///
/// Returns 0 when `n_features`, `max_depth` or `n_estimators` is zero, or when
/// an allocation fails. `min_samples_split` is raised to at least 2 and
/// `min_samples_leaf` to at least 1. `bootstrap` and `use_random_state` are
/// interpreted as booleans (non-zero means true). No trees are built here;
/// every entry of `tree_handles` starts as 0.
///
/// The handle must be released with `random_forest_classifier_model_destroy`.
pub export fn random_forest_classifier_model_create(
    n_estimators: usize,
    max_depth: usize,
    min_samples_split: usize,
    min_samples_leaf: usize,
    max_features_mode: u8,
    max_features_value: usize,
    bootstrap: u8,
    random_state: u32,
    use_random_state: u8,
    n_features: usize,
) usize {
    if (n_features == 0 or max_depth == 0 or n_estimators == 0) {
        return 0;
    }

    const model = allocator.create(RandomForestClassifierModel) catch return 0;
    // Failure is reported by returning 0, not an error, so `errdefer` would
    // never run on this path; release the model explicitly instead.
    const tree_handles = allocator.alloc(usize, n_estimators) catch {
        allocator.destroy(model);
        return 0;
    };
    @memset(tree_handles, 0);

    model.* = .{
        .n_features = n_features,
        .n_estimators = n_estimators,
        .max_depth = max_depth,
        .min_samples_split = @max(min_samples_split, 2),
        .min_samples_leaf = @max(min_samples_leaf, 1),
        .max_features_mode = max_features_mode,
        .max_features_value = max_features_value,
        .bootstrap = bootstrap != 0,
        .random_state = random_state,
        .use_random_state = use_random_state != 0,
        .tree_handles = tree_handles,
        .fitted_estimators = 0,
    };
    return @intFromPtr(model);
}
|
|
1342
|
+
|
|
1343
|
+
/// Frees a forest together with every tree it owns.
/// A handle of 0 is ignored.
pub export fn random_forest_classifier_model_destroy(handle: usize) void {
    if (asRandomForestClassifierModel(handle)) |forest| {
        // Trees first, then the handle table, then the forest itself.
        resetRandomForestClassifierModel(forest);
        allocator.free(forest.tree_handles);
        allocator.destroy(forest);
    }
}
|
|
1349
|
+
|
|
1350
|
+
/// Fits the forest by building `n_estimators` decision trees.
///
/// `x_ptr` and `y_ptr` are passed through unchanged to each tree's fit call
/// together with a per-tree list of sample indices. With `bootstrap` set the
/// indices are drawn with replacement from `0..n_samples`; otherwise every
/// tree receives the identity list.
///
/// Returns 1 on success. Returns 0 when the handle is 0, when `n_samples` or
/// `n_features` is zero, when `n_features` differs from the value given at
/// creation, when `n_samples` does not fit in a `u32`, or when an allocation
/// or any tree fit fails. Invalid arguments leave a previously fitted forest
/// untouched; once fitting starts, previously fitted trees are discarded and
/// a later failure leaves the forest unfitted.
pub export fn random_forest_classifier_model_fit(
    handle: usize,
    x_ptr: [*]const f64,
    y_ptr: [*]const u8,
    n_samples: usize,
    n_features: usize,
) u8 {
    const model = asRandomForestClassifierModel(handle) orelse return 0;
    if (n_samples == 0 or n_features == 0 or n_features != model.n_features) {
        return 0;
    }
    // Sample indices are handed to the trees as u32. Reject inputs whose
    // indices would not fit instead of silently wrapping them.
    if (n_samples > std.math.maxInt(u32)) {
        return 0;
    }

    resetRandomForestClassifierModel(model);

    const sample_indices = allocator.alloc(u32, n_samples) catch return 0;
    defer allocator.free(sample_indices);

    const rng_seed: u32 = if (model.use_random_state)
        model.random_state
    else
        @as(u32, @truncate(@as(u64, @bitCast(std.time.microTimestamp()))));
    var rng = Mulberry32.init(rng_seed);

    var estimator_index: usize = 0;
    while (estimator_index < model.n_estimators) : (estimator_index += 1) {
        // Offset the base seed by the tree's position so no two trees share one.
        const seed_base: u32 = if (model.use_random_state) model.random_state else rng.state;
        const tree_seed: u32 = seed_base +% @as(u32, @truncate(estimator_index + 1));

        // Always ask the tree to use `tree_seed`. Without a user seed the
        // forest RNG is already clock-seeded, so results stay non-deterministic,
        // but the trees no longer each fall back to reading the clock, which
        // could hand identical seeds to trees built in quick succession.
        const tree_handle = decision_tree_model_create(
            model.max_depth,
            model.min_samples_split,
            model.min_samples_leaf,
            model.max_features_mode,
            model.max_features_value,
            tree_seed,
            1,
            model.n_features,
        );
        if (tree_handle == 0) {
            resetRandomForestClassifierModel(model);
            return 0;
        }

        if (model.bootstrap) {
            for (sample_indices) |*entry| {
                entry.* = @as(u32, @intCast(rng.nextIndex(n_samples)));
            }
        } else {
            for (sample_indices, 0..) |*entry, idx| {
                entry.* = @as(u32, @intCast(idx));
            }
        }

        const fit_status = decision_tree_model_fit(
            tree_handle,
            x_ptr,
            y_ptr,
            n_samples,
            n_features,
            sample_indices.ptr,
            n_samples,
        );
        if (fit_status != 1) {
            // This tree is not yet recorded in the forest, so release it here.
            decision_tree_model_destroy(tree_handle);
            resetRandomForestClassifierModel(model);
            return 0;
        }

        model.tree_handles[estimator_index] = tree_handle;
        model.fitted_estimators = estimator_index + 1;
    }

    return 1;
}
|
|
1426
|
+
|
|
1427
|
+
/// Predicts a label for each row by majority vote over the fitted trees.
///
/// Row `i` is read from `x_ptr[i * n_features ..]`. Each fitted tree is walked
/// from its root, going left when the feature value is `<=` the node
/// threshold, and a leaf whose prediction is 1 counts as one positive vote.
/// `out_labels_ptr[i]` is set to 1 when the positive votes are at least half
/// of `fitted_estimators` (ties resolve to 1), otherwise 0.
///
/// Returns 1 on success. Returns 0 when the handle is 0, the forest has no
/// fitted trees, `n_samples` is zero, or `n_features` does not match the forest.
pub export fn random_forest_classifier_model_predict(
    handle: usize,
    x_ptr: [*]const f64,
    n_samples: usize,
    n_features: usize,
    out_labels_ptr: [*]u8,
) u8 {
    const forest = asRandomForestClassifierModel(handle) orelse return 0;
    if (forest.fitted_estimators == 0) return 0;
    if (n_samples == 0) return 0;
    if (n_features != forest.n_features) return 0;

    for (0..n_samples) |row| {
        const base = row * forest.n_features;
        var votes: usize = 0;

        for (0..forest.fitted_estimators) |slot| {
            const tree = asDecisionTreeModel(forest.tree_handles[slot]) orelse continue;
            if (!tree.has_root) continue;

            // Descend until a leaf is reached.
            var node = tree.nodes.items[tree.root_index];
            while (!node.is_leaf) {
                const go_left = x_ptr[base + node.feature_index] <= node.threshold;
                node = tree.nodes.items[if (go_left) node.left_index else node.right_index];
            }
            if (node.prediction == 1) votes += 1;
        }

        // Skipped trees still count in the denominator.
        out_labels_ptr[row] = if (votes * 2 >= forest.fitted_estimators) 1 else 0;
    }

    return 1;
}
|
|
1466
|
+
|
|
1223
1467
|
pub export fn logistic_train_epoch(
|
|
1224
1468
|
x_ptr: [*]const f64,
|
|
1225
1469
|
y_ptr: [*]const f64,
|