bun-scikit 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +73 -137
- package/package.json +2 -2
- package/scripts/check-benchmark-health.ts +62 -1
- package/scripts/sync-benchmark-readme.ts +56 -0
- package/src/dummy/DummyClassifier.ts +190 -0
- package/src/dummy/DummyRegressor.ts +108 -0
- package/src/feature_selection/VarianceThreshold.ts +88 -0
- package/src/index.ts +23 -0
- package/src/metrics/classification.ts +30 -0
- package/src/metrics/regression.ts +40 -0
- package/src/model_selection/RandomizedSearchCV.ts +269 -0
- package/src/native/node-addon/bun_scikit_addon.cpp +149 -0
- package/src/native/zigKernels.ts +33 -4
- package/src/preprocessing/Binarizer.ts +46 -0
- package/src/preprocessing/LabelEncoder.ts +62 -0
- package/src/preprocessing/MaxAbsScaler.ts +77 -0
- package/src/preprocessing/Normalizer.ts +66 -0
- package/src/tree/DecisionTreeClassifier.ts +146 -3
- package/zig/kernels.zig +63 -40
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import type { Matrix, Vector } from "../types";
|
|
2
|
+
import { r2Score } from "../metrics/regression";
|
|
3
|
+
import { assertFiniteVector, validateRegressionInputs } from "../utils/validation";
|
|
4
|
+
|
|
5
|
+
/** Strategies supported by DummyRegressor for choosing the constant prediction. */
export type DummyRegressorStrategy = "mean" | "median" | "quantile" | "constant";

/** Constructor options for DummyRegressor. */
export interface DummyRegressorOptions {
  // Prediction strategy; defaults to "mean".
  strategy?: DummyRegressorStrategy;
  // Required (and must be finite) when strategy is "constant".
  constant?: number;
  // Quantile in [0, 1] used by the "quantile" strategy; defaults to 0.5.
  quantile?: number;
}
|
|
12
|
+
|
|
13
|
+
function computeMedian(values: number[]): number {
|
|
14
|
+
const sorted = [...values].sort((a, b) => a - b);
|
|
15
|
+
const mid = Math.floor(sorted.length / 2);
|
|
16
|
+
if (sorted.length % 2 === 0) {
|
|
17
|
+
return 0.5 * (sorted[mid - 1] + sorted[mid]);
|
|
18
|
+
}
|
|
19
|
+
return sorted[mid];
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
function computeQuantile(values: number[], q: number): number {
|
|
23
|
+
const sorted = [...values].sort((a, b) => a - b);
|
|
24
|
+
const pos = q * (sorted.length - 1);
|
|
25
|
+
const lo = Math.floor(pos);
|
|
26
|
+
const hi = Math.ceil(pos);
|
|
27
|
+
if (lo === hi) {
|
|
28
|
+
return sorted[lo];
|
|
29
|
+
}
|
|
30
|
+
const weight = pos - lo;
|
|
31
|
+
return sorted[lo] * (1 - weight) + sorted[hi] * weight;
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
export class DummyRegressor {
|
|
35
|
+
constant_: number | null = null;
|
|
36
|
+
|
|
37
|
+
private readonly strategy: DummyRegressorStrategy;
|
|
38
|
+
private readonly constant?: number;
|
|
39
|
+
private readonly quantile: number;
|
|
40
|
+
private nFeaturesIn_: number | null = null;
|
|
41
|
+
|
|
42
|
+
constructor(options: DummyRegressorOptions = {}) {
|
|
43
|
+
this.strategy = options.strategy ?? "mean";
|
|
44
|
+
this.constant = options.constant;
|
|
45
|
+
this.quantile = options.quantile ?? 0.5;
|
|
46
|
+
|
|
47
|
+
if (this.strategy === "constant") {
|
|
48
|
+
if (!Number.isFinite(this.constant)) {
|
|
49
|
+
throw new Error("constant strategy requires a finite constant value.");
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
if (this.strategy === "quantile") {
|
|
54
|
+
if (!Number.isFinite(this.quantile) || this.quantile < 0 || this.quantile > 1) {
|
|
55
|
+
throw new Error(`quantile must be in [0, 1]. Got ${this.quantile}.`);
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
fit(X: Matrix, y: Vector): this {
|
|
61
|
+
validateRegressionInputs(X, y);
|
|
62
|
+
this.nFeaturesIn_ = X[0].length;
|
|
63
|
+
|
|
64
|
+
switch (this.strategy) {
|
|
65
|
+
case "mean": {
|
|
66
|
+
let total = 0;
|
|
67
|
+
for (let i = 0; i < y.length; i += 1) {
|
|
68
|
+
total += y[i];
|
|
69
|
+
}
|
|
70
|
+
this.constant_ = total / y.length;
|
|
71
|
+
break;
|
|
72
|
+
}
|
|
73
|
+
case "median":
|
|
74
|
+
this.constant_ = computeMedian(y);
|
|
75
|
+
break;
|
|
76
|
+
case "quantile":
|
|
77
|
+
this.constant_ = computeQuantile(y, this.quantile);
|
|
78
|
+
break;
|
|
79
|
+
case "constant":
|
|
80
|
+
this.constant_ = this.constant!;
|
|
81
|
+
break;
|
|
82
|
+
default: {
|
|
83
|
+
const exhaustive: never = this.strategy;
|
|
84
|
+
throw new Error(`Unsupported strategy: ${exhaustive}`);
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
return this;
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
predict(X: Matrix): Vector {
|
|
92
|
+
if (this.constant_ === null || this.nFeaturesIn_ === null) {
|
|
93
|
+
throw new Error("DummyRegressor has not been fitted.");
|
|
94
|
+
}
|
|
95
|
+
if (!Array.isArray(X) || X.length === 0) {
|
|
96
|
+
throw new Error("X must be a non-empty 2D array.");
|
|
97
|
+
}
|
|
98
|
+
if (!Array.isArray(X[0]) || X[0].length !== this.nFeaturesIn_) {
|
|
99
|
+
throw new Error(`Feature size mismatch. Expected ${this.nFeaturesIn_}, got ${X[0]?.length ?? 0}.`);
|
|
100
|
+
}
|
|
101
|
+
return new Array<number>(X.length).fill(this.constant_);
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
score(X: Matrix, y: Vector): number {
|
|
105
|
+
assertFiniteVector(y);
|
|
106
|
+
return r2Score(y, this.predict(X));
|
|
107
|
+
}
|
|
108
|
+
}
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import type { Matrix } from "../types";
|
|
2
|
+
import {
|
|
3
|
+
assertConsistentRowSize,
|
|
4
|
+
assertFiniteMatrix,
|
|
5
|
+
assertNonEmptyMatrix,
|
|
6
|
+
} from "../utils/validation";
|
|
7
|
+
|
|
8
|
+
/** Constructor options for VarianceThreshold. */
export interface VarianceThresholdOptions {
  // Features whose variance is strictly greater than this value are kept.
  // Must be finite and >= 0; defaults to 0 (drop only constant features).
  threshold?: number;
}
|
|
11
|
+
|
|
12
|
+
export class VarianceThreshold {
|
|
13
|
+
variances_: number[] | null = null;
|
|
14
|
+
nFeaturesIn_: number | null = null;
|
|
15
|
+
selectedFeatureIndices_: number[] | null = null;
|
|
16
|
+
|
|
17
|
+
private readonly threshold: number;
|
|
18
|
+
|
|
19
|
+
constructor(options: VarianceThresholdOptions = {}) {
|
|
20
|
+
this.threshold = options.threshold ?? 0;
|
|
21
|
+
if (!Number.isFinite(this.threshold) || this.threshold < 0) {
|
|
22
|
+
throw new Error(`threshold must be finite and >= 0. Got ${this.threshold}.`);
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
fit(X: Matrix): this {
|
|
27
|
+
assertNonEmptyMatrix(X);
|
|
28
|
+
assertConsistentRowSize(X);
|
|
29
|
+
assertFiniteMatrix(X);
|
|
30
|
+
|
|
31
|
+
const nSamples = X.length;
|
|
32
|
+
const nFeatures = X[0].length;
|
|
33
|
+
const means = new Array<number>(nFeatures).fill(0);
|
|
34
|
+
const variances = new Array<number>(nFeatures).fill(0);
|
|
35
|
+
|
|
36
|
+
for (let i = 0; i < nSamples; i += 1) {
|
|
37
|
+
for (let j = 0; j < nFeatures; j += 1) {
|
|
38
|
+
means[j] += X[i][j];
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
for (let j = 0; j < nFeatures; j += 1) {
|
|
42
|
+
means[j] /= nSamples;
|
|
43
|
+
}
|
|
44
|
+
for (let i = 0; i < nSamples; i += 1) {
|
|
45
|
+
for (let j = 0; j < nFeatures; j += 1) {
|
|
46
|
+
const diff = X[i][j] - means[j];
|
|
47
|
+
variances[j] += diff * diff;
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
for (let j = 0; j < nFeatures; j += 1) {
|
|
51
|
+
variances[j] /= nSamples;
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
const selectedFeatureIndices: number[] = [];
|
|
55
|
+
for (let j = 0; j < nFeatures; j += 1) {
|
|
56
|
+
if (variances[j] > this.threshold) {
|
|
57
|
+
selectedFeatureIndices.push(j);
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
if (selectedFeatureIndices.length === 0) {
|
|
61
|
+
throw new Error("No feature in X meets the variance threshold.");
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
this.nFeaturesIn_ = nFeatures;
|
|
65
|
+
this.variances_ = variances;
|
|
66
|
+
this.selectedFeatureIndices_ = selectedFeatureIndices;
|
|
67
|
+
return this;
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
transform(X: Matrix): Matrix {
|
|
71
|
+
if (!this.selectedFeatureIndices_ || this.nFeaturesIn_ === null) {
|
|
72
|
+
throw new Error("VarianceThreshold has not been fitted.");
|
|
73
|
+
}
|
|
74
|
+
assertNonEmptyMatrix(X);
|
|
75
|
+
assertConsistentRowSize(X);
|
|
76
|
+
assertFiniteMatrix(X);
|
|
77
|
+
|
|
78
|
+
if (X[0].length !== this.nFeaturesIn_) {
|
|
79
|
+
throw new Error(`Feature size mismatch. Expected ${this.nFeaturesIn_}, got ${X[0].length}.`);
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
return X.map((row) => this.selectedFeatureIndices_!.map((featureIdx) => row[featureIdx]));
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
fitTransform(X: Matrix): Matrix {
|
|
86
|
+
return this.fit(X).transform(X);
|
|
87
|
+
}
|
|
88
|
+
}
|
package/src/index.ts
CHANGED
|
@@ -1,15 +1,28 @@
|
|
|
1
1
|
export * from "./types";
|
|
2
2
|
|
|
3
|
+
// Baselines
|
|
4
|
+
export * from "./dummy/DummyClassifier";
|
|
5
|
+
export * from "./dummy/DummyRegressor";
|
|
6
|
+
|
|
7
|
+
// Preprocessing
|
|
3
8
|
export * from "./preprocessing/StandardScaler";
|
|
4
9
|
export * from "./preprocessing/MinMaxScaler";
|
|
5
10
|
export * from "./preprocessing/RobustScaler";
|
|
11
|
+
export * from "./preprocessing/MaxAbsScaler";
|
|
12
|
+
export * from "./preprocessing/Normalizer";
|
|
13
|
+
export * from "./preprocessing/Binarizer";
|
|
14
|
+
export * from "./preprocessing/LabelEncoder";
|
|
6
15
|
export * from "./preprocessing/PolynomialFeatures";
|
|
7
16
|
export * from "./preprocessing/SimpleImputer";
|
|
8
17
|
export * from "./preprocessing/OneHotEncoder";
|
|
18
|
+
|
|
19
|
+
// Linear models
|
|
9
20
|
export * from "./linear_model/LinearRegression";
|
|
10
21
|
export * from "./linear_model/LogisticRegression";
|
|
11
22
|
export * from "./linear_model/SGDClassifier";
|
|
12
23
|
export * from "./linear_model/SGDRegressor";
|
|
24
|
+
|
|
25
|
+
// Other estimators
|
|
13
26
|
export * from "./neighbors/KNeighborsClassifier";
|
|
14
27
|
export * from "./naive_bayes/GaussianNB";
|
|
15
28
|
export * from "./svm/LinearSVC";
|
|
@@ -17,6 +30,8 @@ export * from "./tree/DecisionTreeClassifier";
|
|
|
17
30
|
export * from "./tree/DecisionTreeRegressor";
|
|
18
31
|
export * from "./ensemble/RandomForestClassifier";
|
|
19
32
|
export * from "./ensemble/RandomForestRegressor";
|
|
33
|
+
|
|
34
|
+
// Model selection
|
|
20
35
|
export * from "./model_selection/trainTestSplit";
|
|
21
36
|
export * from "./model_selection/KFold";
|
|
22
37
|
export * from "./model_selection/StratifiedKFold";
|
|
@@ -25,8 +40,16 @@ export * from "./model_selection/RepeatedKFold";
|
|
|
25
40
|
export * from "./model_selection/RepeatedStratifiedKFold";
|
|
26
41
|
export * from "./model_selection/crossValScore";
|
|
27
42
|
export * from "./model_selection/GridSearchCV";
|
|
43
|
+
export * from "./model_selection/RandomizedSearchCV";
|
|
44
|
+
|
|
45
|
+
// Feature selection
|
|
46
|
+
export * from "./feature_selection/VarianceThreshold";
|
|
47
|
+
|
|
48
|
+
// Composition
|
|
28
49
|
export * from "./pipeline/Pipeline";
|
|
29
50
|
export * from "./pipeline/ColumnTransformer";
|
|
30
51
|
export * from "./pipeline/FeatureUnion";
|
|
52
|
+
|
|
53
|
+
// Metrics
|
|
31
54
|
export * from "./metrics/regression";
|
|
32
55
|
export * from "./metrics/classification";
|
|
@@ -292,3 +292,33 @@ export function classificationReport(
|
|
|
292
292
|
},
|
|
293
293
|
};
|
|
294
294
|
}
|
|
295
|
+
|
|
296
|
+
export function balancedAccuracyScore(yTrue: number[], yPred: number[]): number {
|
|
297
|
+
const report = classificationReport(yTrue, yPred);
|
|
298
|
+
return report.macroAvg.recall;
|
|
299
|
+
}
|
|
300
|
+
|
|
301
|
+
export function matthewsCorrcoef(
|
|
302
|
+
yTrue: number[],
|
|
303
|
+
yPred: number[],
|
|
304
|
+
positiveLabel = 1,
|
|
305
|
+
): number {
|
|
306
|
+
const { tp, fp, fn, tn } = confusionCounts(yTrue, yPred, positiveLabel);
|
|
307
|
+
const numerator = tp * tn - fp * fn;
|
|
308
|
+
const denominator = Math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn));
|
|
309
|
+
if (denominator === 0) {
|
|
310
|
+
return 0;
|
|
311
|
+
}
|
|
312
|
+
return numerator / denominator;
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
export function brierScoreLoss(yTrue: number[], yPredProb: number[]): number {
|
|
316
|
+
validateInputs(yTrue, yPredProb);
|
|
317
|
+
validateBinaryTargets(yTrue);
|
|
318
|
+
let total = 0;
|
|
319
|
+
for (let i = 0; i < yTrue.length; i += 1) {
|
|
320
|
+
const diff = yPredProb[i] - yTrue[i];
|
|
321
|
+
total += diff * diff;
|
|
322
|
+
}
|
|
323
|
+
return total / yTrue.length;
|
|
324
|
+
}
|
|
@@ -49,3 +49,43 @@ export function r2Score(yTrue: number[], yPred: number[]): number {
|
|
|
49
49
|
|
|
50
50
|
return 1 - ssRes / ssTot;
|
|
51
51
|
}
|
|
52
|
+
|
|
53
|
+
export function meanAbsolutePercentageError(yTrue: number[], yPred: number[]): number {
|
|
54
|
+
validateInputs(yTrue, yPred);
|
|
55
|
+
let total = 0;
|
|
56
|
+
for (let i = 0; i < yTrue.length; i += 1) {
|
|
57
|
+
const denom = Math.max(Math.abs(yTrue[i]), 1e-12);
|
|
58
|
+
total += Math.abs((yTrue[i] - yPred[i]) / denom);
|
|
59
|
+
}
|
|
60
|
+
return total / yTrue.length;
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
export function explainedVarianceScore(yTrue: number[], yPred: number[]): number {
|
|
64
|
+
validateInputs(yTrue, yPred);
|
|
65
|
+
const n = yTrue.length;
|
|
66
|
+
const yTrueMean = mean(yTrue);
|
|
67
|
+
const residuals = new Array<number>(n);
|
|
68
|
+
let residualMean = 0;
|
|
69
|
+
for (let i = 0; i < n; i += 1) {
|
|
70
|
+
const r = yTrue[i] - yPred[i];
|
|
71
|
+
residuals[i] = r;
|
|
72
|
+
residualMean += r;
|
|
73
|
+
}
|
|
74
|
+
residualMean /= n;
|
|
75
|
+
|
|
76
|
+
let varTrue = 0;
|
|
77
|
+
let varResidual = 0;
|
|
78
|
+
for (let i = 0; i < n; i += 1) {
|
|
79
|
+
const centeredY = yTrue[i] - yTrueMean;
|
|
80
|
+
const centeredR = residuals[i] - residualMean;
|
|
81
|
+
varTrue += centeredY * centeredY;
|
|
82
|
+
varResidual += centeredR * centeredR;
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
varTrue /= n;
|
|
86
|
+
varResidual /= n;
|
|
87
|
+
if (varTrue === 0) {
|
|
88
|
+
return varResidual === 0 ? 1 : 0;
|
|
89
|
+
}
|
|
90
|
+
return 1 - varResidual / varTrue;
|
|
91
|
+
}
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
import type { Matrix, Vector } from "../types";
|
|
2
|
+
import { accuracyScore, f1Score, precisionScore, recallScore } from "../metrics/classification";
|
|
3
|
+
import { meanSquaredError, r2Score } from "../metrics/regression";
|
|
4
|
+
import {
|
|
5
|
+
crossValScore,
|
|
6
|
+
type BuiltInScoring,
|
|
7
|
+
type CrossValEstimator,
|
|
8
|
+
type CrossValSplitter,
|
|
9
|
+
type ScoringFn,
|
|
10
|
+
} from "./crossValScore";
|
|
11
|
+
|
|
12
|
+
/** Discrete sampling space: each parameter name maps to its candidate values. */
export type ParamDistributions = Record<string, readonly unknown[]>;

/** Options controlling the randomized search. */
export interface RandomizedSearchCVOptions {
  // Fold count or a custom splitter, forwarded to crossValScore.
  cv?: number | CrossValSplitter;
  // Built-in metric name or a custom (yTrue, yPred) => number scorer.
  scoring?: BuiltInScoring | ScoringFn;
  // Refit the best candidate on the full data after the search (default true).
  refit?: boolean;
  // "raise" (default) rethrows a failing candidate; a number records the
  // failure as that score instead.
  errorScore?: "raise" | number;
  // Number of parameter settings to sample (default 10).
  nIter?: number;
  // Seed for the deterministic sampler (default 42).
  randomState?: number;
}

/** One row of cvResults_: a sampled candidate and its cross-validation outcome. */
export interface RandomizedSearchResultRow {
  params: Record<string, unknown>;
  // Per-fold scores; collapses to [errorScore] when status is "error".
  splitScores: number[];
  meanTestScore: number;
  stdTestScore: number;
  // 1-based rank; 1 marks the best candidate.
  rank: number;
  status: "ok" | "error";
  // Present only when status is "error".
  errorMessage?: string;
}
|
|
32
|
+
|
|
33
|
+
function mean(values: number[]): number {
|
|
34
|
+
let sum = 0;
|
|
35
|
+
for (let i = 0; i < values.length; i += 1) {
|
|
36
|
+
sum += values[i];
|
|
37
|
+
}
|
|
38
|
+
return sum / values.length;
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
function std(values: number[]): number {
|
|
42
|
+
if (values.length < 2) {
|
|
43
|
+
return 0;
|
|
44
|
+
}
|
|
45
|
+
const avg = mean(values);
|
|
46
|
+
let sum = 0;
|
|
47
|
+
for (let i = 0; i < values.length; i += 1) {
|
|
48
|
+
const diff = values[i] - avg;
|
|
49
|
+
sum += diff * diff;
|
|
50
|
+
}
|
|
51
|
+
return Math.sqrt(sum / values.length);
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
function resolveBuiltInScorer(scoring: BuiltInScoring): ScoringFn {
|
|
55
|
+
switch (scoring) {
|
|
56
|
+
case "accuracy":
|
|
57
|
+
return accuracyScore;
|
|
58
|
+
case "f1":
|
|
59
|
+
return f1Score;
|
|
60
|
+
case "precision":
|
|
61
|
+
return precisionScore;
|
|
62
|
+
case "recall":
|
|
63
|
+
return recallScore;
|
|
64
|
+
case "r2":
|
|
65
|
+
return r2Score;
|
|
66
|
+
case "mean_squared_error":
|
|
67
|
+
return meanSquaredError;
|
|
68
|
+
case "neg_mean_squared_error":
|
|
69
|
+
return (yTrue, yPred) => -meanSquaredError(yTrue, yPred);
|
|
70
|
+
default: {
|
|
71
|
+
const exhaustive: never = scoring;
|
|
72
|
+
throw new Error(`Unsupported scoring metric: ${exhaustive}`);
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
function isLossMetric(scoring: BuiltInScoring | ScoringFn | undefined): boolean {
|
|
78
|
+
return scoring === "mean_squared_error";
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
class Mulberry32 {
|
|
82
|
+
private state: number;
|
|
83
|
+
|
|
84
|
+
constructor(seed: number) {
|
|
85
|
+
this.state = seed >>> 0;
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
next(): number {
|
|
89
|
+
this.state = (this.state + 0x6d2b79f5) >>> 0;
|
|
90
|
+
let t = this.state ^ (this.state >>> 15);
|
|
91
|
+
t = Math.imul(t, this.state | 1);
|
|
92
|
+
t ^= t + Math.imul(t ^ (t >>> 7), t | 61);
|
|
93
|
+
return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
nextInt(maxExclusive: number): number {
|
|
97
|
+
return Math.floor(this.next() * maxExclusive);
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
function sampleParams(
|
|
102
|
+
distributions: ParamDistributions,
|
|
103
|
+
nIter: number,
|
|
104
|
+
randomState: number,
|
|
105
|
+
): Record<string, unknown>[] {
|
|
106
|
+
const keys = Object.keys(distributions);
|
|
107
|
+
if (keys.length === 0) {
|
|
108
|
+
throw new Error("paramDistributions must include at least one parameter.");
|
|
109
|
+
}
|
|
110
|
+
for (let i = 0; i < keys.length; i += 1) {
|
|
111
|
+
const values = distributions[keys[i]];
|
|
112
|
+
if (!Array.isArray(values) || values.length === 0) {
|
|
113
|
+
throw new Error(`paramDistributions '${keys[i]}' must be a non-empty array.`);
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
const rng = new Mulberry32(randomState);
|
|
118
|
+
const out: Record<string, unknown>[] = [];
|
|
119
|
+
for (let i = 0; i < nIter; i += 1) {
|
|
120
|
+
const params: Record<string, unknown> = {};
|
|
121
|
+
for (let k = 0; k < keys.length; k += 1) {
|
|
122
|
+
const key = keys[k];
|
|
123
|
+
const values = distributions[key];
|
|
124
|
+
params[key] = values[rng.nextInt(values.length)];
|
|
125
|
+
}
|
|
126
|
+
out.push(params);
|
|
127
|
+
}
|
|
128
|
+
return out;
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
/**
 * Hyper-parameter search that evaluates `nIter` randomly sampled parameter
 * settings via cross-validation, ranks them, and (by default) refits the best
 * setting on the full data — an sklearn-style RandomizedSearchCV.
 */
export class RandomizedSearchCV<TEstimator extends CrossValEstimator> {
  /** Estimator refit on all of (X, y) with bestParams_; null when refit=false. */
  bestEstimator_: TEstimator | null = null;
  /** Parameter setting of the top-ranked candidate. */
  bestParams_: Record<string, unknown> | null = null;
  /** Mean cross-validation score of the best candidate (un-negated). */
  bestScore_: number | null = null;
  /** One row per sampled candidate, with scores and 1-based rank. */
  cvResults_: RandomizedSearchResultRow[] = [];

  private readonly estimatorFactory: (params: Record<string, unknown>) => TEstimator;
  private readonly paramDistributions: ParamDistributions;
  private readonly cv?: number | CrossValSplitter;
  private readonly scoring?: BuiltInScoring | ScoringFn;
  private readonly refit: boolean;
  private readonly errorScore: "raise" | number;
  private readonly nIter: number;
  private readonly randomState: number;
  private isFitted = false;

  /**
   * @param estimatorFactory - Builds a fresh estimator for a parameter setting.
   * @param paramDistributions - Candidate values for each parameter.
   * @param options - cv / scoring / refit / errorScore / nIter / randomState.
   * @throws If estimatorFactory is not callable or nIter is not a positive integer.
   */
  constructor(
    estimatorFactory: (params: Record<string, unknown>) => TEstimator,
    paramDistributions: ParamDistributions,
    options: RandomizedSearchCVOptions = {},
  ) {
    if (typeof estimatorFactory !== "function") {
      throw new Error("estimatorFactory must be a function.");
    }
    this.estimatorFactory = estimatorFactory;
    this.paramDistributions = paramDistributions;
    this.cv = options.cv;
    this.scoring = options.scoring;
    this.refit = options.refit ?? true;
    this.errorScore = options.errorScore ?? "raise";
    this.nIter = options.nIter ?? 10;
    this.randomState = options.randomState ?? 42;
    if (!Number.isInteger(this.nIter) || this.nIter < 1) {
      throw new Error(`nIter must be an integer >= 1. Got ${this.nIter}.`);
    }
  }

  /**
   * Sample candidates, cross-validate each, assign ranks, and (when refit is
   * enabled) refit the best setting on all of (X, y).
   */
  fit(X: Matrix, y: Vector): this {
    const candidates = sampleParams(this.paramDistributions, this.nIter, this.randomState);
    // Loss metrics are negated in the objective so ranking always maximizes.
    const minimize = isLossMetric(this.scoring);
    const rows: RandomizedSearchResultRow[] = [];
    const objectiveScores: number[] = [];

    for (let candidateIndex = 0; candidateIndex < candidates.length; candidateIndex += 1) {
      const params = candidates[candidateIndex];
      try {
        const splitScores = crossValScore(
          // A fresh estimator per fold, built from this candidate's params.
          () => this.estimatorFactory(params),
          X,
          y,
          { cv: this.cv, scoring: this.scoring },
        );
        const meanTestScore = mean(splitScores);
        rows.push({
          params: { ...params },
          splitScores,
          meanTestScore,
          stdTestScore: std(splitScores),
          rank: 0, // filled in after every candidate has been scored
          status: "ok",
        });
        objectiveScores.push(minimize ? -meanTestScore : meanTestScore);
      } catch (error) {
        if (this.errorScore === "raise") {
          throw error;
        }
        // errorScore is numeric here: record the failure as that score so the
        // search can continue and the candidate still participates in ranking.
        rows.push({
          params: { ...params },
          splitScores: [this.errorScore],
          meanTestScore: this.errorScore,
          stdTestScore: 0,
          rank: 0,
          status: "error",
          errorMessage: error instanceof Error ? error.message : String(error),
        });
        objectiveScores.push(minimize ? -this.errorScore : this.errorScore);
      }
    }

    // Order candidates best-first; ties break toward the earlier candidate.
    const order = Array.from({ length: rows.length }, (_, idx) => idx).sort((a, b) => {
      const delta = objectiveScores[b] - objectiveScores[a];
      if (delta !== 0) {
        return delta;
      }
      return a - b;
    });

    for (let rank = 0; rank < order.length; rank += 1) {
      rows[order[rank]].rank = rank + 1;
    }

    const bestIndex = order[0];
    this.bestParams_ = { ...rows[bestIndex].params };
    this.bestScore_ = rows[bestIndex].meanTestScore;
    this.cvResults_ = rows;

    if (this.refit) {
      const estimator = this.estimatorFactory(this.bestParams_);
      estimator.fit(X, y);
      this.bestEstimator_ = estimator;
    } else {
      this.bestEstimator_ = null;
    }

    this.isFitted = true;
    return this;
  }

  /** Predict with the refit best estimator. Requires fit() and refit=true. */
  predict(X: Matrix): Vector {
    if (!this.isFitted) {
      throw new Error("RandomizedSearchCV has not been fitted.");
    }
    if (!this.refit || !this.bestEstimator_) {
      throw new Error("RandomizedSearchCV predict is unavailable when refit=false.");
    }
    return this.bestEstimator_.predict(X);
  }

  /**
   * Score the refit best estimator on (X, y): uses the configured scoring when
   * provided, otherwise falls back to the estimator's own score(), otherwise
   * throws.
   */
  score(X: Matrix, y: Vector): number {
    if (!this.isFitted) {
      throw new Error("RandomizedSearchCV has not been fitted.");
    }
    if (!this.refit || !this.bestEstimator_) {
      throw new Error("RandomizedSearchCV score is unavailable when refit=false.");
    }

    if (this.scoring) {
      const scorer =
        typeof this.scoring === "function" ? this.scoring : resolveBuiltInScorer(this.scoring);
      return scorer(y, this.bestEstimator_.predict(X));
    }

    if (typeof this.bestEstimator_.score === "function") {
      return this.bestEstimator_.score(X, y);
    }

    throw new Error("No scoring function available. Provide scoring in RandomizedSearchCV options.");
  }
}
|