npm - @wlearn/automl - Versions diffs - 0.1.0 - Mend

@wlearn/automl 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/package.json +27 -0
package/src/auto-fit.js +261 -0
package/src/common.js +108 -0
package/src/executor.js +209 -0
package/src/halving.js +95 -0
package/src/index.js +12 -0
package/src/leaderboard.js +106 -0
package/src/portfolio.js +332 -0
package/src/progressive.js +156 -0
package/src/sampler.js +162 -0
package/src/search.js +93 -0
package/src/strategy-halving.js +157 -0
package/src/strategy-progressive.js +126 -0
package/src/strategy-random.js +67 -0

package/package.json ADDED Viewed

@@ -0,0 +1,27 @@
+{
+  "name": "@wlearn/automl",
+  "version": "0.1.0",
+  "description": "AutoML engine for wlearn: search space sampling, random search, successive halving, ensemble construction",
+  "type": "module",
+  "main": "src/index.js",
+  "exports": {
+    ".": "./src/index.js"
+  },
+  "files": [
+    "src/"
+  ],
+  "sideEffects": false,
+  "dependencies": {
+    "@wlearn/core": "0.1.0",
+    "@wlearn/ensemble": "0.1.0",
+    "@wlearn/types": "0.1.0"
+  },
+  "scripts": {
+    "test": "node --test test/*.js"
+  },
+  "publishConfig": {
+    "access": "public"
+  },
+  "author": "Anton Zemlyansky",
+  "license": "MIT"
+}

package/src/auto-fit.js ADDED Viewed

@@ -0,0 +1,261 @@
+import { normalizeX, normalizeY, ValidationError, Preprocessor } from '@wlearn/core'
+import { getOofPredictions, caruanaSelect, VotingEnsemble, StackingEnsemble } from '@wlearn/ensemble'
+import { RandomSearch } from './search.js'
+import { SuccessiveHalvingSearch } from './halving.js'
+import { PortfolioSearch } from './portfolio.js'
+import { ProgressiveSearch } from './progressive.js'
+import { detectTask } from './common.js'
+/**
+ * Compute pairwise disagreement rate between two prediction vectors.
+ * For classification: fraction of samples where argmax differs.
+ * For regression: 1 - correlation (capped at [0,1]).
+ */
+function _disagreementRate(a, b, n, task) {
+  if (task === 'classification') {
+    const nClasses = a.length / n
+    let disagree = 0
+    for (let i = 0; i < n; i++) {
+      let bestA = 0, bestB = 0, bestVA = -Infinity, bestVB = -Infinity
+      for (let c = 0; c < nClasses; c++) {
+        const idx = i * nClasses + c
+        if (a[idx] > bestVA) { bestVA = a[idx]; bestA = c }
+        if (b[idx] > bestVB) { bestVB = b[idx]; bestB = c }
+      }
+      if (bestA !== bestB) disagree++
+    }
+    return disagree / n
+  }
+  // Regression: 1 - abs(correlation)
+  let sumA = 0, sumB = 0, sumAA = 0, sumBB = 0, sumAB = 0
+  for (let i = 0; i < n; i++) {
+    sumA += a[i]; sumB += b[i]
+    sumAA += a[i] * a[i]; sumBB += b[i] * b[i]
+    sumAB += a[i] * b[i]
+  }
+  const denom = Math.sqrt((sumAA - sumA * sumA / n) * (sumBB - sumB * sumB / n))
+  if (denom < 1e-12) return 1
+  const corr = (sumAB - sumA * sumB / n) / denom
+  return 1 - Math.abs(corr)
+}
+/**
+ * Filter pool by minimum pairwise disagreement.
+ * Always keeps index 0 (best model). Greedily adds candidates that
+ * have at least minDisagreement with all already-selected members.
+ * Returns array of retained indices.
+ */
+function _filterByDisagreement(oofPreds, yn, task, minDisagreement) {
+  const n = yn.length
+  if (oofPreds.length <= 2 || minDisagreement <= 0) {
+    return oofPreds.map((_, i) => i)
+  }
+  const kept = [0]
+  for (let i = 1; i < oofPreds.length; i++) {
+    let diverse = true
+    for (const j of kept) {
+      if (_disagreementRate(oofPreds[i], oofPreds[j], n, task) < minDisagreement) {
+        diverse = false
+        break
+      }
+    }
+    if (diverse) kept.push(i)
+  }
+  // Always keep at least 2 for ensemble
+  if (kept.length < 2 && oofPreds.length >= 2) {
+    if (!kept.includes(1)) kept.push(1)
+  }
+  return kept
+}
+/**
+ * Normalize model specs: accept both ModelSpec objects and [name, cls, params?] tuples.
+ */
+function _normalizeSpecs(models) {
+  return models.map(m => {
+    if (Array.isArray(m)) {
+      return { name: m[0], cls: m[1], params: m[2] || {} }
+    }
+    return m
+  })
+}
+/**
+ * High-level AutoML: random search + optional Caruana ensemble + refit.
+ *
+ * @param {Array} models - ModelSpec[] or EstimatorSpec tuples [name, cls, params?]
+ * @param {object|number[][]} X - feature matrix
+ * @param {TypedArray|number[]} y - labels
+ * @param {object} opts
+ * @returns {Promise<{ model: object, leaderboard: object[], bestParams: object, bestModelName: string, bestScore: number }>}
+ */
+export async function autoFit(models, X, y, opts = {}) {
+  const {
+    ensemble = true,
+    ensembleSize = 20,
+    refit = true,
+    strategy = 'random',
+    minDisagreement = 0.05,
+    stacking = 'auto',
+    metaEstimator = null,
+    preprocess = false,
+    onProgress = null,
+    ...searchOpts
+  } = opts
+  const specs = _normalizeSpecs(models)
+  if (specs.length === 0) {
+    throw new ValidationError('autoFit: at least one model is required')
+  }
+  // Optional preprocessing
+  let preprocessor = null
+  if (preprocess) {
+    const ppConfig = typeof preprocess === 'object' ? preprocess : {}
+    preprocessor = new Preprocessor(ppConfig)
+    const Xpre = normalizeX(X)
+    const ypre = normalizeY(y)
+    const Xt = preprocessor.fitTransform(Xpre, ypre)
+    X = Xt
+  }
+  // Run search
+  const searchOptsWithProgress = { ...searchOpts, onProgress }
+  let search
+  if (strategy === 'portfolio') {
+    search = new PortfolioSearch(specs, searchOptsWithProgress)
+  } else if (strategy === 'halving') {
+    search = new SuccessiveHalvingSearch(specs, searchOptsWithProgress)
+  } else if (strategy === 'progressive') {
+    search = new ProgressiveSearch(specs, searchOptsWithProgress)
+  } else {
+    search = new RandomSearch(specs, searchOptsWithProgress)
+  }
+  const { leaderboard, bestResult } = await search.fit(X, y)
+  const ranked = leaderboard.ranked()
+  const Xn = normalizeX(X)
+  const yn = normalizeY(y)
+  const task = searchOpts.task || detectTask(yn)
+  const scoring = searchOpts.scoring || (task === 'classification' ? 'accuracy' : 'r2')
+  const cv = searchOpts.cv || 5
+  const seed = searchOpts.seed || 42
+  let model = null
+  if (ensemble) {
+    if (onProgress) {
+      onProgress({ phase: 'ensemble', message: 'building ensemble' })
+    }
+    // Diversity-aware pool: best per family + top overall with disagreement filter
+    const familyBest = new Map()
+    const familySecond = new Map()
+    for (const entry of ranked) {
+      if (!familyBest.has(entry.modelName)) {
+        familyBest.set(entry.modelName, entry)
+      } else if (!familySecond.has(entry.modelName)) {
+        familySecond.set(entry.modelName, entry)
+      }
+    }
+    // Seed pool: best per family (guaranteed diversity)
+    const pool = [...familyBest.values()]
+    const poolIds = new Set(pool.map(e => e.id))
+    // Add second-best per family if available (for intra-family diversity)
+    for (const entry of familySecond.values()) {
+      if (pool.length >= ensembleSize * 2) break
+      if (!poolIds.has(entry.id)) {
+        pool.push(entry)
+        poolIds.add(entry.id)
+      }
+    }
+    // Fill remaining slots from top overall
+    for (const entry of ranked) {
+      if (pool.length >= ensembleSize * 2) break
+      if (!poolIds.has(entry.id)) {
+        pool.push(entry)
+        poolIds.add(entry.id)
+      }
+    }
+    // Map model names to classes
+    const clsMap = new Map()
+    for (const spec of specs) {
+      clsMap.set(spec.name, spec.cls)
+    }
+    // Build estimator specs for OOF
+    const estSpecs = pool.map((entry, i) => {
+      const cls = clsMap.get(entry.modelName)
+      return [`${entry.modelName}_${i}`, cls, entry.params]
+    })
+    // Generate OOF predictions
+    const { oofPreds } = await getOofPredictions(estSpecs, Xn, yn, {
+      cv, seed, task,
+    })
+    // Disagreement filter: remove near-duplicate predictions
+    const filteredIdx = _filterByDisagreement(oofPreds, yn, task, minDisagreement)
+    const filteredOofs = filteredIdx.map(i => oofPreds[i])
+    const filteredSpecs = filteredIdx.map(i => estSpecs[i])
+    // Caruana selection on filtered pool
+    const { indices: selIndices, weights } = caruanaSelect(filteredOofs, yn, {
+      maxSize: ensembleSize,
+      scoring,
+      task,
+    })
+    // Build ensemble from selected
+    const indices = selIndices
+    const selectedSpecs = Array.from(indices, i => filteredSpecs[i])
+    const selectedWeights = weights
+    // Determine if two-layer stacking should be used
+    const selectedFamilies = new Set(selectedSpecs.map(s => s[0].split('_')[0]))
+    const useStacking = stacking === true ||
+      (stacking === 'auto' && selectedFamilies.size >= 3 && metaEstimator)
+    if (useStacking && metaEstimator) {
+      // Two-layer stacking: L0 = selected base models, L1 = meta-model
+      const metaSpec = Array.isArray(metaEstimator)
+        ? metaEstimator
+        : ['meta', metaEstimator.cls || metaEstimator, metaEstimator.params || {}]
+      const ens = await StackingEnsemble.create({
+        estimators: selectedSpecs,
+        finalEstimator: metaSpec,
+        passthrough: true,
+        task,
+        cv,
+        seed,
+      })
+      await ens.fit(Xn, yn)
+      model = ens
+    } else {
+      // Default: VotingEnsemble
+      const ens = await VotingEnsemble.create({
+        estimators: selectedSpecs,
+        weights: selectedWeights,
+        voting: task === 'classification' ? 'soft' : undefined,
+        task,
+      })
+      await ens.fit(Xn, yn)
+      model = ens
+    }
+  } else if (refit) {
+    model = await search.refitBest(X, y)
+  }
+  return {
+    model,
+    preprocessor,
+    leaderboard: ranked,
+    bestParams: bestResult.params,
+    bestModelName: bestResult.modelName,
+    bestScore: bestResult.meanScore,
+  }
+}

package/src/common.js ADDED Viewed

@@ -0,0 +1,108 @@
+import { makeLCG } from '@wlearn/core'
+const { round } = Math
+/**
+ * Detect task type from labels.
+ */
+export function detectTask(y) {
+  if (y instanceof Int32Array) return 'classification'
+  const unique = new Set()
+  for (let i = 0; i < y.length; i++) {
+    if (y[i] !== round(y[i])) return 'regression'
+    unique.add(y[i])
+  }
+  return unique.size <= 20 ? 'classification' : 'regression'
+}
+/**
+ * High-resolution timer.
+ */
+export function now() {
+  if (typeof performance !== 'undefined') return performance.now()
+  return Date.now()
+}
+/**
+ * Deterministic serializer: object keys are emitted in sorted order so that
+ * equal params always yield the same string.
+ * Numbers go through toString(), strings and keys through JSON.stringify,
+ * and anything else falls back to String().
+ * Params must be JSON-serializable primitives only (enforced by SearchSpace IR).
+ */
+function stableStringify(obj) {
+  if (obj === null || obj === undefined) return String(obj)
+  switch (typeof obj) {
+    case 'number':
+      return obj.toString()
+    case 'string':
+      return JSON.stringify(obj)
+    case 'boolean':
+      return String(obj)
+    case 'object': {
+      if (Array.isArray(obj)) {
+        const items = obj.map(item => stableStringify(item))
+        return `[${items.join(',')}]`
+      }
+      const pairs = Object.keys(obj)
+        .sort()
+        .map(key => `${JSON.stringify(key)}:${stableStringify(obj[key])}`)
+      return `{${pairs.join(',')}}`
+    }
+    default:
+      return String(obj)
+  }
+}
+/**
+ * Stable candidate ID from model label and params.
+ */
+export function makeCandidateId(modelLabel, params) {
+  return modelLabel + ':' + stableStringify(params)
+}
+/**
+ * Simple integer hash for strings (FNV-1a inspired).
+ */
+function hashString(str) {
+  let h = 0x811c9dc5
+  for (let i = 0; i < str.length; i++) {
+    h ^= str.charCodeAt(i)
+    h = (h * 0x01000193) & 0x7fffffff
+  }
+  return h
+}
+/**
+ * Derive a deterministic seed from base seed, candidate ID, and fold index.
+ */
+export function seedFor(candidateId, foldIdx, baseSeed) {
+  const h = hashString(candidateId)
+  // Mix: multiply-xor-shift
+  let s = (baseSeed * 2654435761 + h * 40503 + foldIdx * 65537) & 0x7fffffff
+  s = ((s >>> 16) ^ s) * 0x45d9f3b & 0x7fffffff
+  return s
+}
+/**
+ * Partial Fisher-Yates: shuffle only first k positions of indices array.
+ * O(k) time, mutates indices in-place. Returns indices subarray [0..k-1].
+ */
+export function partialShuffle(indices, k, rng) {
+  const n = indices.length
+  const m = Math.min(k, n)
+  for (let i = 0; i < m; i++) {
+    const j = i + ((rng() * (n - i)) | 0)
+    const tmp = indices[i]
+    indices[i] = indices[j]
+    indices[j] = tmp
+  }
+  return indices.subarray ? indices.subarray(0, m) : indices.slice(0, m)
+}
+/**
+ * Map scoring name to greaterIsBetter.
+ * All built-in scorers are greater-is-better (neg_mse, neg_mae are negated).
+ * Custom functions default to true.
+ */
+export function scorerGreaterIsBetter(scoring) {
+  if (typeof scoring === 'function') return true
+  switch (scoring) {
+    case 'accuracy':
+    case 'r2':
+    case 'neg_mse':
+    case 'neg_mae':
+      return true
+    default:
+      return true
+  }
+}

package/src/executor.js ADDED Viewed

@@ -0,0 +1,209 @@
+import { normalizeX, normalizeY, makeLCG, getScorer } from '@wlearn/core'
+import { Leaderboard } from './leaderboard.js'
+import { now, seedFor, partialShuffle } from './common.js'
+const { ceil, min } = Math
+/**
+ * Subset rows of X by index array.
+ */
+function subsetX(X, indices) {
+  const { data, cols } = X
+  const rows = indices.length
+  const out = new Float64Array(rows * cols)
+  for (let i = 0; i < rows; i++) {
+    const srcOff = indices[i] * cols
+    out.set(data.subarray(srcOff, srcOff + cols), i * cols)
+  }
+  return { data: out, rows, cols }
+}
+/**
+ * Subset labels by index array.
+ */
+function subsetY(y, indices) {
+  const out = new (y.constructor)(indices.length)
+  for (let i = 0; i < indices.length; i++) {
+    out[i] = y[indices[i]]
+  }
+  return out
+}
+/**
+ * Executor: evaluation engine and canonical leaderboard owner.
+ *
+ * Evaluates candidates across all CV folds, applies budgets,
+ * records results in a single Leaderboard instance.
+ */
+export class Executor {
+  #folds
+  #scorerFn
+  #X
+  #y
+  #timeLimitMs
+  #seed
+  #startTime
+  #leaderboard
+  #onProgress
+  /**
+   * @param {object} opts
+   * @param {Array<{train: Int32Array, test: Int32Array}>} opts.folds - CV folds
+   * @param {string|Function} opts.scoring - scorer name or function
+   * @param {object} opts.X - normalized feature matrix
+   * @param {TypedArray} opts.y - normalized labels
+   * @param {number} opts.timeLimitMs - global time limit (0 = no limit)
+   * @param {number} opts.seed - base seed for reproducibility
+   * @param {Function} opts.onProgress - optional progress callback
+   */
+  constructor({ folds, scoring, X, y, timeLimitMs = 0, seed = 42, onProgress }) {
+    this.#folds = folds
+    this.#scorerFn = getScorer(scoring)
+    this.#X = X
+    this.#y = y
+    this.#timeLimitMs = timeLimitMs
+    this.#seed = seed
+    this.#startTime = now()
+    this.#leaderboard = new Leaderboard()
+    this.#onProgress = onProgress || null
+  }
+  get leaderboard() {
+    return this.#leaderboard
+  }
+  get isTimedOut() {
+    if (this.#timeLimitMs <= 0) return false
+    return (now() - this.#startTime) > this.#timeLimitMs
+  }
+  /**
+   * Evaluate one candidate across all CV folds.
+   *
+   * @param {object} candidateEval
+   * @param {string} candidateEval.candidateId - stable identifier
+   * @param {object} candidateEval.cls - estimator class with create/fit/predict/dispose
+   * @param {object} candidateEval.params - hyperparameters
+   * @param {object} [candidateEval.budget] - optional budget constraint
+   * @returns {Promise<object>} CandidateResult
+   */
+  async evaluateCandidate({ candidateId, cls, params, budget }) {
+    const folds = this.#folds
+    const scores = new Float64Array(folds.length)
+    const t0 = now()
+    let totalTrainUsed = 0
+    // Resolve effective params (apply rounds budget if applicable)
+    const effectiveParams = this.#applyRoundsBudget(cls, params, budget)
+    for (let f = 0; f < folds.length; f++) {
+      let { train, test } = folds[f]
+      // Apply subsample budget to train only
+      if (budget && budget.type === 'subsample') {
+        train = this.#subsampleTrain(train, budget.value, candidateId, f)
+      }
+      totalTrainUsed += train.length
+      const Xtrain = subsetX(this.#X, train)
+      const ytrain = subsetY(this.#y, train)
+      const Xtest = subsetX(this.#X, test)
+      const ytest = subsetY(this.#y, test)
+      const model = await cls.create(effectiveParams)
+      try {
+        model.fit(Xtrain, ytrain)
+        const preds = await model.predict(Xtest)
+        scores[f] = this.#scorerFn(ytest, preds)
+      } finally {
+        model.dispose()
+      }
+    }
+    const fitTimeMs = now() - t0
+    // Record in leaderboard
+    const entry = this.#leaderboard.add({
+      modelName: candidateId.split(':')[0],
+      params,
+      scores,
+      fitTimeMs,
+    })
+    return {
+      candidateId,
+      meanScore: entry.meanScore,
+      foldScores: scores,
+      stdScore: entry.stdScore,
+      fitTimeMs,
+      nTrainUsed: Math.round(totalTrainUsed / folds.length),
+      nTest: folds[0].test.length,
+    }
+  }
+  /**
+   * Apply rounds budget by setting the model's rounds param if:
+   * 1. Budget type is 'rounds'
+   * 2. Model exposes budgetSpec().roundsParam
+   * 3. Candidate params don't already set that param (candidate config wins)
+   */
+  #applyRoundsBudget(cls, params, budget) {
+    if (!budget || budget.type !== 'rounds') return params
+    const spec = cls.budgetSpec?.()
+    if (!spec || !spec.roundsParam) return params
+    if (params[spec.roundsParam] !== undefined) return params
+    return { ...params, [spec.roundsParam]: budget.value }
+  }
+  /**
+   * Subsample train indices using partial Fisher-Yates with deterministic seed.
+   * Returns a new array of selected indices. Test indices are never subsampled.
+   */
+  #subsampleTrain(train, fraction, candidateId, foldIdx) {
+    const k = Math.max(1, ceil(train.length * fraction))
+    if (k >= train.length) return train
+    // Copy to avoid mutating the original fold indices
+    const copy = new Int32Array(train)
+    const seed = seedFor(candidateId, foldIdx, this.#seed)
+    const rng = makeLCG(seed)
+    return partialShuffle(copy, k, rng)
+  }
+  /**
+   * Run a strategy to completion.
+   * Returns { leaderboard } only. Callers decide "best".
+   */
+  async runStrategy(strategy) {
+    let done = 0
+    while (!strategy.isDone()) {
+      if (this.isTimedOut) break
+      const task = strategy.next()
+      if (task === null) break
+      try {
+        const result = await this.evaluateCandidate(task)
+        strategy.report(result)
+        done++
+        if (this.#onProgress) {
+          const best = this.#leaderboard.best()
+          this.#onProgress({
+            phase: 'search',
+            candidatesDone: done,
+            bestScore: best ? best.meanScore : null,
+            bestModel: best ? best.modelName : null,
+            lastCandidate: {
+              model: result.candidateId.split(':')[0],
+              score: result.meanScore,
+              timeMs: result.fitTimeMs,
+            },
+            elapsedMs: now() - this.#startTime,
+          })
+        }
+      } catch {
+        done++
+        // Skip failed candidates (invalid params, create errors, etc.)
+      }
+    }
+    return { leaderboard: this.#leaderboard }
+  }
+}

package/src/halving.js ADDED Viewed

@@ -0,0 +1,95 @@
+import { stratifiedKFold, kFold, normalizeX, normalizeY,
+  ValidationError } from '@wlearn/core'
+import { Executor } from './executor.js'
+import { HalvingStrategy } from './strategy-halving.js'
+import { detectTask, scorerGreaterIsBetter } from './common.js'
+/**
+ * Successive halving search: multi-round elimination tournament.
+ * Evaluates many candidates on small subsamples, progressively
+ * eliminates the worst and increases resource allocation.
+ */
+export class SuccessiveHalvingSearch {
+  #models
+  #opts
+  #leaderboard = null
+  #bestResult = null
+  #rounds = null
+  constructor(models, opts = {}) {
+    if (!models || models.length === 0) {
+      throw new ValidationError('SuccessiveHalvingSearch: at least one model is required')
+    }
+    this.#models = models
+    this.#opts = {
+      scoring: null,
+      cv: 5,
+      seed: 42,
+      task: null,
+      nIter: 20,
+      maxTimeMs: 0,
+      factor: 3,
+      minResources: 0,
+      onProgress: null,
+      ...opts,
+    }
+  }
+  async fit(X, y) {
+    const Xn = normalizeX(X)
+    const yn = normalizeY(y)
+    const n = Xn.rows
+    const task = this.#opts.task || detectTask(yn)
+    const scoring = this.#opts.scoring || (task === 'classification' ? 'accuracy' : 'r2')
+    const greaterIsBetter = scorerGreaterIsBetter(scoring)
+    const { cv, seed, nIter, maxTimeMs, factor, onProgress } = this.#opts
+    // Generate base folds on full data
+    const folds = task === 'classification'
+      ? stratifiedKFold(yn, cv, { shuffle: true, seed })
+      : kFold(n, cv, { shuffle: true, seed })
+    const executor = new Executor({
+      folds,
+      scoring,
+      X: Xn,
+      y: yn,
+      timeLimitMs: maxTimeMs,
+      seed,
+      onProgress,
+    })
+    const strategy = new HalvingStrategy(this.#models, {
+      nIter,
+      seed,
+      factor,
+      nSamples: n,
+      greaterIsBetter,
+      cv,
+    })
+    const { leaderboard } = await executor.runStrategy(strategy)
+    this.#leaderboard = leaderboard
+    this.#bestResult = leaderboard.best()
+    this.#rounds = strategy.rounds
+    return { leaderboard, bestResult: this.#bestResult, rounds: this.#rounds }
+  }
+  async refitBest(X, y) {
+    if (!this.#bestResult) {
+      throw new ValidationError('SuccessiveHalvingSearch: must call fit() first')
+    }
+    const best = this.#bestResult
+    const model = this.#models.find(m => m.name === best.modelName)
+    const instance = await model.cls.create(best.params)
+    const Xn = normalizeX(X)
+    const yn = normalizeY(y)
+    instance.fit(Xn, yn)
+    return instance
+  }
+  get leaderboard() { return this.#leaderboard }
+  get bestResult() { return this.#bestResult }
+  get rounds() { return this.#rounds }
+}