@wlearn/ensemble 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +26 -0
- package/src/bagging.js +410 -0
- package/src/index.js +6 -0
- package/src/oof.js +96 -0
- package/src/selection.js +127 -0
- package/src/stacking.js +372 -0
- package/src/voting.js +311 -0
- package/src/weights.js +143 -0
package/package.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@wlearn/ensemble",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Ensemble methods for wlearn: voting, stacking, Caruana selection",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "src/index.js",
|
|
7
|
+
"exports": {
|
|
8
|
+
".": "./src/index.js"
|
|
9
|
+
},
|
|
10
|
+
"files": [
|
|
11
|
+
"src/"
|
|
12
|
+
],
|
|
13
|
+
"sideEffects": false,
|
|
14
|
+
"dependencies": {
|
|
15
|
+
"@wlearn/core": "0.1.0",
|
|
16
|
+
"@wlearn/types": "0.1.0"
|
|
17
|
+
},
|
|
18
|
+
"scripts": {
|
|
19
|
+
"test": "node --test test/*.js"
|
|
20
|
+
},
|
|
21
|
+
"publishConfig": {
|
|
22
|
+
"access": "public"
|
|
23
|
+
},
|
|
24
|
+
"author": "Anton Zemlyansky",
|
|
25
|
+
"license": "MIT"
|
|
26
|
+
}
|
package/src/bagging.js
ADDED
|
@@ -0,0 +1,410 @@
|
|
|
1
|
+
import {
|
|
2
|
+
encodeBundle, decodeBundle, register, load as registryLoad,
|
|
3
|
+
normalizeX, normalizeY, accuracy, r2Score,
|
|
4
|
+
stratifiedKFold, kFold,
|
|
5
|
+
ValidationError, NotFittedError, DisposedError,
|
|
6
|
+
lift
|
|
7
|
+
} from '@wlearn/core'
|
|
8
|
+
|
|
9
|
+
// Bundle type ids written to the manifest by save(); one per task.
const TYPE_ID_CLS = 'wlearn.ensemble.bagged.classifier@1'
const TYPE_ID_REG = 'wlearn.ensemble.bagged.regressor@1'

// Set to true by BaggedEstimator._register() so both loaders are registered only once.
let _registered = false
|
|
12
|
+
|
|
13
|
+
/**
 * K-fold bagged estimator with out-of-fold (OOF) prediction storage.
 *
 * Trains kFold * nRepeats copies of a base model. Each repeat uses a different
 * seed (seed + repeat index) for fold assignment. OOF predictions are
 * accumulated (sum + count) and averaged, matching AutoGluon's
 * BaggedEnsembleModel pattern.
 *
 * Constructor params:
 *   estimator - [name, EstimatorClass, params] spec. EstimatorClass.create(params)
 *               must return (or resolve to) a model exposing fit, predict,
 *               predictProba (classification), save and dispose.
 *   kFold     - folds per repeat (default 5)
 *   nRepeats  - number of repeats (default 1)
 *   task      - 'classification' (default) or 'regression'
 *   seed      - base seed for fold shuffling (default 42)
 */
export class BaggedEstimator {
  #spec         // [name, Class, params]
  #kFold
  #nRepeats
  #task
  #seed
  #foldModels   // fitted model instances, length K * nRepeats
  #classes
  #nClasses = 0
  #nSamples = 0
  #oofAccum     // Float64Array: accumulated OOF predictions (sum)
  #oofCounts    // Uint32Array: per-sample prediction count
  #fitConfig    // { kFold, nRepeats, seed } the current fold models were built with
  #fitted = false
  #disposed = false

  constructor(params = {}) {
    this.#spec = params.estimator || null
    this.#kFold = params.kFold || 5
    this.#nRepeats = params.nRepeats || 1
    this.#task = params.task || 'classification'
    this.#seed = params.seed ?? 42
    this.#foldModels = null
    this.#classes = null
    this.#oofAccum = null
    this.#oofCounts = null
    this.#fitConfig = null
    BaggedEstimator._register()
  }

  /** Async factory mirroring the create() contract expected of base estimators. */
  static async create(params = {}) {
    return new BaggedEstimator(params)
  }

  #ensureAlive() {
    if (this.#disposed) throw new DisposedError('BaggedEstimator has been disposed.')
  }

  #ensureFitted() {
    this.#ensureAlive()
    if (!this.#fitted) throw new NotFittedError('BaggedEstimator is not fitted. Call fit() first.')
  }

  /**
   * Train one base model per fold per repeat and accumulate OOF predictions.
   *
   * State is committed only after every fold succeeded: a failed (re)fit
   * disposes the models it created and leaves any previous fit untouched.
   * A successful refit disposes the previous fold models.
   *
   * @param {Object|number[][]} X - feature matrix (passed through normalizeX)
   * @param {TypedArray|number[]} y - targets (passed through normalizeY)
   * @returns {Promise<BaggedEstimator>} this
   * @throws {DisposedError} if the estimator was disposed
   * @throws {ValidationError} if no usable estimator spec was supplied
   */
  async fit(X, y) {
    this.#ensureAlive()
    if (!Array.isArray(this.#spec) || typeof this.#spec[1]?.create !== 'function') {
      throw new ValidationError(
        'BaggedEstimator: "estimator" must be a [name, EstimatorClass, params] spec'
      )
    }

    const Xn = normalizeX(X)
    const yn = normalizeY(y)
    const n = Xn.rows
    const isClassification = this.#task === 'classification'

    // Snapshot the configuration so setParams() calls made while this method
    // is awaiting cannot change the shape of the run half-way through.
    const foldCount = this.#kFold
    const repeatCount = this.#nRepeats
    const baseSeed = this.#seed
    const [, EstClass, estParams] = this.#spec

    let classes = null
    let nc = 0
    if (isClassification) {
      const labelSet = new Set()
      for (let i = 0; i < yn.length; i++) labelSet.add(yn[i])
      classes = new Int32Array([...labelSet].sort((a, b) => a - b))
      nc = classes.length
    }

    const oofAccum = new Float64Array(isClassification ? n * nc : n)
    // 32-bit counters: every sample is predicted once per repeat, and an
    // 8-bit counter would wrap around once nRepeats exceeds 255.
    const oofCounts = new Uint32Array(n)
    const foldModels = []

    try {
      for (let repeat = 0; repeat < repeatCount; repeat++) {
        const repeatSeed = baseSeed + repeat
        const folds = isClassification
          ? stratifiedKFold(yn, foldCount, { shuffle: true, seed: repeatSeed })
          : kFold(n, foldCount, { shuffle: true, seed: repeatSeed })

        for (const { train, test } of folds) {
          const Xtrain = _subsetX(Xn, train)
          const ytrain = _subsetY(yn, train)
          const Xtest = _subsetX(Xn, test)

          const model = await EstClass.create(estParams || {})
          // Track the model before fitting so a failure below still disposes it
          foldModels.push(model)
          // fit() may be asynchronous (this class's own fit is); predicting
          // before it settles would read an untrained model.
          await model.fit(Xtrain, ytrain)

          if (isClassification) {
            const proba = await model.predictProba(Xtest)
            for (let i = 0; i < test.length; i++) {
              const row = test[i]
              for (let c = 0; c < nc; c++) {
                oofAccum[row * nc + c] += proba[i * nc + c]
              }
            }
          } else {
            const preds = await model.predict(Xtest)
            for (let i = 0; i < test.length; i++) {
              oofAccum[test[i]] += preds[i]
            }
          }

          for (let i = 0; i < test.length; i++) {
            oofCounts[test[i]] += 1
          }
        }
      }
    } catch (err) {
      // The partially built ensemble is discarded; release what was created
      for (const m of foldModels) m.dispose()
      throw err
    }

    const previousModels = this.#foldModels
    this.#nSamples = n
    this.#classes = classes
    this.#nClasses = nc
    this.#oofAccum = oofAccum
    this.#oofCounts = oofCounts
    this.#foldModels = foldModels
    this.#fitConfig = { kFold: foldCount, nRepeats: repeatCount, seed: baseSeed }
    this.#fitted = true

    if (previousModels) {
      for (const m of previousModels) m.dispose()
    }
    return this
  }

  /**
   * Predict with the averaged fold models.
   * Regression: mean of the fold models' predictions.
   * Classification: argmax over the averaged probabilities, mapped to a class label.
   *
   * @param {Object|number[][]} X - feature matrix
   * @returns {Float64Array|Promise<Float64Array>} a promise when any fold model predicts asynchronously
   */
  predict(X) {
    this.#ensureFitted()
    const Xn = normalizeX(X)
    const n = Xn.rows

    if (this.#task === 'regression') {
      return this.#averageOutputs(this.#foldModels.map(model => model.predict(Xn)), n)
    }

    const nc = this.#nClasses
    const classes = this.#classes
    // NOTE(review): lift() comes from @wlearn/core and is relied on to apply
    // the callback to either a plain value or a promise — confirm.
    return lift(this.predictProba(Xn), proba => {
      const out = new Float64Array(n)
      for (let i = 0; i < n; i++) {
        let bestC = 0
        let bestV = -Infinity
        for (let c = 0; c < nc; c++) {
          if (proba[i * nc + c] > bestV) {
            bestV = proba[i * nc + c]
            bestC = c
          }
        }
        out[i] = classes[bestC]
      }
      return out
    })
  }

  /**
   * Class probabilities averaged over all fold models,
   * flat (rows * nClasses) row-major.
   *
   * @param {Object|number[][]} X - feature matrix
   * @returns {Float64Array|Promise<Float64Array>}
   * @throws {ValidationError} when the task is not classification
   */
  predictProba(X) {
    this.#ensureFitted()
    if (this.#task !== 'classification') {
      throw new ValidationError('predictProba is only available for classification')
    }

    const Xn = normalizeX(X)
    const outputs = this.#foldModels.map(model => model.predictProba(Xn))
    return this.#averageOutputs(outputs, Xn.rows * this.#nClasses)
  }

  /**
   * Score predictions for X against y: accuracy for classification,
   * r2Score otherwise.
   */
  score(X, y) {
    this.#ensureFitted()
    const preds = this.predict(X)
    const yn = normalizeY(y)
    const scorer = this.#task === 'classification' ? accuracy : r2Score
    return lift(preds, p => scorer(yn, p))
  }

  /**
   * Element-wise mean of per-model outputs (the first `size` entries of each).
   * Stays synchronous unless at least one output is a thenable.
   */
  #averageOutputs(outputs, size) {
    const assemble = (resolved) => {
      const mean = new Float64Array(size)
      for (const values of resolved) {
        for (let i = 0; i < size; i++) mean[i] += values[i]
      }
      const nModels = resolved.length
      for (let i = 0; i < size; i++) mean[i] /= nModels
      return mean
    }

    const hasPromise = outputs.some(out => out != null && typeof out.then === 'function')
    return hasPromise ? Promise.all(outputs).then(assemble) : assemble(outputs)
  }

  /**
   * Averaged OOF predictions (a fresh copy on every access).
   * Classification: flat (n * nClasses) row-major probabilities.
   * Regression: flat (n) predictions.
   */
  get oofPredictions() {
    this.#ensureFitted()
    const width = this.#task === 'classification' ? this.#nClasses : 1
    const oof = new Float64Array(this.#oofAccum)
    for (let i = 0; i < this.#nSamples; i++) {
      // Rows that were never predicted keep their zero sum (divide by 1)
      const count = this.#oofCounts[i] || 1
      for (let j = 0; j < width; j++) {
        oof[i * width + j] /= count
      }
    }
    return oof
  }

  /**
   * Serialize the fitted ensemble: one 'fold_<i>' artifact per fold model
   * plus an 'oof' artifact holding the averaged OOF predictions.
   * The manifest records the kFold / nRepeats / seed the fold models were
   * actually trained with, so a later setParams() cannot desynchronize the
   * manifest from the number of stored fold artifacts.
   */
  save() {
    this.#ensureFitted()
    const typeId = this.#task === 'classification' ? TYPE_ID_CLS : TYPE_ID_REG
    const fitted = this.#fitConfig

    const manifest = {
      typeId,
      params: {
        task: this.#task,
        kFold: fitted.kFold,
        nRepeats: fitted.nRepeats,
        seed: fitted.seed,
        estimatorName: this.#spec[0],
        classes: this.#classes ? [...this.#classes] : null,
        nClasses: this.#nClasses,
        nSamples: this.#nSamples,
      },
    }

    const artifacts = this.#foldModels.map((model, i) => ({
      id: `fold_${i}`,
      data: model.save(),
      mediaType: 'application/x-wlearn-bundle',
    }))

    // Store OOF data as the raw bytes of the Float64Array (platform byte order)
    const oof = this.oofPredictions
    const oofBytes = new Uint8Array(oof.buffer, oof.byteOffset, oof.byteLength)
    artifacts.push({
      id: 'oof',
      data: oofBytes,
      mediaType: 'application/octet-stream',
    })

    return encodeBundle(manifest, artifacts)
  }

  /** Decode a bundle produced by save() and rebuild the fitted estimator. */
  static async load(bytes) {
    const { manifest, toc, blobs } = decodeBundle(bytes)
    return BaggedEstimator._loadFromParts(manifest, toc, blobs)
  }

  /** Dispose every fold model and drop the OOF buffers. Safe to call twice. */
  dispose() {
    if (this.#disposed) return
    this.#disposed = true
    if (this.#foldModels) {
      for (const m of this.#foldModels) m.dispose()
    }
    this.#foldModels = null
    this.#oofAccum = null
    this.#oofCounts = null
  }

  getParams() {
    return {
      task: this.#task,
      kFold: this.#kFold,
      nRepeats: this.#nRepeats,
      seed: this.#seed,
      estimatorName: this.#spec ? this.#spec[0] : null,
    }
  }

  /** Update kFold / nRepeats / seed; takes effect on the next fit(). */
  setParams(p) {
    this.#ensureAlive()
    if (p.kFold !== undefined) this.#kFold = p.kFold
    if (p.nRepeats !== undefined) this.#nRepeats = p.nRepeats
    if (p.seed !== undefined) this.#seed = p.seed
    return this
  }

  get capabilities() {
    return {
      classifier: this.#task === 'classification',
      regressor: this.#task === 'regression',
      predictProba: this.#task === 'classification',
      decisionFunction: false,
      sampleWeight: false,
      csr: false,
      earlyStopping: false,
    }
  }

  get isFitted() { return this.#fitted && !this.#disposed }
  get classes() { return this.#classes }

  // --- Static internals ---

  /** Register the bundle loaders for both type ids (once per module). */
  static _register() {
    if (_registered) return
    _registered = true
    const loader = (manifest, toc, blobs) =>
      BaggedEstimator._loadFromParts(manifest, toc, blobs)
    register(TYPE_ID_CLS, loader)
    register(TYPE_ID_REG, loader)
  }

  /**
   * Rebuild a fitted estimator from decoded bundle parts.
   * The restored instance can predict but carries no estimator class,
   * so it cannot be refitted.
   *
   * @throws {ValidationError} when a 'fold_<i>' artifact is missing
   */
  static async _loadFromParts(manifest, toc, blobs) {
    const p = manifest.params
    const bag = new BaggedEstimator({
      task: p.task,
      kFold: p.kFold || 5,
      nRepeats: p.nRepeats || 1,
      seed: p.seed ?? 42,
    })
    bag.#classes = p.classes ? new Int32Array(p.classes) : null
    bag.#nClasses = p.nClasses || 0
    bag.#nSamples = p.nSamples || 0
    bag.#spec = [p.estimatorName || 'base', null, null]
    bag.#fitConfig = { kFold: bag.#kFold, nRepeats: bag.#nRepeats, seed: bag.#seed }

    // Load fold models
    const nFoldModels = bag.#kFold * bag.#nRepeats
    const foldModels = []
    try {
      for (let i = 0; i < nFoldModels; i++) {
        const foldId = `fold_${i}`
        const entry = toc.find(t => t.id === foldId)
        if (!entry) throw new ValidationError(`No artifact for "${foldId}"`)
        const blob = blobs.subarray(entry.offset, entry.offset + entry.length)
        foldModels.push(await registryLoad(blob))
      }
    } catch (err) {
      // Do not leak the models that were already restored
      for (const m of foldModels) m.dispose()
      throw err
    }
    bag.#foldModels = foldModels

    // Load OOF data. The bundle stores averages, so every count is 1.
    const oofEntry = toc.find(t => t.id === 'oof')
    if (oofEntry) {
      const oofBlob = blobs.subarray(oofEntry.offset, oofEntry.offset + oofEntry.length)
      // slice() copies the bytes into a fresh buffer starting at offset 0,
      // so the Float64Array view does not depend on the blob's own offset
      bag.#oofAccum = new Float64Array(
        oofBlob.buffer.slice(oofBlob.byteOffset, oofBlob.byteOffset + oofBlob.byteLength)
      )
      bag.#oofCounts = new Uint32Array(bag.#nSamples).fill(1)
    } else {
      const width = bag.#task === 'classification' ? bag.#nClasses : 1
      bag.#oofAccum = new Float64Array(bag.#nSamples * width)
      bag.#oofCounts = new Uint32Array(bag.#nSamples)
    }

    bag.#fitted = true
    return bag
  }
}
|
|
390
|
+
|
|
391
|
+
// --- Subset helpers ---
|
|
392
|
+
|
|
393
|
+
/**
 * Gather the rows listed in `indices` from a row-major matrix
 * ({ data, rows, cols }) into a new matrix, in the order given.
 */
function _subsetX(X, indices) {
  const width = X.cols
  const count = indices.length
  const picked = new Float64Array(count * width)
  for (let r = 0, dst = 0; r < count; r++, dst += width) {
    const start = indices[r] * width
    picked.set(X.data.subarray(start, start + width), dst)
  }
  return { data: picked, rows: count, cols: width }
}
|
|
403
|
+
|
|
404
|
+
/**
 * Gather the entries of `y` at `indices` into a new container of the
 * same constructor as `y`.
 */
function _subsetY(y, indices) {
  const count = indices.length
  const SameType = y.constructor
  const picked = new SameType(count)
  for (let k = 0; k < count; k++) picked[k] = y[indices[k]]
  return picked
}
|
package/src/index.js
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
export { VotingEnsemble } from './voting.js'
|
|
2
|
+
export { StackingEnsemble } from './stacking.js'
|
|
3
|
+
export { BaggedEstimator } from './bagging.js'
|
|
4
|
+
export { caruanaSelect } from './selection.js'
|
|
5
|
+
export { getOofPredictions } from './oof.js'
|
|
6
|
+
export { optimizeWeights, projectSimplex } from './weights.js'
|
package/src/oof.js
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import { stratifiedKFold, kFold, normalizeX, normalizeY, ValidationError } from '@wlearn/core'
|
|
2
|
+
|
|
3
|
+
/**
 * Generate out-of-fold predictions for a list of estimator specs.
 *
 * Every spec is trained once per fold on the fold's training rows and asked
 * to predict the held-out rows; all specs share the same fold assignment.
 * Each fold model is disposed as soon as its predictions have been copied.
 *
 * @param {Array<[string, Function, Object]>} estimatorSpecs - [name, EstimatorClass, params]
 * @param {Object|number[][]} X - feature matrix
 * @param {TypedArray|number[]} y - labels
 * @param {Object} opts
 * @param {number} [opts.cv=5] - number of folds
 * @param {number} [opts.seed=42] - seed for the fold shuffle
 * @param {string} [opts.task='classification'] - 'classification' or 'regression'
 * @returns {Promise<{ oofPreds: Float64Array[], classes: Int32Array|null }>}
 *   oofPreds has one entry per spec: flat (n * nClasses) row-major
 *   probabilities for classification, flat (n) predictions otherwise.
 *   classes holds the sorted distinct labels (null for regression).
 */
export async function getOofPredictions(estimatorSpecs, X, y, {
  cv = 5,
  seed = 42,
  task = 'classification',
} = {}) {
  const Xn = normalizeX(X)
  const yn = normalizeY(y)
  const n = Xn.rows
  const isClassification = task === 'classification'

  // Materialize the folds: they are walked once per estimator, which a
  // one-shot iterable would not survive.
  const folds = Array.from(
    isClassification
      ? stratifiedKFold(yn, cv, { shuffle: true, seed })
      : kFold(n, cv, { shuffle: true, seed })
  )

  // Discover classes for classification
  let classes = null
  let nClasses = 0
  if (isClassification) {
    const labelSet = new Set()
    for (let i = 0; i < yn.length; i++) labelSet.add(yn[i])
    classes = new Int32Array([...labelSet].sort((a, b) => a - b))
    nClasses = classes.length
  }

  const oofPreds = []

  for (const [, EstimatorClass, params] of estimatorSpecs) {
    const oof = new Float64Array(isClassification ? n * nClasses : n)

    for (const { train, test } of folds) {
      const Xtrain = _subsetX(Xn, train)
      const ytrain = _subsetY(yn, train)
      const Xtest = _subsetX(Xn, test)

      const model = await EstimatorClass.create(params || {})
      try {
        // fit() may be asynchronous; predicting before it settles would
        // read an untrained model.
        await model.fit(Xtrain, ytrain)
        if (isClassification) {
          const proba = await model.predictProba(Xtest)
          for (let i = 0; i < test.length; i++) {
            const row = test[i]
            for (let c = 0; c < nClasses; c++) {
              oof[row * nClasses + c] = proba[i * nClasses + c]
            }
          }
        } else {
          const preds = await model.predict(Xtest)
          for (let i = 0; i < test.length; i++) {
            oof[test[i]] = preds[i]
          }
        }
      } finally {
        model.dispose()
      }
    }
    oofPreds.push(oof)
  }

  return { oofPreds, classes }
}
|
|
76
|
+
|
|
77
|
+
// --- Internal helpers (same as core/cv.js) ---
|
|
78
|
+
|
|
79
|
+
/**
 * Build a new row-major matrix containing only the rows named in
 * `indices`, preserving their order.
 */
function _subsetX(X, indices) {
  const source = X.data
  const nCols = X.cols
  const nRows = indices.length
  const target = new Float64Array(nRows * nCols)
  let r = 0
  while (r < nRows) {
    const from = indices[r] * nCols
    target.set(source.subarray(from, from + nCols), r * nCols)
    r += 1
  }
  return { data: target, rows: nRows, cols: nCols }
}
|
|
89
|
+
|
|
90
|
+
/**
 * Pick y[indices[0]], y[indices[1]], ... into a new container built
 * with y's own constructor.
 */
function _subsetY(y, indices) {
  const Ctor = y.constructor
  const selected = new Ctor(indices.length)
  let pos = 0
  while (pos < indices.length) {
    selected[pos] = y[indices[pos]]
    pos += 1
  }
  return selected
}
|
package/src/selection.js
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
import { getScorer, normalizeY, ValidationError } from '@wlearn/core'
|
|
2
|
+
import { optimizeWeights } from './weights.js'
|
|
3
|
+
|
|
4
|
+
/**
 * Caruana greedy ensemble selection (Caruana et al., 2004).
 *
 * Selects a weighted subset from a pool of OOF predictions by greedily
 * adding the candidate that most improves the ensemble score at each step.
 * Candidates can be selected multiple times (with replacement).
 *
 * For classification the ensemble is scored on hard predictions, i.e. the
 * argmax COLUMN INDEX of the averaged probabilities. The true labels are
 * therefore translated to column indices before scoring, assuming column c
 * corresponds to the c-th entry of `classes` (or, when `classes` is omitted,
 * the c-th smallest distinct label in yTrue).
 *
 * @param {Float64Array[]} oofPredictions - per-candidate OOF predictions
 *   Classification: each is n_samples * n_classes (flat row-major proba)
 *   Regression: each is n_samples
 * @param {TypedArray|number[]} yTrue - true labels
 * @param {Object} opts
 * @param {number} [opts.maxSize=20] - number of greedy picks
 * @param {string} [opts.scoring='accuracy'] - scorer name passed to getScorer (higher is better)
 * @param {string} [opts.task='classification'] - 'classification' or 'regression'
 * @param {number} [opts.nClasses=0] - number of probability columns; 0 = infer from prediction size
 * @param {boolean} [opts.refineWeights=true] - if true, optimize weights after selection
 * @param {ArrayLike<number>|null} [opts.classes=null] - class label of each probability column
 * @returns {{ indices: Int32Array, weights: Float64Array, scores: Float64Array }}
 * @throws {ValidationError} on an empty pool, inconsistent prediction lengths,
 *   a `classes` list that does not match the data, or when no candidate
 *   yields a comparable score
 */
export function caruanaSelect(oofPredictions, yTrue, {
  maxSize = 20,
  scoring = 'accuracy',
  task = 'classification',
  nClasses = 0,
  refineWeights = true,
  classes = null,
} = {}) {
  const yn = normalizeY(yTrue)
  const n = yn.length
  const nCandidates = oofPredictions.length

  if (nCandidates === 0) {
    throw new ValidationError('caruanaSelect: need at least 1 candidate')
  }

  const scorerFn = getScorer(scoring)

  // Determine prediction size per sample
  const predLength = oofPredictions[0].length
  const predSize = predLength / n
  if (predSize !== Math.floor(predSize)) {
    throw new ValidationError('caruanaSelect: oofPredictions[0].length must be divisible by n')
  }
  // Candidates are averaged element-wise, so they must all be the same size
  for (let i = 1; i < nCandidates; i++) {
    if (oofPredictions[i].length !== predLength) {
      throw new ValidationError(
        `caruanaSelect: oofPredictions[${i}].length must equal oofPredictions[0].length`
      )
    }
  }

  const isClassification = task === 'classification'
  const classCount = isClassification && nClasses === 0 ? predSize : nClasses

  // Hard predictions are column indices; score them against index-encoded labels
  const yScore = isClassification ? _encodeLabels(yn, classes, classCount) : yn

  // Current ensemble prediction (running weighted average)
  const current = new Float64Array(predLength)
  const selected = []
  const scores = []

  for (let t = 0; t < maxSize; t++) {
    let bestIdx = -1
    let bestScore = -Infinity

    for (let i = 0; i < nCandidates; i++) {
      // Trial: ((t) * current + P[i]) / (t + 1)
      const trial = _trialPredictions(current, oofPredictions[i], t, t + 1)
      const trialScore = _score(trial, yScore, scorerFn, task, classCount, n)
      if (trialScore > bestScore) {
        bestScore = trialScore
        bestIdx = i
      }
    }

    if (bestIdx === -1) {
      throw new ValidationError(
        'caruanaSelect: no candidate produced a comparable score (all NaN or -Infinity)'
      )
    }

    selected.push(bestIdx)
    scores.push(bestScore)

    // Update running ensemble: current = (t * current + P[bestIdx]) / (t + 1)
    const P = oofPredictions[bestIdx]
    for (let j = 0; j < current.length; j++) {
      current[j] = (t * current[j] + P[j]) / (t + 1)
    }
  }

  // Compute weights from selection counts
  const counts = new Map()
  for (const idx of selected) {
    counts.set(idx, (counts.get(idx) || 0) + 1)
  }
  const uniqueIndices = new Int32Array([...counts.keys()].sort((a, b) => a - b))
  const weights = new Float64Array(uniqueIndices.length)
  for (let i = 0; i < uniqueIndices.length; i++) {
    // Divide by the picks actually made so the weights always sum to 1
    weights[i] = counts.get(uniqueIndices[i]) / selected.length
  }

  const result = {
    indices: uniqueIndices,
    weights,
    scores: new Float64Array(scores),
  }

  if (refineWeights && uniqueIndices.length > 1) {
    const selectedOofs = Array.from(uniqueIndices, idx => oofPredictions[idx])
    result.weights = optimizeWeights(selectedOofs, yn, weights, { task })
  }

  return result
}

/**
 * Translate true labels to probability-column indices.
 * Returns `yn` itself when no translation is needed (labels already are
 * 0..nClasses-1) or when the column order cannot be inferred because yTrue
 * does not contain exactly nClasses distinct labels and no `classes` list
 * was given.
 */
function _encodeLabels(yn, classes, nClasses) {
  let labelOrder
  if (classes != null) {
    if (classes.length !== nClasses) {
      throw new ValidationError('caruanaSelect: classes.length must equal the number of classes')
    }
    labelOrder = Array.from(classes)
  } else {
    labelOrder = [...new Set(Array.from(yn))].sort((a, b) => a - b)
    if (labelOrder.length !== nClasses) return yn
  }

  if (labelOrder.every((label, idx) => label === idx)) return yn

  const columnOf = new Map(labelOrder.map((label, idx) => [label, idx]))
  const encoded = new Float64Array(yn.length)
  for (let i = 0; i < yn.length; i++) {
    const column = columnOf.get(yn[i])
    if (column === undefined) {
      throw new ValidationError(`caruanaSelect: label ${yn[i]} is not present in classes`)
    }
    encoded[i] = column
  }
  return encoded
}
|
|
99
|
+
|
|
100
|
+
// --- Internal helpers ---
|
|
101
|
+
|
|
102
|
+
/**
 * Blend `candidate` into the running average `current`:
 * (tCount * current + candidate) / tTotal, element-wise, into a new array.
 */
function _trialPredictions(current, candidate, tCount, tTotal) {
  return Float64Array.from(
    current,
    (value, j) => (tCount * value + candidate[j]) / tTotal
  )
}
|
|
109
|
+
|
|
110
|
+
/**
 * Score ensemble predictions with `scorerFn(yTrue, predictions)`.
 * Regression passes `preds` through unchanged. Otherwise each row of the
 * flat row-major probability matrix is reduced to the index of its largest
 * column (first one wins on ties) and those indices are scored.
 */
function _score(preds, yTrue, scorerFn, task, nClasses, n) {
  if (task === 'regression') return scorerFn(yTrue, preds)

  const winners = new Float64Array(n)
  for (let row = 0; row < n; row++) {
    const base = row * nClasses
    let winner = 0
    let top = -Infinity
    for (let c = 0; c < nClasses; c++) {
      const value = preds[base + c]
      if (value > top) {
        top = value
        winner = c
      }
    }
    winners[row] = winner
  }
  return scorerFn(yTrue, winners)
}
|