datly 0.0.2 → 0.0.3

package/src/code.js ADDED
@@ -0,0 +1,2466 @@
1
+ // datly.js — functional, text-first data-science toolkit for JavaScript
2
+ // design goals:
3
+ // - functional api (only functions)
4
+ // - every public function returns lowercase, human-readable structured TEXT
5
+ // - dataframe is a plain object; models are serialized as text (json string) and consumed by other funcs
6
+
7
+ // =========================
8
+ // helpers: types, math, formatting
9
+ // =========================
10
+
11
+ // =========================
12
+ // TEXT FORMATTING (tabular output)
13
+ // =========================
14
+
15
+ const _toTable = (data, opts = {}) => {
16
+ const maxWidth = opts.max_width ?? 80;
17
+ const padding = opts.padding ?? 2;
18
+
19
+ if (Array.isArray(data) && data.length > 0 && typeof data[0] === "object") {
20
+ // array of objects -> table
21
+ const keys = Object.keys(data[0]);
22
+ const rows = data.map((obj) => keys.map((k) => String(obj[k] ?? "")));
23
+ const headers = keys;
24
+
25
+ // calculate column widths
26
+ const widths = headers.map((h, i) => {
27
+ const maxContentWidth = Math.max(
28
+ h.length,
29
+ ...rows.map((r) => r[i].length)
30
+ );
31
+ return Math.min(maxContentWidth + padding, maxWidth / keys.length);
32
+ });
33
+
34
+ // build table
35
+ const separator = "+" + widths.map((w) => "-".repeat(w)).join("+") + "+";
36
+ const headerRow =
37
+ "|" + headers.map((h, i) => h.padEnd(widths[i])).join("|") + "|";
38
+ const dataRows = rows.map(
39
+ (row) =>
40
+ "|" +
41
+ row
42
+ .map((cell, i) => cell.slice(0, widths[i]).padEnd(widths[i]))
43
+ .join("|") +
44
+ "|"
45
+ );
46
+
47
+ return [separator, headerRow, separator, ...dataRows, separator].join("\n");
48
+ }
49
+
50
+ if (typeof data === "object" && !Array.isArray(data)) {
51
+ // single object -> key-value table
52
+ const entries = Object.entries(data).map(([k, v]) => ({
53
+ key: String(k),
54
+ value: typeof v === "object" ? JSON.stringify(v) : String(v),
55
+ }));
56
+ return _toTable(entries, opts);
57
+ }
58
+
59
+ return String(data);
60
+ };
61
+
62
+ const _textTable = (obj, opts = {}) => {
63
+ const format = opts.format ?? "auto"; // 'auto', 'yaml', 'table'
64
+
65
+ // check if we should use table format
66
+ const shouldTable =
67
+ format === "table" ||
68
+ (format === "auto" &&
69
+ obj.type &&
70
+ [
71
+ "split",
72
+ "prediction",
73
+ "metric",
74
+ "describe",
75
+ "missing_report",
76
+ "eda",
77
+ ].includes(obj.type));
78
+
79
+ if (!shouldTable) {
80
+ return _text(obj);
81
+ }
82
+
83
+ // format specific types as tables
84
+ if (obj.type === "split") {
85
+ const info = [
86
+ { metric: "train_size", value: obj.sizes?.train ?? 0 },
87
+ { metric: "test_size", value: obj.sizes?.test ?? 0 },
88
+ ];
89
+ let output = "type: split\n\n" + _toTable(info);
90
+
91
+ if (obj.indices) {
92
+ output +=
93
+ "\n\ntrain_indices: [" + (obj.indices.train || []).join(", ") + "]";
94
+ output += "\ntest_indices: [" + (obj.indices.test || []).join(", ") + "]";
95
+ }
96
+
97
+ if (obj.preview?.x_train && obj.preview.x_train.length > 0) {
98
+ output += "\n\nx_train (preview):";
99
+ const preview = obj.preview.x_train.map((row, i) => {
100
+ const rowObj = { row: i };
101
+ row.forEach((val, j) => {
102
+ rowObj[`f${j}`] = val;
103
+ });
104
+ return rowObj;
105
+ });
106
+ output += "\n" + _toTable(preview);
107
+ }
108
+
109
+ if (obj.preview?.y_train && obj.preview.y_train.length > 0) {
110
+ output +=
111
+ "\n\ny_train (preview): [" + obj.preview.y_train.join(", ") + "]";
112
+ }
113
+
114
+ return output.toLowerCase();
115
+ }
116
+
117
+ if (obj.type === "prediction") {
118
+ if (obj.predictions && obj.predictions.length <= 20) {
119
+ const table = obj.predictions.map((pred, i) => ({
120
+ index: i,
121
+ prediction: pred,
122
+ }));
123
+ let output = `type: prediction\nmodel: ${obj.name || "unknown"}\n\n`;
124
+ output += _toTable(table);
125
+ return output.toLowerCase();
126
+ }
127
+ }
128
+
129
+ if (obj.type === "metric" && obj.confusion_matrix) {
130
+ const cm = obj.confusion_matrix;
131
+ const metrics = [
132
+ { metric: "accuracy", value: obj.accuracy },
133
+ { metric: "precision", value: obj.precision },
134
+ { metric: "recall", value: obj.recall },
135
+ { metric: "f1", value: obj.f1 },
136
+ ];
137
+ let output = `type: ${obj.type}\nname: ${obj.name}\n\n`;
138
+ output += "confusion matrix:\n";
139
+ output += _toTable([
140
+ { "": "predicted_0", predicted_1: "" },
141
+ { "": `actual_0: ${cm.tn}`, predicted_1: cm.fp },
142
+ { "": `actual_1: ${cm.fn}`, predicted_1: cm.tp },
143
+ ]);
144
+ output += "\n\nmetrics:\n" + _toTable(metrics);
145
+ return output.toLowerCase();
146
+ }
147
+
148
+ if (obj.type === "describe" && obj.columns) {
149
+ const cols = Object.keys(obj.columns);
150
+ const table = cols.map((col) => {
151
+ const info = obj.columns[col];
152
+ const row = {
153
+ column: col,
154
+ dtype: info.dtype,
155
+ count: info.count,
156
+ missing: info.missing,
157
+ };
158
+ if (info.dtype === "number") {
159
+ row.mean = info.mean?.toPrecision(4);
160
+ row.std = info.std?.toPrecision(4);
161
+ row.min = info.min?.toPrecision(4);
162
+ row.median = info.median?.toPrecision(4);
163
+ row.max = info.max?.toPrecision(4);
164
+ } else if (info.unique) {
165
+ row.unique = info.unique;
166
+ }
167
+ return row;
168
+ });
169
+ return "type: describe\n\n" + _toTable(table).toLowerCase();
170
+ }
171
+
172
+ if (obj.type === "missing_report" && obj.rows) {
173
+ return "type: missing_report\n\n" + _toTable(obj.rows).toLowerCase();
174
+ }
175
+
176
+ // default to yaml format
177
+ return _text(obj);
178
+ };
179
+
180
+ const _isNumber = (v) => typeof v === "number" && Number.isFinite(v);
181
+ const _toNum = (v) => (v == null || v === "" ? NaN : Number(v));
182
+ const _clone = (o) => JSON.parse(JSON.stringify(o));
183
+
184
+ const _flatten = (arr) => arr.reduce((a, b) => a.concat(b), []);
185
+
186
+ const _numeric = (arr) => arr.map(_toNum).filter((x) => Number.isFinite(x));
187
+ const _uniq = (arr) => Array.from(new Set(arr));
188
+
189
+ const _sum = (arr) => _numeric(arr).reduce((a, b) => a + b, 0);
190
+ const _mean = (arr) => {
191
+ const x = _numeric(arr);
192
+ const n = x.length;
193
+ if (!n) return NaN;
194
+ return _sum(x) / n;
195
+ };
196
+ const _variance = (arr, sample = true) => {
197
+ const x = _numeric(arr);
198
+ const n = x.length;
199
+ if (n < 2) return NaN;
200
+ const m = _mean(x);
201
+ const s = x.reduce((a, b) => a + (b - m) ** 2, 0);
202
+ return s / (sample ? n - 1 : n);
203
+ };
204
+ const _std = (arr, sample = true) => Math.sqrt(_variance(arr, sample));
205
+ const _min = (arr) => Math.min(..._numeric(arr));
206
+ const _max = (arr) => Math.max(..._numeric(arr));
207
+ const _median = (arr) => {
208
+ const x = _numeric(arr).sort((a, b) => a - b);
209
+ const n = x.length;
210
+ if (!n) return NaN;
211
+ const mid = Math.floor(n / 2);
212
+ return n % 2 ? x[mid] : (x[mid - 1] + x[mid]) / 2;
213
+ };
214
+ const _quantile = (arr, q) => {
215
+ const x = _numeric(arr).sort((a, b) => a - b);
216
+ const n = x.length;
217
+ if (!n) return NaN;
218
+ const pos = (n - 1) * q;
219
+ const base = Math.floor(pos);
220
+ const rest = pos - base;
221
+ return x[base] + (x[Math.min(base + 1, n - 1)] - x[base]) * rest;
222
+ };
223
+ const _skewness = (arr) => {
224
+ const x = _numeric(arr);
225
+ const n = x.length;
226
+ if (n < 3) return NaN;
227
+ const m = _mean(x);
228
+ const s = _std(x, true);
229
+ const m3 = x.reduce((a, b) => a + (b - m) ** 3, 0) / n;
230
+ return ((m3 / s ** 3) * Math.sqrt(n * (n - 1))) / (n - 2);
231
+ };
232
+ const _kurtosis = (arr) => {
233
+ const x = _numeric(arr);
234
+ const n = x.length;
235
+ if (n < 4) return NaN;
236
+ const m = _mean(x);
237
+ const s2 = _variance(x, true);
238
+ const m4 = x.reduce((a, b) => a + (b - m) ** 4, 0) / n;
239
+ const g2 = m4 / s2 ** 2 - 3;
240
+ return g2;
241
+ };
242
+
243
+ const _corrPearson = (x, y) => {
244
+ const a = _numeric(x),
245
+ b = _numeric(y);
246
+ const n = Math.min(a.length, b.length);
247
+ if (n < 2) return NaN;
248
+ const ax = a.slice(0, n),
249
+ by = b.slice(0, n);
250
+ const mx = _mean(ax),
251
+ my = _mean(by);
252
+ let num = 0,
253
+ dx = 0,
254
+ dy = 0;
255
+ for (let i = 0; i < n; i++) {
256
+ const vx = ax[i] - mx,
257
+ vy = by[i] - my;
258
+ num += vx * vy;
259
+ dx += vx * vx;
260
+ dy += vy * vy;
261
+ }
262
+ return num / Math.sqrt(dx * dy);
263
+ };
264
+
265
+ const _rank = (arr) => {
266
+ const indexed = _numeric(arr)
267
+ .map((v, i) => ({ v, i }))
268
+ .sort((a, b) => a.v - b.v);
269
+ const ranks = Array(arr.length).fill(NaN);
270
+ let i = 0;
271
+ while (i < indexed.length) {
272
+ let j = i;
273
+ while (j + 1 < indexed.length && indexed[j + 1].v === indexed[i].v) j++;
274
+ const r = (i + j) / 2 + 1;
275
+ for (let k = i; k <= j; k++) ranks[indexed[k].i] = r;
276
+ i = j + 1;
277
+ }
278
+ return ranks.filter(Number.isFinite);
279
+ };
280
+
281
+ const _corrSpearman = (x, y) => _corrPearson(_rank(x), _rank(y));
282
+
283
+ const _invErf = (x) => {
284
+ // numerical approx of inverse error function (for normal quantile)
285
+ const a = 0.147;
286
+ const ln = Math.log(1 - x * x);
287
+ const t = 2 / (Math.PI * a) + ln / 2;
288
+ const s = Math.sign(x) * Math.sqrt(Math.sqrt(t * t - ln / a) - t);
289
+ return s;
290
+ };
291
+
292
+ // standard normal pdf/cdf/ppf
293
+ const _phi = (z) => Math.exp(-0.5 * z * z) / Math.sqrt(2 * Math.PI);
294
+ const _Phi = (z) => 0.5 * (1 + erf(z / Math.SQRT2));
295
+ const erf = (x) => {
296
+ // numerical approx for erf
297
+ const sign = Math.sign(x);
298
+ x = Math.abs(x);
299
+ const a1 = 0.254829592,
300
+ a2 = -0.284496736,
301
+ a3 = 1.421413741,
302
+ a4 = -1.453152027,
303
+ a5 = 1.061405429,
304
+ p = 0.3275911;
305
+ const t = 1 / (1 + p * x);
306
+ const y =
307
+ 1 - ((((a5 * t + a4) * t + a3) * t + a2) * t + a1) * t * Math.exp(-x * x);
308
+ return sign * y;
309
+ };
310
+ const _normInv = (p) => {
311
+ if (p <= 0 || p >= 1) return NaN;
312
+ return Math.SQRT2 * _invErf(2 * p - 1);
313
+ };
314
+
315
+ // lowercased text output
316
+ const _text = (obj) => {
317
+ const lowerKeys = (o) =>
318
+ Array.isArray(o)
319
+ ? o.map(lowerKeys)
320
+ : o && typeof o === "object"
321
+ ? Object.fromEntries(
322
+ Object.entries(o).map(([k, v]) => [
323
+ String(k).toLowerCase(),
324
+ lowerKeys(v),
325
+ ])
326
+ )
327
+ : typeof o === "number" && Number.isFinite(o)
328
+ ? Number(Number(o).toPrecision(12))
329
+ : o;
330
+ const normalized = lowerKeys(obj);
331
+ const lines = [];
332
+ const walk = (o, indent = 0) => {
333
+ const pad = " ".repeat(indent);
334
+ if (Array.isArray(o)) {
335
+ lines.push(pad + "- list:");
336
+ o.forEach((v) => walk(v, indent + 2));
337
+ } else if (o && typeof o === "object") {
338
+ Object.keys(o).forEach((k) => {
339
+ const v = o[k];
340
+ if (v && typeof v === "object") {
341
+ lines.push(pad + k + ":");
342
+ walk(v, indent + 2);
343
+ } else {
344
+ lines.push(pad + k + ": " + String(v).toLowerCase());
345
+ }
346
+ });
347
+ } else {
348
+ lines.push(pad + String(o).toLowerCase());
349
+ }
350
+ };
351
+ walk(normalized);
352
+ return lines.join("\n");
353
+ };
354
+
355
+ const _ok = (type, payload) => _text({ type, ...payload });
356
+ const _err = (type, message) => _text({ type, error: message });
357
+
358
+ // =========================
359
+ // dataframe
360
+ // =========================
361
+
362
+ const dataframe_from_json = (input) => {
363
+ try {
364
+ const rows = Array.isArray(input)
365
+ ? input
366
+ : typeof input === "object"
367
+ ? [input]
368
+ : [];
369
+ const columns = _uniq(_flatten(rows.map((r) => Object.keys(r))));
370
+ const data = rows.map((r) => columns.map((c) => r[c] ?? null));
371
+ const dtypes = columns.map((c, j) => {
372
+ const col = data.map((row) => row[j]).filter((v) => v != null);
373
+ const nums = col.map(_toNum).filter(Number.isFinite);
374
+ if (nums.length === col.length) return "number";
375
+ if (col.every((v) => typeof v === "boolean")) return "boolean";
376
+ return "string";
377
+ });
378
+ return _ok("dataframe", {
379
+ columns,
380
+ n_rows: data.length,
381
+ n_cols: columns.length,
382
+ dtypes,
383
+ preview: rows.slice(0, 5),
384
+ });
385
+ } catch (e) {
386
+ return _err("dataframe", "invalid input");
387
+ }
388
+ };
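+
+ // usage sketch (illustrative; the rows literal below is a made-up example, not from the package docs):
+ // dataframe_from_json returns lowercase structured text describing the inferred frame,
+ // with columns, n_rows, n_cols, dtypes and a 5-row preview.
+ // dataframe_from_json([{ age: 31, city: "oslo" }, { age: 45, city: "lima" }]);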
389
+
390
+ const _df_parse = (df_text) => {
391
+   // intentionally a stub: the text produced by dataframe_from_json is not rehydrated here;
+   // the dataframe operations below (df_describe, df_corr, ...) take raw rows (an array of objects)
+   return null;
400
+ };
401
+
402
+ // utilities to operate directly on raw rows (array of objects)
403
+ const df_describe = (rows) => {
404
+ if (!Array.isArray(rows) || !rows.length)
405
+ return _err("describe", "empty data");
406
+ const cols = _uniq(_flatten(rows.map((r) => Object.keys(r))));
407
+ const out = { type: "describe", columns: {} };
408
+ cols.forEach((c) => {
409
+ const col = rows.map((r) => r[c]);
410
+ const nums = col.map(_toNum).filter(Number.isFinite);
411
+ const miss = col.filter(
412
+ (v) =>
413
+ v == null || (typeof v === "number" && !Number.isFinite(v)) || v === ""
414
+ ).length;
415
+ const dtype =
416
+ nums.length === col.length
417
+ ? "number"
418
+ : col.every((v) => typeof v === "boolean")
419
+ ? "boolean"
420
+ : "string";
421
+ const info = { dtype, count: col.length, missing: miss };
422
+ if (dtype === "number") {
423
+ info.mean = _mean(nums);
424
+ info.std = _std(nums);
425
+ info.min = _min(nums);
426
+ info.q1 = _quantile(nums, 0.25);
427
+ info.median = _median(nums);
428
+ info.q3 = _quantile(nums, 0.75);
429
+ info.max = _max(nums);
430
+ info.skewness = _skewness(nums);
431
+ info.kurtosis = _kurtosis(nums);
432
+ } else if (dtype === "string" || dtype === "boolean") {
433
+ const vc = {};
434
+ col.forEach((v) => {
435
+ const key = String(v);
436
+ vc[key] = (vc[key] || 0) + 1;
437
+ });
438
+ const entries = Object.entries(vc)
439
+ .sort((a, b) => b[1] - a[1])
440
+ .slice(0, 10);
441
+ info.top = entries.map(([k, v]) => ({ value: k, freq: v }));
442
+ info.unique = Object.keys(vc).length;
443
+ }
444
+ out.columns[c] = info;
445
+ });
446
+ return _text(out);
447
+ };
448
+
449
+ const df_missing_report = (rows) => {
450
+ if (!Array.isArray(rows) || !rows.length)
451
+ return _err("missing_report", "empty data");
452
+ const cols = _uniq(_flatten(rows.map((r) => Object.keys(r))));
453
+ const res = cols.map((c) => {
454
+ const col = rows.map((r) => r[c]);
455
+ const miss = col.filter((v) => v == null || v === "").length;
456
+ return { column: c, missing: miss, missing_rate: miss / col.length };
457
+ });
458
+ return _text({ type: "missing_report", rows: res });
459
+ };
460
+
461
+ const df_corr = (rows, method = "pearson") => {
462
+ const cols = _uniq(_flatten(rows.map((r) => Object.keys(r))));
463
+ const numericCols = cols.filter((c) =>
464
+ rows.every((r) => Number.isFinite(_toNum(r[c])) || r[c] == null)
465
+ );
466
+ const mat = {};
467
+ numericCols.forEach((a) => {
468
+ mat[a] = {};
469
+ const xa = rows.map((r) => _toNum(r[a]));
470
+ numericCols.forEach((b) => {
471
+ const xb = rows.map((r) => _toNum(r[b]));
472
+ const c =
473
+ method === "spearman" ? _corrSpearman(xa, xb) : _corrPearson(xa, xb);
474
+ mat[a][b] = c;
475
+ });
476
+ });
477
+ return _text({ type: "correlation_matrix", method, matrix: mat });
478
+ };
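+
+ // usage sketch (illustrative example rows): the df_* helpers operate on raw rows
+ // (an array of objects), not on the text returned by dataframe_from_json.
+ // const rows = [{ x: 1, y: 2 }, { x: 2, y: 4 }, { x: 3, y: 7 }];
+ // df_describe(rows);         // per-column stats as lowercase text
+ // df_missing_report(rows);   // missing counts and rates per column
+ // df_corr(rows, "spearman"); // full correlation matrix as text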
479
+
480
+ // =========================
481
+ // core statistics (public)
482
+ // =========================
483
+
484
+ const mean = (arr) =>
485
+ _ok("statistic", {
486
+ name: "mean",
487
+ n: _numeric(arr).length,
488
+ value: _mean(arr),
489
+ });
490
+ const stddeviation = (arr, sample = true) =>
491
+ _ok("statistic", {
492
+ name: "std_deviation",
493
+ sample,
494
+ n: _numeric(arr).length,
495
+ value: _std(arr, sample),
496
+ });
497
+ const variance = (arr, sample = true) =>
498
+ _ok("statistic", {
499
+ name: "variance",
500
+ sample,
501
+ n: _numeric(arr).length,
502
+ value: _variance(arr, sample),
503
+ });
504
+ const median = (arr) =>
505
+ _ok("statistic", {
506
+ name: "median",
507
+ n: _numeric(arr).length,
508
+ value: _median(arr),
509
+ });
510
+ const quantile = (arr, q) =>
511
+ _ok("statistic", {
512
+ name: "quantile",
513
+ q,
514
+ n: _numeric(arr).length,
515
+ value: _quantile(arr, q),
516
+ });
517
+ const minv = (arr) => _ok("statistic", { name: "min", value: _min(arr) });
518
+ const maxv = (arr) => _ok("statistic", { name: "max", value: _max(arr) });
519
+ const skewness = (arr) =>
520
+ _ok("statistic", { name: "skewness", value: _skewness(arr) });
521
+ const kurtosis = (arr) =>
522
+ _ok("statistic", { name: "kurtosis", value: _kurtosis(arr) });
523
+ const corr_pearson = (x, y) =>
524
+ _ok("statistic", { name: "pearson_correlation", value: _corrPearson(x, y) });
525
+ const corr_spearman = (x, y) =>
526
+ _ok("statistic", {
527
+ name: "spearman_correlation",
528
+ value: _corrSpearman(x, y),
529
+ });
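+
+ // usage sketch (illustrative values): each statistic returns lowercase text, e.g.
+ // mean([1, 2, 3, 4]);                  // "... name: mean ... value: 2.5"
+ // quantile([1, 2, 3, 4], 0.25);        // linearly interpolated first quartile
+ // corr_pearson([1, 2, 3], [2, 4, 6]);  // value: 1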
530
+
531
+ // =========================
532
+ // probability distributions
533
+ // =========================
534
+
535
+ const normal_pdf = (x, mu = 0, sigma = 1) =>
536
+ _ok("distribution", {
537
+ name: "normal_pdf",
538
+ params: { mu, sigma },
539
+ value: Array.isArray(x)
540
+ ? x.map((v) => _phi((v - mu) / sigma) / sigma)
541
+ : _phi((x - mu) / sigma) / sigma,
542
+ });
543
+ const normal_cdf = (x, mu = 0, sigma = 1) =>
544
+ _ok("distribution", {
545
+ name: "normal_cdf",
546
+ params: { mu, sigma },
547
+ value: Array.isArray(x)
548
+ ? x.map((v) => _Phi((v - mu) / sigma))
549
+ : _Phi((x - mu) / sigma),
550
+ });
551
+ const normal_ppf = (p, mu = 0, sigma = 1) =>
552
+ _ok("distribution", {
553
+ name: "normal_ppf",
554
+ params: { mu, sigma },
555
+ value: Array.isArray(p)
556
+ ? p.map((q) => mu + sigma * _normInv(q))
557
+ : mu + sigma * _normInv(p),
558
+ });
559
+
560
+ const binomial_pmf = (k, n, p) => {
561
+ const C = (n, k) => {
562
+ if (k < 0 || k > n) return 0;
563
+ k = Math.min(k, n - k);
564
+ let num = 1,
565
+ den = 1;
566
+ for (let i = 1; i <= k; i++) {
567
+ num *= n - (k - i);
568
+ den *= i;
569
+ }
570
+ return num / den;
571
+ };
572
+ const f = (x) => C(n, x) * p ** x * (1 - p) ** (n - x);
573
+ const val = Array.isArray(k) ? k.map(f) : f(k);
574
+ return _ok("distribution", {
575
+ name: "binomial_pmf",
576
+ params: { n, p },
577
+ value: val,
578
+ });
579
+ };
580
+
581
+ const binomial_cdf = (k, n, p) => {
582
584
+ const f = (t) => {
585
+ let s = 0;
586
+ for (let i = 0; i <= t; i++) {
587
+ s +=
588
+ (function C(n, k) {
589
+ if (k < 0 || k > n) return 0;
590
+ k = Math.min(k, n - k);
591
+ let num = 1,
592
+ den = 1;
593
+ for (let j = 1; j <= k; j++) {
594
+ num *= n - (k - j);
595
+ den *= j;
596
+ }
597
+ return num / den;
598
+ })(n, i) *
599
+ p ** i *
600
+ (1 - p) ** (n - i);
601
+ }
602
+ return s;
603
+ };
604
+ const val = Array.isArray(k) ? k.map(f) : f(k);
605
+ return _ok("distribution", {
606
+ name: "binomial_cdf",
607
+ params: { n, p },
608
+ value: val,
609
+ });
610
+ };
611
+
612
+ const poisson_pmf = (k, lambda) => {
613
+ const fact = (m) => {
614
+ let r = 1;
615
+ for (let i = 2; i <= m; i++) r *= i;
616
+ return r;
617
+ };
618
+ const f = (x) => (Math.exp(-lambda) * lambda ** x) / fact(x);
619
+ return _ok("distribution", {
620
+ name: "poisson_pmf",
621
+ params: { lambda },
622
+ value: Array.isArray(k) ? k.map(f) : f(k),
623
+ });
624
+ };
625
+
626
+ const poisson_cdf = (k, lambda) => {
627
+ const f = (t) => {
628
+ let s = 0;
629
+ for (let i = 0; i <= t; i++)
630
+ s +=
631
+ (Math.exp(-lambda) * lambda ** i) /
632
+ (function fact(m) {
633
+ let r = 1;
634
+ for (let j = 2; j <= m; j++) r *= j;
635
+ return r;
636
+ })(i);
637
+ return s;
638
+ };
639
+ return _ok("distribution", {
640
+ name: "poisson_cdf",
641
+ params: { lambda },
642
+ value: Array.isArray(k) ? k.map(f) : f(k),
643
+ });
644
+ };
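+
+ // usage sketch (illustrative): x / k may be a scalar or an array.
+ // normal_pdf(0);                   // value ≈ 0.3989
+ // normal_cdf([-1.96, 0, 1.96]);    // values ≈ [0.025, 0.5, 0.975]
+ // binomial_pmf(3, 10, 0.5);        // value ≈ 0.1172
+ // poisson_cdf(2, 1.5);             // p(k <= 2) with lambda = 1.5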
645
+
646
+ // =========================
647
+ // hypothesis tests
648
+ // =========================
649
+
650
+ const _tCDF = (t, df) => {
651
+ // symmetric; use relationship with regularized incomplete beta (approx via numerical integration)
652
+ const a = df / 2,
653
+ b = 0.5;
654
+ const x = df / (df + t * t);
655
+ const betacf = (a, b, x) => {
656
+ const itmax = 200,
657
+ eps = 3e-7;
658
+ let am = 1,
659
+ bm = 1,
660
+ az = 1,
661
+ qab = a + b,
662
+ qap = a + 1,
663
+ qam = a - 1,
664
+ bz = 1 - (qab * x) / qap;
665
+ let aold;
666
+ for (let m = 1; m <= itmax; m++) {
667
+ const em = m,
668
+ tem = em + em;
669
+ let d = (em * (b - m) * x) / ((qam + tem) * (a + tem));
670
+ let ap = az + d * am;
671
+ let bp = bz + d * bm;
672
+ d = (-(a + em) * (qab + em) * x) / ((a + tem) * (qap + tem));
673
+ let app = ap + d * az;
674
+ let bpp = bp + d * bz;
675
+ aold = az;
676
+ am = ap / bpp;
677
+ bm = bp / bpp;
678
+ az = app / bpp;
679
+ bz = 1;
680
+ if (Math.abs(az - aold) < eps * Math.abs(az)) return az;
681
+ }
682
+ return az;
683
+ };
684
+ const ib = ((Math.pow(x, a) * Math.pow(1 - x, b)) / a) * betacf(a, b, x);
685
+ const p = 0.5 * ib;
686
+ return t >= 0 ? 1 - p : p;
687
+ };
688
+
689
+ const t_test_independent = (a, b, equal_var = true) => {
690
+ const xa = _numeric(a),
691
+ xb = _numeric(b);
692
+ const na = xa.length,
693
+ nb = xb.length;
694
+ if (na < 2 || nb < 2) return _err("t_test_independent", "insufficient data");
695
+ const ma = _mean(xa),
696
+ mb = _mean(xb),
697
+ va = _variance(xa, true),
698
+ vb = _variance(xb, true);
699
+ let df, se;
700
+ if (equal_var) {
701
+ const sp2 = ((na - 1) * va + (nb - 1) * vb) / (na + nb - 2);
702
+ se = Math.sqrt(sp2 * (1 / na + 1 / nb));
703
+ df = na + nb - 2;
704
+ } else {
705
+ se = Math.sqrt(va / na + vb / nb);
706
+ const num = (va / na + vb / nb) ** 2;
707
+ const den = va ** 2 / (na ** 2 * (na - 1)) + vb ** 2 / (nb ** 2 * (nb - 1));
708
+ df = num / den;
709
+ }
710
+ const t = (ma - mb) / se;
711
+ const p = 2 * (1 - _tCDF(Math.abs(t), df));
712
+ return _text({
713
+ type: "hypothesis_test",
714
+ name: "independent_t_test",
715
+ statistic: t,
716
+ df,
717
+ p_value: p,
718
+ means: { group_a: ma, group_b: mb },
719
+ });
720
+ };
721
+
722
+ const z_test_one_sample = (data, mu0 = 0, sigma = null, alpha = 0.05) => {
723
+ const x = _numeric(data);
724
+ const n = x.length;
725
+ if (n < 2) return _err("z_test_one_sample", "insufficient data");
726
+
727
+ const mean = _mean(x);
728
+ const s = sigma ?? _std(x, true);
729
+ const se = s / Math.sqrt(n);
730
+ const z = (mean - mu0) / se;
731
+
732
+   const p = 2 * (1 - _Phi(Math.abs(z))); // numeric helper; normal_cdf returns text, not a number
+   const zcrit = _normInv(1 - alpha / 2);
734
+ const moe = zcrit * se;
735
+
736
+ return _text({
737
+ type: "hypothesis_test",
738
+ name: "one_sample_z_test",
739
+ statistic: z,
740
+ p_value: p,
741
+ ci_lower: mean - moe,
742
+ ci_upper: mean + moe,
743
+ confidence: 1 - alpha,
744
+ extra: {
745
+ sample_mean: mean,
746
+ hypothesized_mean: mu0,
747
+ se,
748
+ sigma_used: s,
749
+ n,
750
+ effect_size: (mean - mu0) / s,
751
+ },
752
+ });
753
+ };
754
+
755
+ const chi_square_independence = (table, alpha = 0.05) => {
756
+ const r = table.length;
757
+ const c = table[0].length;
758
+ const rowS = table.map((row) => row.reduce((a, b) => a + b, 0));
759
+ const colS = Array(c).fill(0);
760
+ table.forEach((row) => row.forEach((v, j) => (colS[j] += v)));
761
+ const N = rowS.reduce((a, b) => a + b, 0);
762
+ let chi = 0;
763
+ const expected = Array.from({ length: r }, (_, i) =>
764
+ Array.from({ length: c }, (_, j) => (rowS[i] * colS[j]) / N)
765
+ );
766
+
767
+ for (let i = 0; i < r; i++) {
768
+ for (let j = 0; j < c; j++) {
769
+ chi += (table[i][j] - expected[i][j]) ** 2 / expected[i][j];
770
+ }
771
+ }
772
+ const df = (r - 1) * (c - 1);
773
+ const p = 1 - chi_square_cdf(chi, df);
774
+
775
+ return _text({
776
+ type: "hypothesis_test",
777
+ name: "chi_square_independence",
778
+ statistic: chi,
779
+ df,
780
+ p_value: p,
781
+ confidence: 1 - alpha,
782
+ extra: {
783
+ observed: table,
784
+ expected,
785
+ dof: df,
786
+ },
787
+ });
788
+ };
789
+
790
+ const anova_oneway = (groups, alpha = 0.05) => {
791
+ const k = groups.length;
792
+ const ns = groups.map((g) => _numeric(g).length);
793
+ const means = groups.map(_mean);
794
+ const overall = _mean(groups.flat());
795
+ const ssb = groups.reduce(
796
+ (s, g, i) => s + ns[i] * (means[i] - overall) ** 2,
797
+ 0
798
+ );
799
+ const ssw = groups.reduce(
800
+ (s, g, i) => s + _numeric(g).reduce((a, x) => a + (x - means[i]) ** 2, 0),
801
+ 0
802
+ );
803
+ const dfb = k - 1;
804
+ const dfw = ns.reduce((a, b) => a + b, 0) - k;
805
+ const msb = ssb / dfb;
806
+ const msw = ssw / dfw;
807
+ const F = msb / msw;
808
+ const p = 1 - f_cdf(F, dfb, dfw);
809
+
810
+ return _text({
811
+ type: "hypothesis_test",
812
+ name: "anova_oneway",
813
+ statistic: F,
814
+ df: { between: dfb, within: dfw },
815
+ p_value: p,
816
+ confidence: 1 - alpha,
817
+ extra: {
818
+ group_means: means,
819
+ grand_mean: overall,
820
+ ssb,
821
+ ssw,
822
+ },
823
+ });
824
+ };
825
+
826
+ // =========================
827
+ // machine learning (linear regression, logistic regression)
828
+ // models are serialized as lowercase json text strings
829
+ // =========================
830
+
831
+ const _addBias = (X) => X.map((row) => [1, ...row]);
832
+ const _transpose = (A) => A[0].map((_, j) => A.map((row) => row[j]));
833
+ const _dot = (A, B) => {
834
+ const n = A.length,
835
+ m = B[0].length,
836
+ p = B.length;
837
+ const out = Array(n)
838
+ .fill(0)
839
+ .map(() => Array(m).fill(0));
840
+ for (let i = 0; i < n; i++)
841
+ for (let j = 0; j < m; j++) {
842
+ let s = 0;
843
+ for (let k = 0; k < p; k++) s += A[i][k] * B[k][j];
844
+ out[i][j] = s;
845
+ }
846
+ return out;
847
+ };
848
+ const _pinv = (A, lambda = 1e-8) => {
849
+ // ridge-stabilized (A^T A + λI)^-1 A^T
850
+ const At = _transpose(A);
851
+ const AtA = _dot(At, A);
852
+ const n = AtA.length;
853
+ for (let i = 0; i < n; i++) AtA[i][i] += lambda;
854
+ const inv = _inv(AtA);
855
+ return _dot(inv, At);
856
+ };
857
+ const _inv = (M) => {
858
+ const n = M.length;
859
+ const A = M.map((row, i) =>
860
+ row.concat(Array.from({ length: n }, (_, j) => (i === j ? 1 : 0)))
861
+ );
862
+ for (let i = 0; i < n; i++) {
863
+ let pivot = A[i][i];
864
+ let r = i;
865
+ for (let k = i + 1; k < n; k++)
866
+ if (Math.abs(A[k][i]) > Math.abs(pivot)) {
867
+ pivot = A[k][i];
868
+ r = k;
869
+ }
870
+ if (r !== i) {
871
+ const tmp = A[i];
872
+ A[i] = A[r];
873
+ A[r] = tmp;
874
+ }
875
+ const pv = A[i][i];
876
+ if (Math.abs(pv) < 1e-12) continue;
877
+ for (let j = 0; j < 2 * n; j++) A[i][j] /= pv;
878
+ for (let k = 0; k < n; k++)
879
+ if (k !== i) {
880
+ const f = A[k][i];
881
+ for (let j = 0; j < 2 * n; j++) A[k][j] -= f * A[i][j];
882
+ }
883
+ }
884
+ return A.map((row) => row.slice(n));
885
+ };
886
+
887
+ const train_linear_regression = (X, y) => {
888
+ const Xb = _addBias(X);
889
+ const pinv = _pinv(Xb);
890
+ const w = _dot(
891
+ pinv,
892
+ y.map((v) => [v])
893
+ ).map((r) => r[0]);
894
+ const predict = (row) => w[0] + row.reduce((s, v, i) => s + w[i + 1] * v, 0);
895
+ const yhat = X.map(predict);
896
+ const resid = y.map((v, i) => v - yhat[i]);
897
+ const mse = _mean(resid.map((e) => e * e));
898
+ const r2 =
899
+ 1 - _sum(resid.map((e) => e * e)) / _sum(y.map((v) => (v - _mean(y)) ** 2));
900
+ const model = {
901
+ type: "linear_regression",
902
+ weights: w,
903
+ mse,
904
+ r2,
905
+ n: y.length,
906
+ p: X[0]?.length ?? 0,
907
+ };
908
+   // per the design note above, the model itself is serialized as a lowercase json string
+   // so that predict_* can re-parse it via _lowerJson; _text() output is yaml-like, not json
+   return JSON.stringify(_lowerJson(model));
909
+ };
910
+
911
+ const _sigmoid = (z) => 1 / (1 + Math.exp(-z));
912
+
913
+ const train_logistic_regression = (X, y, opts = {}) => {
914
+ const lr = opts.learning_rate ?? 0.1;
915
+ const iters = opts.iterations ?? 1000;
916
+ const lambda = opts.l2 ?? 0;
917
+ const p = X[0]?.length ?? 0;
918
+ let w = Array(p + 1).fill(0);
919
+ const addBias = (row) => [1, ...row];
920
+ const Xb = X.map(addBias);
921
+ for (let t = 0; t < iters; t++) {
922
+ const grad = Array(p + 1).fill(0);
923
+ for (let i = 0; i < Xb.length; i++) {
924
+ const z = w.reduce((s, wi, j) => s + wi * Xb[i][j], 0);
925
+ const p1 = _sigmoid(z);
926
+ const e = p1 - y[i];
927
+ for (let j = 0; j < grad.length; j++) grad[j] += e * Xb[i][j];
928
+ }
929
+ for (let j = 0; j < w.length; j++) {
930
+ grad[j] = grad[j] / Xb.length + lambda * w[j];
931
+ w[j] -= lr * grad[j];
932
+ }
933
+ }
934
+ const predict_proba_row = (row) =>
935
+ _sigmoid(w[0] + row.reduce((s, v, i) => s + w[i + 1] * v, 0));
936
+ const proba = X.map(predict_proba_row);
937
+ const pred = proba.map((p) => (p >= 0.5 ? 1 : 0));
938
+ const acc = pred.filter((v, i) => v === y[i]).length / y.length;
939
+ const model = {
940
+ type: "logistic_regression",
941
+ weights: w,
942
+ accuracy: acc,
943
+ n: y.length,
944
+ p,
945
+ };
946
+   return JSON.stringify(_lowerJson(model)); // json text so predict_logistic can re-parse it
947
+ };
948
+
949
+ const predict_linear = (model_text, X) => {
950
+ try {
951
+     const m = _lowerJson(model_text); // _lowerJson already parses and lowercases; no round-trip needed
952
+ const w = m.weights || m.model?.weights;
953
+ if (!w) return _err("predict_linear", "invalid model");
954
+ const yhat = X.map(
955
+ (row) => w[0] + row.reduce((s, v, i) => s + w[i + 1] * v, 0)
956
+ );
957
+ return _text({
958
+ type: "prediction",
959
+ name: "linear_regression",
960
+ predictions: yhat,
961
+ });
962
+ } catch {
963
+ return _err("predict_linear", "invalid model text");
964
+ }
965
+ };
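+
+ // end-to-end sketch (illustrative data): train, carry the returned model text around,
+ // then hand it back to the matching predict_* function.
+ // const model_text = train_linear_regression([[1], [2], [3], [4]], [2, 4, 6, 8]);
+ // predict_linear(model_text, [[5], [6]]); // predictions ≈ [10, 12]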
966
+
967
+ const predict_logistic = (model_text, X, threshold = 0.5) => {
968
+ try {
969
+     const m = _lowerJson(model_text); // _lowerJson already parses and lowercases; no round-trip needed
970
+ const w = m.weights || m.model?.weights;
971
+ if (!w) return _err("predict_logistic", "invalid model");
972
+ const proba = X.map((row) =>
973
+ _sigmoid(w[0] + row.reduce((s, v, i) => s + w[i + 1] * v, 0))
974
+ );
975
+ const pred = proba.map((p) => (p >= threshold ? 1 : 0));
976
+ return _text({
977
+ type: "prediction",
978
+ name: "logistic_regression",
979
+ threshold,
980
+ probabilities: proba,
981
+ classes: pred,
982
+ });
983
+ } catch {
984
+ return _err("predict_logistic", "invalid model text");
985
+ }
986
+ };
987
+
988
+ const _lowerJson = (textOrObj) => {
989
+ const parse = (v) => {
990
+ try {
991
+ return typeof v === "string" ? JSON.parse(v) : v;
992
+ } catch {
993
+ return {};
994
+ }
995
+ };
996
+ const lower = (o) =>
997
+ Array.isArray(o)
998
+ ? o.map(lower)
999
+ : o && typeof o === "object"
1000
+ ? Object.fromEntries(
1001
+ Object.entries(o).map(([k, v]) => [String(k).toLowerCase(), lower(v)])
1002
+ )
1003
+ : o;
1004
+ return lower(parse(textOrObj));
1005
+ };
1006
+
1007
+ const train_test_split = (X, y, test_size = 0.2, seed = 42) => {
1008
+ const n = X.length;
1009
+ const idx = Array.from({ length: n }, (_, i) => i);
1010
+ let s = seed;
1011
+ const rand = () => (s = (s * 9301 + 49297) % 233280) / 233280;
1012
+   // seeded fisher-yates shuffle (sorting with a random comparator gives a biased shuffle)
+   for (let i = n - 1; i > 0; i--) {
+     const j = Math.floor(rand() * (i + 1));
+     [idx[i], idx[j]] = [idx[j], idx[i]];
+   }
1013
+ const ntest = Math.max(1, Math.floor(n * test_size));
1014
+ const test_idx = idx.slice(0, ntest);
1015
+ const train_idx = idx.slice(ntest);
1016
+ const X_train = train_idx.map((i) => X[i]);
1017
+ const y_train = train_idx.map((i) => y[i]);
1018
+ const X_test = test_idx.map((i) => X[i]);
1019
+ const y_test = test_idx.map((i) => y[i]);
1020
+ return _text({
1021
+ type: "split",
1022
+ sizes: { train: y_train.length, test: y_test.length },
1023
+ indices: { train: train_idx, test: test_idx },
1024
+ preview: { x_train: X_train.slice(0, 2), y_train: y_train.slice(0, 5) },
1025
+ });
1026
+ };
1027
+
1028
+ const metrics_classification = (y_true, y_pred) => {
1029
+ const n = Math.min(y_true.length, y_pred.length);
1030
+ let tp = 0,
1031
+ tn = 0,
1032
+ fp = 0,
1033
+ fn = 0;
1034
+ for (let i = 0; i < n; i++) {
1035
+ if (y_pred[i] === 1 && y_true[i] === 1) tp++;
1036
+ else if (y_pred[i] === 0 && y_true[i] === 0) tn++;
1037
+ else if (y_pred[i] === 1 && y_true[i] === 0) fp++;
1038
+ else if (y_pred[i] === 0 && y_true[i] === 1) fn++;
1039
+ }
1040
+ const accuracy = (tp + tn) / n;
1041
+ const precision = tp + fp ? tp / (tp + fp) : 0;
1042
+ const recall = tp + fn ? tp / (tp + fn) : 0;
1043
+ const f1 =
1044
+ precision + recall ? (2 * precision * recall) / (precision + recall) : 0;
1045
+ return _text({
1046
+ type: "metric",
1047
+ name: "classification_report",
1048
+ confusion_matrix: { tp, fp, tn, fn },
1049
+ accuracy,
1050
+ precision,
1051
+ recall,
1052
+ f1,
1053
+ });
1054
+ };
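+
+ // classification sketch (X and y below are placeholder arrays, shown only for shape):
+ // train_test_split(X, y, 0.25, 7);                      // indices + sizes as text
+ // const model_text = train_logistic_regression(X, y, { learning_rate: 0.1, iterations: 500 });
+ // predict_logistic(model_text, X);                      // probabilities + classes as text
+ // metrics_classification([1, 0, 1, 1], [1, 0, 0, 1]);   // accuracy 0.75, with confusion matrix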
1055
+
1056
+ const metrics_regression = (y_true, y_pred) => {
1057
+ const n = Math.min(y_true.length, y_pred.length);
1058
+ const e = Array.from({ length: n }, (_, i) => y_true[i] - y_pred[i]);
1059
+ const mse = _mean(e.map((v) => v * v));
1060
+ const mae = _mean(e.map((v) => Math.abs(v)));
1061
+ const r2 =
1062
+ 1 -
1063
+ _sum(e.map((v) => v * v)) /
1064
+ _sum(y_true.map((v) => (v - _mean(y_true)) ** 2));
1065
+ return _text({ type: "metric", name: "regression_report", mse, mae, r2 });
1066
+ };
1067
+
1068
+ // =========================
1069
+ // eda convenience
1070
+ // =========================
1071
+
1072
+ const eda_overview = (rows) => {
1073
+ const desc = _lowerJson(df_describe(rows));
1074
+ const miss = _lowerJson(df_missing_report(rows));
1075
+ const corr = _lowerJson(df_corr(rows, "pearson"));
1076
+ return _text({
1077
+ type: "eda",
1078
+ summary: desc.columns ?? desc,
1079
+ missing: miss.rows ?? miss,
1080
+ correlation: corr.matrix ?? corr,
1081
+ });
1082
+ };
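+
+ // usage sketch (illustrative rows): one call combining describe, missing report and correlations.
+ // eda_overview([{ a: 1, b: 2 }, { a: 2, b: 5 }, { a: 3, b: 9 }]);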
1083
+
1084
+ // =========================
1085
+ // ADDITIONAL STATISTICAL TESTS
1086
+ // =========================
1087
+
1088
+ const t_test_paired = (a, b) => {
1089
+ const xa = _numeric(a),
1090
+ xb = _numeric(b);
1091
+ const n = Math.min(xa.length, xb.length);
1092
+ if (n < 2) return _err("t_test_paired", "insufficient data");
1093
+ const diffs = Array.from({ length: n }, (_, i) => xa[i] - xb[i]);
1094
+ const md = _mean(diffs),
1095
+ sd = _std(diffs, true);
1096
+ const t = md / (sd / Math.sqrt(n));
1097
+ const df = n - 1;
1098
+ const p = 2 * (1 - _tCDF(Math.abs(t), df));
1099
+ return _text({
1100
+ type: "hypothesis_test",
1101
+ name: "paired_t_test",
1102
+ statistic: t,
1103
+ df,
1104
+ p_value: p,
1105
+ mean_difference: md,
1106
+ });
1107
+ };
1108
+
1109
+ const t_test_one_sample = (arr, mu0) => {
1110
+ const x = _numeric(arr);
1111
+ const n = x.length;
1112
+ if (n < 2) return _err("t_test_one_sample", "insufficient data");
1113
+ const m = _mean(x),
1114
+ s = _std(x, true);
1115
+ const t = (m - mu0) / (s / Math.sqrt(n));
1116
+ const df = n - 1;
1117
+ const p = 2 * (1 - _tCDF(Math.abs(t), df));
1118
+ return _text({
1119
+ type: "hypothesis_test",
1120
+ name: "one_sample_t_test",
1121
+ statistic: t,
1122
+ df,
1123
+ p_value: p,
1124
+ mean: m,
1125
+ hypothesized_mean: mu0,
1126
+ });
1127
+ };
1128
+
1129
+ const shapiro_wilk = (arr) => {
1130
+ const x = _numeric(arr).sort((a, b) => a - b);
1131
+ const n = x.length;
1132
+ if (n < 3 || n > 5000)
1133
+ return _err("shapiro_wilk", "sample size must be between 3 and 5000");
1134
+ const m = _mean(x);
1135
+ const ss = x.reduce((s, v) => s + (v - m) ** 2, 0);
1136
+ let b = 0;
1137
+ const k = Math.floor(n / 2);
1138
+ for (let i = 0; i < k; i++) {
1139
+ const ai =
1140
+ i === 0
1141
+ ? -2.706056 / Math.sqrt(n)
1142
+ : i === k - 1 && n % 2 === 0
1143
+ ? 2.706056 / Math.sqrt(n)
1144
+ : 0;
1145
+ b += ai * (x[n - 1 - i] - x[i]);
1146
+ }
1147
+ const w = (b * b) / ss;
1148
+ return _text({
1149
+ type: "hypothesis_test",
1150
+ name: "shapiro_wilk",
1151
+ statistic: w,
1152
+ n,
1153
+ note: "approximation; w > 0.9 suggests normality",
1154
+ });
1155
+ };
1156
+
1157
+ const jarque_bera = (arr) => {
1158
+ const x = _numeric(arr);
1159
+ const n = x.length;
1160
+ if (n < 4) return _err("jarque_bera", "insufficient data");
1161
+ const s = _skewness(x);
1162
+ const k = _kurtosis(x);
1163
+ const jb = (n / 6) * (s * s + (k * k) / 4);
1164
+ return _text({
1165
+ type: "hypothesis_test",
1166
+ name: "jarque_bera",
1167
+ statistic: jb,
1168
+ n,
1169
+ df: 2,
1170
+ note: "tests normality; low p-value rejects normality",
1171
+ });
1172
+ };
1173
+
1174
+ const levene_test = (groups) => {
1175
+ const k = groups.length;
1176
+ const medians = groups.map(_median);
1177
+ const zs = groups.map((g, i) =>
1178
+ _numeric(g).map((v) => Math.abs(v - medians[i]))
1179
+ );
1180
+ const allz = _flatten(zs);
1181
+   // levene / brown-forsythe: compare group means of |v - group median| to their grand mean
+   const overall_mean = _mean(allz);
+   const ns = zs.map((z) => z.length);
+   const N = ns.reduce((a, b) => a + b, 0);
+   const ssb = zs.reduce(
+     (s, z, i) => s + ns[i] * (_mean(z) - overall_mean) ** 2,
1186
+ 0
1187
+ );
1188
+ const ssw = zs.reduce(
1189
+ (s, z) => s + z.reduce((a, v) => a + (v - _mean(z)) ** 2, 0),
1190
+ 0
1191
+ );
1192
+ const dfb = k - 1;
1193
+ const dfw = N - k;
1194
+ const msb = ssb / dfb;
1195
+ const msw = ssw / dfw;
1196
+ const W = msb / msw;
1197
+ return _text({
1198
+ type: "hypothesis_test",
1199
+ name: "levene_test",
1200
+ statistic: W,
1201
+ df_between: dfb,
1202
+ df_within: dfw,
1203
+ note: "tests homogeneity of variance",
1204
+ });
1205
+ };
1206
+
1207
+ const kruskal_wallis = (groups) => {
1208
+ const all = _flatten(groups);
1209
+ const n = all.length;
1210
+ const ranks = _rank(all);
1211
+ let pos = 0;
1212
+ const rankSums = groups.map((g) => {
1213
+ const len = _numeric(g).length;
1214
+ const rsum = ranks.slice(pos, pos + len).reduce((a, b) => a + b, 0);
1215
+ pos += len;
1216
+ return { n: len, rsum };
1217
+ });
1218
+ const H =
1219
+ (12 / (n * (n + 1))) *
1220
+ rankSums.reduce((s, { n: ni, rsum }) => s + (rsum * rsum) / ni, 0) -
1221
+ 3 * (n + 1);
1222
+ const df = groups.length - 1;
1223
+ return _text({
1224
+ type: "hypothesis_test",
1225
+ name: "kruskal_wallis",
1226
+ statistic: H,
1227
+ df,
1228
+ note: "non-parametric alternative to anova",
1229
+ });
1230
+ };
1231
+
1232
+ const mann_whitney = (a, b) => {
1233
+ const xa = _numeric(a);
1234
+ const xb = _numeric(b);
1235
+ const na = xa.length;
1236
+ const nb = xb.length;
1237
+ if (na < 1 || nb < 1) return _err("mann_whitney", "insufficient data");
1238
+ const combined = xa.concat(xb);
1239
+ const ranks = _rank(combined);
1240
+ const ra = ranks.slice(0, na).reduce((s, r) => s + r, 0);
1241
+ const U1 = ra - (na * (na + 1)) / 2;
1242
+ const U2 = na * nb - U1;
1243
+ const U = Math.min(U1, U2);
1244
+ const mu = (na * nb) / 2;
1245
+ const sigma = Math.sqrt((na * nb * (na + nb + 1)) / 12);
1246
+ const z = (U - mu) / sigma;
1247
+ const p = 2 * (1 - _Phi(Math.abs(z)));
1248
+ return _text({
1249
+ type: "hypothesis_test",
1250
+ name: "mann_whitney_u",
1251
+ statistic: U,
1252
+ z_score: z,
1253
+ p_value: p,
1254
+ note: "non-parametric alternative to t-test",
1255
+ });
1256
+ };
1257
+
1258
+ const wilcoxon_signed_rank = (a, b) => {
1259
+ const xa = _numeric(a);
1260
+ const xb = _numeric(b);
1261
+ const n = Math.min(xa.length, xb.length);
1262
+ if (n < 2) return _err("wilcoxon_signed_rank", "insufficient data");
1263
+ const diffs = Array.from({ length: n }, (_, i) => xa[i] - xb[i]).filter(
1264
+ (d) => d !== 0
1265
+ );
1266
+ const absDiffs = diffs.map(Math.abs);
1267
+ const ranks = _rank(absDiffs);
1268
+ const Wplus = ranks.reduce((s, r, i) => s + (diffs[i] > 0 ? r : 0), 0);
1269
+ const m = diffs.length;
1270
+ const mu = (m * (m + 1)) / 4;
1271
+ const sigma = Math.sqrt((m * (m + 1) * (2 * m + 1)) / 24);
1272
+ const z = (Wplus - mu) / sigma;
1273
+ const p = 2 * (1 - _Phi(Math.abs(z)));
1274
+ return _text({
1275
+ type: "hypothesis_test",
1276
+ name: "wilcoxon_signed_rank",
1277
+ statistic: Wplus,
1278
+ z_score: z,
1279
+ p_value: p,
1280
+ n: m,
1281
+ });
1282
+ };
1283
+
1284
+ const chi_square_goodness = (observed, expected, alpha = 0.05) => {
1285
+ const obs = _numeric(observed);
1286
+ const exp = _numeric(expected);
1287
+ const n = Math.min(obs.length, exp.length);
1288
+ let chi = 0;
1289
+ for (let i = 0; i < n; i++) chi += (obs[i] - exp[i]) ** 2 / exp[i];
1290
+ const df = n - 1;
1291
+ const p = 1 - chi_square_cdf(chi, df);
1292
+
1293
+ return _text({
1294
+ type: "hypothesis_test",
1295
+ name: "chi_square_goodness_of_fit",
1296
+ statistic: chi,
1297
+ df,
1298
+ p_value: p,
1299
+ confidence: 1 - alpha,
1300
+ extra: {
1301
+ observed: obs,
1302
+ expected: exp,
1303
+ dof: df,
1304
+ },
1305
+ });
1306
+ };
1307
+
1308
+ // =========================
1309
+ // CONFIDENCE INTERVALS
1310
+ // =========================
1311
+
1312
+ const confidence_interval_mean = (arr, confidence = 0.95) => {
1313
+ const x = _numeric(arr);
1314
+ const n = x.length;
1315
+ if (n < 2) return _err("confidence_interval_mean", "insufficient data");
1316
+ const m = _mean(x);
1317
+ const s = _std(x, true);
1318
+ const alpha = 1 - confidence;
1319
+   // rough t critical value: the normal quantile, inflated by ~15% for small samples
+   const t_crit = _normInv(1 - alpha / 2) * (n > 30 ? 1 : 1.15);
1320
+ const margin = (t_crit * s) / Math.sqrt(n);
1321
+ const lower = m - margin;
1322
+ const upper = m + margin;
1323
+ return _text({
1324
+ type: "confidence_interval",
1325
+ parameter: "mean",
1326
+ confidence,
1327
+ n,
1328
+ mean: m,
1329
+ lower,
1330
+ upper,
1331
+ margin,
1332
+ });
1333
+ };
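+
+ // usage sketch (illustrative sample):
+ // confidence_interval_mean([4.8, 5.1, 5.0, 4.9, 5.2], 0.95); // lower/upper bounds around the mean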
1334
+
1335
+ const confidence_interval_proportion = (successes, n, confidence = 0.95) => {
1336
+ if (n < 1)
1337
+ return _err("confidence_interval_proportion", "invalid sample size");
1338
+ const p = successes / n;
1339
+ const alpha = 1 - confidence;
1340
+ const z = _normInv(1 - alpha / 2);
1341
+ const se = Math.sqrt((p * (1 - p)) / n);
1342
+ const margin = z * se;
1343
+ const lower = Math.max(0, p - margin);
1344
+ const upper = Math.min(1, p + margin);
1345
+ return _text({
1346
+ type: "confidence_interval",
1347
+ parameter: "proportion",
1348
+ confidence,
1349
+ n,
1350
+ proportion: p,
1351
+ lower,
1352
+ upper,
1353
+ margin,
1354
+ });
1355
+ };
1356
+
1357
+ const confidence_interval_variance = (arr, confidence = 0.95) => {
1358
+ const x = _numeric(arr);
1359
+ const n = x.length;
1360
+ if (n < 2) return _err("confidence_interval_variance", "insufficient data");
1361
+ const s2 = _variance(x, true);
1362
+ const df = n - 1;
1363
+ const alpha = 1 - confidence;
1364
+ const chi_lower = df / (1 + _normInv(1 - alpha / 2) * Math.sqrt(2 / df));
1365
+ const chi_upper = df / (1 - _normInv(1 - alpha / 2) * Math.sqrt(2 / df));
1366
+ const lower = (df * s2) / chi_upper;
1367
+ const upper = (df * s2) / chi_lower;
1368
+ return _text({
1369
+ type: "confidence_interval",
1370
+ parameter: "variance",
1371
+ confidence,
1372
+ n,
1373
+ variance: s2,
1374
+ lower,
1375
+ upper,
1376
+ });
1377
+ };
1378
+
1379
+ const confidence_interval_difference = (a, b, confidence = 0.95) => {
1380
+ const xa = _numeric(a);
1381
+ const xb = _numeric(b);
1382
+ const na = xa.length;
1383
+ const nb = xb.length;
1384
+ if (na < 2 || nb < 2)
1385
+ return _err("confidence_interval_difference", "insufficient data");
1386
+ const ma = _mean(xa);
1387
+ const mb = _mean(xb);
1388
+ const va = _variance(xa, true);
1389
+ const vb = _variance(xb, true);
1390
+ const diff = ma - mb;
1391
+ const se = Math.sqrt(va / na + vb / nb);
1392
+ const alpha = 1 - confidence;
1393
+ const z = _normInv(1 - alpha / 2);
1394
+ const margin = z * se;
1395
+ const lower = diff - margin;
1396
+ const upper = diff + margin;
1397
+ return _text({
1398
+ type: "confidence_interval",
1399
+ parameter: "difference_of_means",
1400
+ confidence,
1401
+ difference: diff,
1402
+ lower,
1403
+ upper,
1404
+ margin,
1405
+ means: { group_a: ma, group_b: mb },
1406
+ });
1407
+ };
1408
+
1409
+ // =========================
1410
+ // ADDITIONAL CORRELATIONS
1411
+ // =========================
1412
+
1413
+ const corr_kendall = (x, y) => {
1414
+ const ax = _numeric(x);
1415
+ const by = _numeric(y);
1416
+ const n = Math.min(ax.length, by.length);
1417
+ if (n < 2) return _err("corr_kendall", "insufficient data");
1418
+ let concordant = 0;
1419
+ let discordant = 0;
1420
+ for (let i = 0; i < n - 1; i++) {
1421
+ for (let j = i + 1; j < n; j++) {
1422
+ const dx = ax[j] - ax[i];
1423
+ const dy = by[j] - by[i];
1424
+ if (dx * dy > 0) concordant++;
1425
+ else if (dx * dy < 0) discordant++;
1426
+ }
1427
+ }
1428
+ const tau = (concordant - discordant) / (0.5 * n * (n - 1));
1429
+ return _ok("statistic", {
1430
+ name: "kendall_tau",
1431
+ value: tau,
1432
+ concordant,
1433
+ discordant,
1434
+ n,
1435
+ });
1436
+ };
1437
+
1438
+ const corr_partial = (x, y, z) => {
1439
+ const rxy = _corrPearson(x, y);
1440
+ const rxz = _corrPearson(x, z);
1441
+ const ryz = _corrPearson(y, z);
1442
+ const rxy_z =
1443
+ (rxy - rxz * ryz) / Math.sqrt((1 - rxz * rxz) * (1 - ryz * ryz));
1444
+ return _ok("statistic", {
1445
+ name: "partial_correlation",
1446
+ value: rxy_z,
1447
+ controlling_for: "third_variable",
1448
+ });
1449
+ };
1450
+
1451
+ const corr_matrix_all = (rows, method = "pearson") => {
1452
+ const pearson = _lowerJson(df_corr(rows, "pearson")).matrix;
1453
+ const spearman = _lowerJson(df_corr(rows, "spearman")).matrix;
1454
+ const cols = Object.keys(pearson);
1455
+ const kendall = {};
1456
+ cols.forEach((a) => {
1457
+ kendall[a] = {};
1458
+ cols.forEach((b) => {
1459
+ const xa = rows.map((r) => _toNum(r[a]));
1460
+ const xb = rows.map((r) => _toNum(r[b]));
1461
+ const tau = _lowerJson(corr_kendall(xa, xb)).value ?? NaN;
1462
+ kendall[a][b] = tau;
1463
+ });
1464
+ });
1465
+ return _text({ type: "correlation_analysis", pearson, spearman, kendall });
1466
+ };
1467
+
1468
+ // =========================
1469
+ // K-NEAREST NEIGHBORS
1470
+ // =========================
1471
+
1472
+ const _euclidean = (a, b) =>
1473
+ Math.sqrt(a.reduce((s, v, i) => s + (v - b[i]) ** 2, 0));
1474
+
1475
+ const train_knn_classifier = (X, y, k = 5) => {
1476
+ if (X.length !== y.length)
1477
+ return _err("train_knn_classifier", "X and y length mismatch");
1478
+ const model = {
1479
+ type: "knn_classifier",
1480
+ k,
1481
+ X,
1482
+ y,
1483
+ n: y.length,
1484
+ p: X[0]?.length ?? 0,
1485
+ };
1486
+   return JSON.stringify(_lowerJson(model)); // json text so predict_knn_classifier can re-parse it
1487
+ };
1488
+
1489
+ const predict_knn_classifier = (model_text, X_test) => {
1490
+ try {
1491
+ const m = _lowerJson(model_text);
1492
+ if (m.type !== "knn_classifier")
1493
+ return _err("predict_knn_classifier", "invalid model type");
1494
+ const { k, x: X_train, y: y_train } = m;
1495
+ const predictions = X_test.map((x) => {
1496
+ const distances = X_train.map((xt, i) => ({
1497
+ dist: _euclidean(x, xt),
1498
+ label: y_train[i],
1499
+ }));
1500
+ distances.sort((a, b) => a.dist - b.dist);
1501
+ const neighbors = distances.slice(0, k);
1502
+ const votes = {};
1503
+ neighbors.forEach(({ label }) => {
1504
+ votes[label] = (votes[label] || 0) + 1;
1505
+ });
1506
+ const pred = Object.entries(votes).sort((a, b) => b[1] - a[1])[0][0];
1507
+ return Number(pred);
1508
+ });
1509
+ return _text({
1510
+ type: "prediction",
1511
+ name: "knn_classifier",
1512
+ k,
1513
+ predictions,
1514
+ });
1515
+ } catch {
1516
+ return _err("predict_knn_classifier", "invalid model text");
1517
+ }
1518
+ };
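+
+ // usage sketch (illustrative data): knn keeps the training set inside the model text.
+ // const knn_text = train_knn_classifier([[0], [1], [9], [10]], [0, 0, 1, 1], 3);
+ // predict_knn_classifier(knn_text, [[0.5], [9.5]]); // predictions ≈ [0, 1]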
1519
+
1520
+ const train_knn_regressor = (X, y, k = 5) => {
1521
+ if (X.length !== y.length)
1522
+ return _err("train_knn_regressor", "X and y length mismatch");
1523
+ const model = {
1524
+ type: "knn_regressor",
1525
+ k,
1526
+ X,
1527
+ y,
1528
+ n: y.length,
1529
+ p: X[0]?.length ?? 0,
1530
+ };
1531
+   return JSON.stringify(_lowerJson(model)); // json text so predict_knn_regressor can re-parse it
1532
+ };
1533
+
1534
+ const predict_knn_regressor = (model_text, X_test) => {
1535
+ try {
1536
+ const m = _lowerJson(model_text);
1537
+ if (m.type !== "knn_regressor")
1538
+ return _err("predict_knn_regressor", "invalid model type");
1539
+ const { k, x: X_train, y: y_train } = m;
1540
+ const predictions = X_test.map((x) => {
1541
+ const distances = X_train.map((xt, i) => ({
1542
+ dist: _euclidean(x, xt),
1543
+ value: y_train[i],
1544
+ }));
1545
+ distances.sort((a, b) => a.dist - b.dist);
1546
+ const neighbors = distances.slice(0, k);
1547
+ return _mean(neighbors.map((n) => n.value));
1548
+ });
1549
+ return _text({ type: "prediction", name: "knn_regressor", k, predictions });
1550
+ } catch {
1551
+ return _err("predict_knn_regressor", "invalid model text");
1552
+ }
1553
+ };
1554
+
1555
+ // =========================
1556
+ // DECISION TREE (CART)
1557
+ // =========================
1558
+
1559
+ const _gini = (y) => {
1560
+ const counts = {};
1561
+ y.forEach((v) => {
1562
+ counts[v] = (counts[v] || 0) + 1;
1563
+ });
1564
+ const total = y.length;
1565
+ return 1 - Object.values(counts).reduce((s, c) => s + (c / total) ** 2, 0);
1566
+ };
1567
+
1568
+ const _mse_split = (y) => {
1569
+ const m = _mean(y);
1570
+ return _mean(y.map((v) => (v - m) ** 2));
1571
+ };
1572
+
1573
+ const _best_split = (X, y, task = "classification") => {
1574
+ let best = { feature: -1, threshold: 0, score: Infinity };
1575
+ const n_features = X[0].length;
1576
+ for (let f = 0; f < n_features; f++) {
1577
+ const values = _uniq(X.map((row) => row[f])).sort((a, b) => a - b);
1578
+ for (let i = 0; i < values.length - 1; i++) {
1579
+ const thresh = (values[i] + values[i + 1]) / 2;
1580
+ const left_idx = [];
1581
+ const right_idx = [];
1582
+ X.forEach((row, idx) => {
1583
+ if (row[f] <= thresh) left_idx.push(idx);
1584
+ else right_idx.push(idx);
1585
+ });
1586
+ if (left_idx.length === 0 || right_idx.length === 0) continue;
1587
+ const left_y = left_idx.map((i) => y[i]);
1588
+ const right_y = right_idx.map((i) => y[i]);
1589
+ let score;
1590
+ if (task === "classification") {
1591
+ score =
1592
+ (left_y.length / y.length) * _gini(left_y) +
1593
+ (right_y.length / y.length) * _gini(right_y);
1594
+ } else {
1595
+ score =
1596
+ (left_y.length / y.length) * _mse_split(left_y) +
1597
+ (right_y.length / y.length) * _mse_split(right_y);
1598
+ }
1599
+ if (score < best.score) best = { feature: f, threshold: thresh, score };
1600
+ }
1601
+ }
1602
+ return best;
1603
+ };
1604
+
1605
+ const _build_tree = (X, y, depth, max_depth, min_samples, task) => {
1606
+ if (depth >= max_depth || y.length < min_samples) {
1607
+ const pred =
1608
+ task === "classification"
1609
+ ? Object.entries(
1610
+ y.reduce((a, v) => {
1611
+ a[v] = (a[v] || 0) + 1;
1612
+ return a;
1613
+ }, {})
1614
+ ).sort((a, b) => b[1] - a[1])[0][0]
1615
+ : _mean(y);
1616
+ return { leaf: true, prediction: Number(pred), n: y.length };
1617
+ }
1618
+ const split = _best_split(X, y, task);
1619
+ if (split.feature === -1) {
1620
+ const pred =
1621
+ task === "classification"
1622
+ ? Object.entries(
1623
+ y.reduce((a, v) => {
1624
+ a[v] = (a[v] || 0) + 1;
1625
+ return a;
1626
+ }, {})
1627
+ ).sort((a, b) => b[1] - a[1])[0][0]
1628
+ : _mean(y);
1629
+ return { leaf: true, prediction: Number(pred), n: y.length };
1630
+ }
1631
+ const left_idx = [];
1632
+ const right_idx = [];
1633
+ X.forEach((row, i) => {
1634
+ if (row[split.feature] <= split.threshold) left_idx.push(i);
1635
+ else right_idx.push(i);
1636
+ });
1637
+ const left_X = left_idx.map((i) => X[i]);
1638
+ const left_y = left_idx.map((i) => y[i]);
1639
+ const right_X = right_idx.map((i) => X[i]);
1640
+ const right_y = right_idx.map((i) => y[i]);
1641
+ return {
1642
+ leaf: false,
1643
+ feature: split.feature,
1644
+ threshold: split.threshold,
1645
+ left: _build_tree(left_X, left_y, depth + 1, max_depth, min_samples, task),
1646
+ right: _build_tree(
1647
+ right_X,
1648
+ right_y,
1649
+ depth + 1,
1650
+ max_depth,
1651
+ min_samples,
1652
+ task
1653
+ ),
1654
+ };
1655
+ };
1656
+
1657
+ const train_decision_tree_classifier = (X, y, opts = {}) => {
1658
+ const max_depth = opts.max_depth ?? 5;
1659
+ const min_samples = opts.min_samples_split ?? 2;
1660
+ const tree = _build_tree(X, y, 0, max_depth, min_samples, "classification");
1661
+ const model = {
1662
+ type: "decision_tree_classifier",
1663
+ tree,
1664
+ max_depth,
1665
+ min_samples,
1666
+ n: y.length,
1667
+ p: X[0]?.length ?? 0,
1668
+ };
1669
+   return JSON.stringify(_lowerJson(model)); // json text for predict_decision_tree and the forest trainers
1670
+ };
1671
+
1672
+ const train_decision_tree_regressor = (X, y, opts = {}) => {
1673
+ const max_depth = opts.max_depth ?? 5;
1674
+ const min_samples = opts.min_samples_split ?? 2;
1675
+ const tree = _build_tree(X, y, 0, max_depth, min_samples, "regression");
1676
+ const model = {
1677
+ type: "decision_tree_regressor",
1678
+ tree,
1679
+ max_depth,
1680
+ min_samples,
1681
+ n: y.length,
1682
+ p: X[0]?.length ?? 0,
1683
+ };
1684
+   return JSON.stringify(_lowerJson(model)); // json text for predict_decision_tree and the forest trainers
1685
+ };
1686
+
1687
+ const _predict_tree = (tree, x) => {
1688
+ if (tree.leaf) return tree.prediction;
1689
+ return x[tree.feature] <= tree.threshold
1690
+ ? _predict_tree(tree.left, x)
1691
+ : _predict_tree(tree.right, x);
1692
+ };
1693
+
1694
+ const predict_decision_tree = (model_text, X_test) => {
1695
+ try {
1696
+ const m = _lowerJson(model_text);
1697
+ const predictions = X_test.map((x) => _predict_tree(m.tree, x));
1698
+ return _text({ type: "prediction", name: m.type, predictions });
1699
+ } catch {
1700
+ return _err("predict_decision_tree", "invalid model text");
1701
+ }
1702
+ };
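+
+ // usage sketch (illustrative data): predict_decision_tree serves both tree flavours.
+ // const tree_text = train_decision_tree_classifier([[1], [2], [8], [9]], [0, 0, 1, 1], { max_depth: 3 });
+ // predict_decision_tree(tree_text, [[1.5], [8.5]]); // predictions ≈ [0, 1]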
1703
+
1704
+ // =========================
1705
+ // RANDOM FOREST
1706
+ // =========================
1707
+
1708
+ const _bootstrap_sample = (X, y, seed) => {
1709
+ const n = X.length;
1710
+ let s = seed;
1711
+ const rand = () => (s = (s * 9301 + 49297) % 233280) / 233280;
1712
+ const indices = Array.from({ length: n }, () => Math.floor(rand() * n));
1713
+ const X_boot = indices.map((i) => X[i]);
1714
+ const y_boot = indices.map((i) => y[i]);
1715
+ return { X_boot, y_boot };
1716
+ };
1717
+
1718
+ const train_random_forest_classifier = (X, y, opts = {}) => {
1719
+ const n_trees = opts.n_estimators ?? 10;
1720
+ const max_depth = opts.max_depth ?? 5;
1721
+ const min_samples = opts.min_samples_split ?? 2;
1722
+ const seed = opts.seed ?? 42;
1723
+ const trees = [];
1724
+ for (let i = 0; i < n_trees; i++) {
1725
+ const { X_boot, y_boot } = _bootstrap_sample(X, y, seed + i);
1726
+ const tree_model = _lowerJson(
1727
+ train_decision_tree_classifier(X_boot, y_boot, { max_depth, min_samples })
1728
+ );
1729
+ trees.push(tree_model.tree);
1730
+ }
1731
+ const model = {
1732
+ type: "random_forest_classifier",
1733
+ trees,
1734
+ n_trees,
1735
+ max_depth,
1736
+ min_samples,
1737
+ n: y.length,
1738
+ p: X[0]?.length ?? 0,
1739
+ };
1740
+   return JSON.stringify(_lowerJson(model)); // json text so predict_random_forest_classifier can re-parse it
1741
+ };
1742
+
1743
+ const train_random_forest_regressor = (X, y, opts = {}) => {
1744
+ const n_trees = opts.n_estimators ?? 10;
1745
+ const max_depth = opts.max_depth ?? 5;
1746
+ const min_samples = opts.min_samples_split ?? 2;
1747
+ const seed = opts.seed ?? 42;
1748
+ const trees = [];
1749
+ for (let i = 0; i < n_trees; i++) {
1750
+ const { X_boot, y_boot } = _bootstrap_sample(X, y, seed + i);
1751
+ const tree_model = _lowerJson(
1752
+ train_decision_tree_regressor(X_boot, y_boot, { max_depth, min_samples })
1753
+ );
1754
+ trees.push(tree_model.tree);
1755
+ }
1756
+ const model = {
1757
+ type: "random_forest_regressor",
1758
+ trees,
1759
+ n_trees,
1760
+ max_depth,
1761
+ min_samples,
1762
+ n: y.length,
1763
+ p: X[0]?.length ?? 0,
1764
+ };
1765
+   return JSON.stringify(_lowerJson(model)); // json text so predict_random_forest_regressor can re-parse it
1766
+ };
1767
+
1768
+ const predict_random_forest_classifier = (model_text, X_test) => {
1769
+ try {
1770
+ const m = _lowerJson(model_text);
1771
+ const predictions = X_test.map((x) => {
1772
+ const votes = m.trees.map((tree) => _predict_tree(tree, x));
1773
+ const counts = {};
1774
+ votes.forEach((v) => {
1775
+ counts[v] = (counts[v] || 0) + 1;
1776
+ });
1777
+ return Number(Object.entries(counts).sort((a, b) => b[1] - a[1])[0][0]);
1778
+ });
1779
+ return _text({
1780
+ type: "prediction",
1781
+ name: "random_forest_classifier",
1782
+ n_trees: m.n_trees,
1783
+ predictions,
1784
+ });
1785
+ } catch {
1786
+ return _err("predict_random_forest_classifier", "invalid model text");
1787
+ }
1788
+ };
1789
+
1790
+ const predict_random_forest_regressor = (model_text, X_test) => {
1791
+ try {
1792
+ const m = _lowerJson(model_text);
1793
+ const predictions = X_test.map((x) => {
1794
+ const preds = m.trees.map((tree) => _predict_tree(tree, x));
1795
+ return _mean(preds);
1796
+ });
1797
+ return _text({
1798
+ type: "prediction",
1799
+ name: "random_forest_regressor",
1800
+ n_trees: m.n_trees,
1801
+ predictions,
1802
+ });
1803
+ } catch {
1804
+ return _err("predict_random_forest_regressor", "invalid model text");
1805
+ }
1806
+ };
1807
+
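+ // usage sketch (editorial illustration, not part of the package): X, y and
+ // X_test are hypothetical numeric arrays. seeding makes the bootstrap resamples,
+ // and therefore the forest, reproducible.
+ //
+ //   const rf_text = train_random_forest_classifier(X, y, {
+ //     n_estimators: 20,
+ //     max_depth: 4,
+ //     seed: 7,
+ //   });
+ //   const rf_preds_text = predict_random_forest_classifier(rf_text, X_test);
+ //   // regression works the same way but averages the trees instead of voting:
+ //   const rfr_text = train_random_forest_regressor(X, y_numeric, { n_estimators: 20 });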
1808
+ // =========================
1809
+ // NAIVE BAYES
1810
+ // =========================
1811
+
1812
+ const train_naive_bayes = (X, y) => {
1813
+ const classes = _uniq(y);
1814
+ const n = y.length;
1815
+ const p = X[0]?.length ?? 0;
1816
+ const priors = {};
1817
+ const stats = {};
1818
+ classes.forEach((c) => {
1819
+ const indices = y.map((v, i) => (v === c ? i : -1)).filter((i) => i >= 0);
1820
+ priors[c] = indices.length / n;
1821
+ stats[c] = Array.from({ length: p }, (_, j) => {
1822
+ const col = indices.map((i) => X[i][j]);
1823
+ return { mean: _mean(col), std: _std(col, true) };
1824
+ });
1825
+ });
1826
+ const model = { type: "naive_bayes", classes, priors, stats, n, p };
1827
+ return _text(model);
1828
+ };
1829
+
1830
+ const predict_naive_bayes = (model_text, X_test) => {
1831
+ try {
1832
+ const m = _lowerJson(model_text);
1833
+ const predictions = X_test.map((x) => {
1834
+ const scores = {};
1835
+ m.classes.forEach((c) => {
1836
+ let log_prob = Math.log(m.priors[c]);
1837
+ m.stats[c].forEach((s, j) => {
1838
+ const val = (x[j] - s.mean) / s.std;
1839
+ log_prob +=
1840
+ -0.5 * val * val - Math.log(s.std) - 0.5 * Math.log(2 * Math.PI);
1841
+ });
1842
+ scores[c] = log_prob;
1843
+ });
1844
+ return Number(Object.entries(scores).sort((a, b) => b[1] - a[1])[0][0]);
1845
+ });
1846
+ return _text({ type: "prediction", name: "naive_bayes", predictions });
1847
+ } catch {
1848
+ return _err("predict_naive_bayes", "invalid model text");
1849
+ }
1850
+ };
1851
+
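+ // usage sketch (editorial illustration): this is a gaussian naive bayes — each
+ // class stores a per-feature mean/std, and prediction picks the class with the
+ // highest sum of log prior plus log normal densities. caveat: a feature with
+ // zero within-class std makes the scaled value and log(s.std) non-finite, so
+ // constant columns should be dropped or jittered before training.
+ //
+ //   const nb_text = train_naive_bayes(X, y);            // X, y: hypothetical arrays
+ //   const nb_preds_text = predict_naive_bayes(nb_text, X_test);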
1852
+ // =========================
1853
+ // FEATURE SCALING
1854
+ // =========================
1855
+
1856
+ const standard_scaler_fit = (X) => {
1857
+ const p = X[0]?.length ?? 0;
1858
+ const params = Array.from({ length: p }, (_, j) => {
1859
+ const col = X.map((row) => row[j]);
1860
+ return { mean: _mean(col), std: _std(col, true) };
1861
+ });
1862
+ return _text({ type: "standard_scaler", params, n: X.length, p });
1863
+ };
1864
+
1865
+ const standard_scaler_transform = (scaler_text, X) => {
1866
+ try {
1867
+ const m = _lowerJson(scaler_text);
1868
+ const X_scaled = X.map((row) =>
1869
+ row.map((v, j) => (v - m.params[j].mean) / m.params[j].std)
1870
+ );
1871
+ return _text({
1872
+ type: "scaled_data",
1873
+ method: "standard",
1874
+ preview: X_scaled.slice(0, 5),
1875
+ });
1876
+ } catch {
1877
+ return _err("standard_scaler_transform", "invalid scaler text");
1878
+ }
1879
+ };
1880
+
1881
+ const minmax_scaler_fit = (X) => {
1882
+ const p = X[0]?.length ?? 0;
1883
+ const params = Array.from({ length: p }, (_, j) => {
1884
+ const col = X.map((row) => row[j]);
1885
+ return { min: _min(col), max: _max(col) };
1886
+ });
1887
+ return _text({ type: "minmax_scaler", params, n: X.length, p });
1888
+ };
1889
+
1890
+ const minmax_scaler_transform = (scaler_text, X) => {
1891
+ try {
1892
+ const m = _lowerJson(scaler_text);
1893
+ const X_scaled = X.map((row) =>
1894
+ row.map((v, j) => {
1895
+ const range = m.params[j].max - m.params[j].min;
1896
+ return range === 0 ? 0 : (v - m.params[j].min) / range;
1897
+ })
1898
+ );
1899
+ return _text({
1900
+ type: "scaled_data",
1901
+ method: "minmax",
1902
+ preview: X_scaled.slice(0, 5),
1903
+ });
1904
+ } catch {
1905
+ return _err("minmax_scaler_transform", "invalid scaler text");
1906
+ }
1907
+ };
1908
+
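+ // usage sketch (editorial illustration): both scalers follow a fit/transform
+ // split so that test data is scaled with parameters learned on training data
+ // only. note that the transforms return a text report with a 5-row preview
+ // rather than the full scaled matrix, and standard_scaler_transform divides by
+ // zero on constant columns (minmax_scaler_transform guards that case and
+ // returns 0).
+ //
+ //   const scaler_text = standard_scaler_fit(X_train);          // hypothetical X_train/X_test
+ //   const scaled_report = standard_scaler_transform(scaler_text, X_test);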
1909
+ // =========================
1910
+ // DIMENSIONALITY REDUCTION (PCA)
1911
+ // =========================
1912
+
1913
+ const train_pca = (X, n_components = 2) => {
1914
+ const n = X.length;
1915
+ const p = X[0]?.length ?? 0;
1916
+ if (n_components > p)
1917
+ return _err("train_pca", "n_components cannot exceed number of features");
1918
+
1919
+ // center data
1920
+ const means = Array.from({ length: p }, (_, j) =>
1921
+ _mean(X.map((row) => row[j]))
1922
+ );
1923
+ const X_centered = X.map((row) => row.map((v, j) => v - means[j]));
1924
+
1925
+ // covariance matrix
1926
+ const cov = Array.from({ length: p }, (_, i) =>
1927
+ Array.from({ length: p }, (_, j) => {
1928
+ let sum = 0;
1929
+ for (let k = 0; k < n; k++) sum += X_centered[k][i] * X_centered[k][j];
1930
+ return sum / (n - 1);
1931
+ })
1932
+ );
1933
+
1934
+ // simple power iteration for first n_components eigenvectors
1935
+ const components = [];
1936
+ for (let c = 0; c < n_components; c++) {
1937
+ let v = Array.from({ length: p }, () => Math.random());
1938
+ for (let iter = 0; iter < 100; iter++) {
1939
+ const v_new = Array.from({ length: p }, (_, i) =>
1940
+ cov[i].reduce((s, val, j) => s + val * v[j], 0)
1941
+ );
1942
+ const norm = Math.sqrt(v_new.reduce((s, val) => s + val * val, 0));
1943
+ v = v_new.map((val) => val / norm);
1944
+ }
1945
+ components.push(v);
1946
+
1947
+ // deflate covariance matrix: cov <- cov - lambda * v v^t, where lambda = v^t cov v
1948
+ const lambda = v.reduce((acc, vi, i) => acc + vi * cov[i].reduce((sv, val, j) => sv + val * v[j], 0), 0);
1949
+ for (let i = 0; i < p; i++) {
1950
+ for (let j = 0; j < p; j++) {
1951
+ cov[i][j] -= lambda * v[i] * v[j];
1952
+ }
1953
+ }
1954
+ }
1955
+
1956
+ const model = { type: "pca", n_components, means, components, n, p };
1957
+ return _text(model);
1958
+ };
1959
+
1960
+ const transform_pca = (model_text, X) => {
1961
+ try {
1962
+ const m = _lowerJson(model_text);
1963
+ const X_centered = X.map((row) => row.map((v, j) => v - m.means[j]));
1964
+ const X_transformed = X_centered.map((row) =>
1965
+ m.components.map((comp) => row.reduce((s, v, i) => s + v * comp[i], 0))
1966
+ );
1967
+ return _text({
1968
+ type: "pca_transform",
1969
+ n_components: m.n_components,
1970
+ preview: X_transformed.slice(0, 5),
1971
+ });
1972
+ } catch {
1973
+ return _err("transform_pca", "invalid model text");
1974
+ }
1975
+ };
1976
+
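+ // usage sketch (editorial illustration): train_pca centers the data, estimates
+ // the covariance matrix and extracts components by power iteration, so the
+ // result is approximate and, because the start vector uses Math.random(), the
+ // sign (and, for near-equal eigenvalues, the order) of components can vary
+ // between runs.
+ //
+ //   const pca_text = train_pca(X, 2);              // X: hypothetical numeric matrix
+ //   const projected_report = transform_pca(pca_text, X);   // text report with a 5-row preview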
1977
+ // =========================
1978
+ // CLUSTERING (K-MEANS)
1979
+ // =========================
1980
+
1981
+ const train_kmeans = (X, k = 3, opts = {}) => {
1982
+ const max_iter = opts.max_iterations ?? 100;
1983
+ const seed = opts.seed ?? 42;
1984
+ let s = seed;
1985
+ const rand = () => (s = (s * 9301 + 49297) % 233280) / 233280;
1986
+
1987
+ const n = X.length;
1988
+ const p = X[0]?.length ?? 0;
1989
+
1990
+ // initialize centroids randomly
1991
+ const indices = Array.from({ length: k }, () => Math.floor(rand() * n));
1992
+ let centroids = indices.map((i) => [...X[i]]);
1993
+ let labels = Array(n).fill(0);
1994
+
1995
+ for (let iter = 0; iter < max_iter; iter++) {
1996
+ // assign points to nearest centroid
1997
+ const new_labels = X.map((x) => {
1998
+ const distances = centroids.map((c) => _euclidean(x, c));
1999
+ return distances.indexOf(Math.min(...distances));
2000
+ });
2001
+
2002
+ // check convergence
2003
+ if (labels.every((l, i) => l === new_labels[i])) break;
2004
+ labels = new_labels;
2005
+
2006
+ // update centroids
2007
+ centroids = Array.from({ length: k }, (_, c) => {
2008
+ const cluster_points = X.filter((_, i) => labels[i] === c);
2009
+ if (cluster_points.length === 0) return centroids[c];
2010
+ return Array.from({ length: p }, (_, j) =>
2011
+ _mean(cluster_points.map((pt) => pt[j]))
2012
+ );
2013
+ });
2014
+ }
2015
+
2016
+ // calculate inertia
2017
+ const inertia = X.reduce(
2018
+ (s, x, i) => s + _euclidean(x, centroids[labels[i]]) ** 2,
2019
+ 0
2020
+ );
2021
+
2022
+ const model = { type: "kmeans", k, centroids, inertia, n, p };
2023
+ return _text(model);
2024
+ };
2025
+
2026
+ const predict_kmeans = (model_text, X_test) => {
2027
+ try {
2028
+ const m = _lowerJson(model_text);
2029
+ const predictions = X_test.map((x) => {
2030
+ const distances = m.centroids.map((c) => _euclidean(x, c));
2031
+ return distances.indexOf(Math.min(...distances));
2032
+ });
2033
+ return _text({
2034
+ type: "prediction",
2035
+ name: "kmeans",
2036
+ k: m.k,
2037
+ cluster_labels: predictions,
2038
+ });
2039
+ } catch {
2040
+ return _err("predict_kmeans", "invalid model text");
2041
+ }
2042
+ };
2043
+
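+ // usage sketch (editorial illustration): k-means here uses seeded random
+ // initialization of the centroids and stops when the labels no longer change or
+ // max_iterations is reached; inertia is the sum of squared distances of each
+ // point to its assigned centroid, useful for comparing runs or choosing k.
+ //
+ //   const km_text = train_kmeans(X, 3, { seed: 7 });        // X: hypothetical matrix
+ //   const clusters_text = predict_kmeans(km_text, X_new);   // cluster_labels per row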
2044
+ // =========================
2045
+ // ENSEMBLE VOTING
2046
+ // =========================
2047
+
2048
+ const ensemble_voting_classifier = (models_text, X_test, voting = "hard") => {
2049
+ try {
2050
+ const models = models_text.map((mt) => _lowerJson(mt));
2051
+
2052
+ if (voting === "hard") {
2053
+ const all_preds = models.map((m) => {
2054
+ if (m.type === "logistic_regression") {
2055
+ const pred_result = _lowerJson(predict_logistic(_text(m), X_test));
2056
+ return pred_result.classes;
2057
+ } else if (m.type === "knn_classifier") {
2058
+ const pred_result = _lowerJson(
2059
+ predict_knn_classifier(_text(m), X_test)
2060
+ );
2061
+ return pred_result.predictions;
2062
+ } else if (m.type === "decision_tree_classifier") {
2063
+ const pred_result = _lowerJson(
2064
+ predict_decision_tree(_text(m), X_test)
2065
+ );
2066
+ return pred_result.predictions;
2067
+ } else if (m.type === "random_forest_classifier") {
2068
+ const pred_result = _lowerJson(
2069
+ predict_random_forest_classifier(_text(m), X_test)
2070
+ );
2071
+ return pred_result.predictions;
2072
+ } else if (m.type === "naive_bayes") {
2073
+ const pred_result = _lowerJson(predict_naive_bayes(_text(m), X_test));
2074
+ return pred_result.predictions;
2075
+ }
2076
+ return [];
2077
+ });
2078
+
2079
+ const ensemble_preds = X_test.map((_, i) => {
2080
+ const votes = {};
2081
+ all_preds.forEach((preds) => {
2082
+ const v = preds[i];
2083
+ votes[v] = (votes[v] || 0) + 1;
2084
+ });
2085
+ return Number(Object.entries(votes).sort((a, b) => b[1] - a[1])[0][0]);
2086
+ });
2087
+
2088
+ return _text({
2089
+ type: "ensemble_prediction",
2090
+ method: "voting_hard",
2091
+ n_models: models.length,
2092
+ predictions: ensemble_preds,
2093
+ });
2094
+ } else {
2095
+ return _err(
2096
+ "ensemble_voting_classifier",
2097
+ "soft voting not yet implemented"
2098
+ );
2099
+ }
2100
+ } catch {
2101
+ return _err("ensemble_voting_classifier", "invalid models");
2102
+ }
2103
+ };
2104
+
2105
+ const ensemble_voting_regressor = (models_text, X_test) => {
2106
+ try {
2107
+ const models = models_text.map((mt) => _lowerJson(mt));
2108
+
2109
+ const all_preds = models.map((m) => {
2110
+ if (m.type === "linear_regression") {
2111
+ const pred_result = _lowerJson(predict_linear(_text(m), X_test));
2112
+ return pred_result.predictions;
2113
+ } else if (m.type === "knn_regressor") {
2114
+ const pred_result = _lowerJson(predict_knn_regressor(_text(m), X_test));
2115
+ return pred_result.predictions;
2116
+ } else if (m.type === "decision_tree_regressor") {
2117
+ const pred_result = _lowerJson(predict_decision_tree(_text(m), X_test));
2118
+ return pred_result.predictions;
2119
+ } else if (m.type === "random_forest_regressor") {
2120
+ const pred_result = _lowerJson(
2121
+ predict_random_forest_regressor(_text(m), X_test)
2122
+ );
2123
+ return pred_result.predictions;
2124
+ }
2125
+ return [];
2126
+ });
2127
+
2128
+ const ensemble_preds = X_test.map((_, i) => {
2129
+ const values = all_preds.map((preds) => preds[i]);
2130
+ return _mean(values);
2131
+ });
2132
+
2133
+ return _text({
2134
+ type: "ensemble_prediction",
2135
+ method: "voting_average",
2136
+ n_models: models.length,
2137
+ predictions: ensemble_preds,
2138
+ });
2139
+ } catch {
2140
+ return _err("ensemble_voting_regressor", "invalid models");
2141
+ }
2142
+ };
2143
+
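+ // usage sketch (editorial illustration): the voting helpers take an array of
+ // already-trained model texts and dispatch on each model's "type" field, so
+ // model families can be mixed as long as every type is one of the branches
+ // above; a model whose type is not matched returns an empty prediction list,
+ // so only the supported types should be passed.
+ //
+ //   const models = [
+ //     train_logistic_regression(X, y),
+ //     train_knn_classifier(X, y, 5),
+ //     train_random_forest_classifier(X, y, { n_estimators: 10 }),
+ //   ];
+ //   const vote_text = ensemble_voting_classifier(models, X_test, "hard");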
2144
+ // =========================
2145
+ // CROSS-VALIDATION
2146
+ // =========================
2147
+
2148
+ const cross_validate = (X, y, model_type, opts = {}) => {
2149
+ const k_folds = opts.k_folds ?? 5;
2150
+ const n = X.length;
2151
+ const fold_size = Math.floor(n / k_folds);
2152
+
2153
+ const scores = [];
2154
+
2155
+ for (let fold = 0; fold < k_folds; fold++) {
2156
+ const test_start = fold * fold_size;
2157
+ const test_end = fold === k_folds - 1 ? n : (fold + 1) * fold_size;
2158
+
2159
+ const X_train = [...X.slice(0, test_start), ...X.slice(test_end)];
2160
+ const y_train = [...y.slice(0, test_start), ...y.slice(test_end)];
2161
+ const X_test = X.slice(test_start, test_end);
2162
+ const y_test = y.slice(test_start, test_end);
2163
+
2164
+ let model_text;
2165
+ let predictions;
2166
+
2167
+ if (model_type === "linear_regression") {
2168
+ model_text = train_linear_regression(X_train, y_train);
2169
+ const pred_result = _lowerJson(predict_linear(model_text, X_test));
2170
+ predictions = pred_result.predictions;
2171
+ const metrics = _lowerJson(metrics_regression(y_test, predictions));
2172
+ scores.push(metrics.r2);
2173
+ } else if (model_type === "logistic_regression") {
2174
+ model_text = train_logistic_regression(X_train, y_train, opts);
2175
+ const pred_result = _lowerJson(predict_logistic(model_text, X_test));
2176
+ predictions = pred_result.classes;
2177
+ const metrics = _lowerJson(metrics_classification(y_test, predictions));
2178
+ scores.push(metrics.accuracy);
2179
+ } else if (model_type === "knn_classifier") {
2180
+ model_text = train_knn_classifier(X_train, y_train, opts.k ?? 5);
2181
+ const pred_result = _lowerJson(
2182
+ predict_knn_classifier(model_text, X_test)
2183
+ );
2184
+ predictions = pred_result.predictions;
2185
+ const metrics = _lowerJson(metrics_classification(y_test, predictions));
2186
+ scores.push(metrics.accuracy);
2187
+ } else if (model_type === "decision_tree_classifier") {
2188
+ model_text = train_decision_tree_classifier(X_train, y_train, opts);
2189
+ const pred_result = _lowerJson(predict_decision_tree(model_text, X_test));
2190
+ predictions = pred_result.predictions;
2191
+ const metrics = _lowerJson(metrics_classification(y_test, predictions));
2192
+ scores.push(metrics.accuracy);
2193
+ } else if (model_type === "random_forest_classifier") {
2194
+ model_text = train_random_forest_classifier(X_train, y_train, opts);
2195
+ const pred_result = _lowerJson(
2196
+ predict_random_forest_classifier(model_text, X_test)
2197
+ );
2198
+ predictions = pred_result.predictions;
2199
+ const metrics = _lowerJson(metrics_classification(y_test, predictions));
2200
+ scores.push(metrics.accuracy);
2201
+ }
2202
+ }
2203
+
2204
+ return _text({
2205
+ type: "cross_validation",
2206
+ model_type,
2207
+ k_folds,
2208
+ scores,
2209
+ mean_score: _mean(scores),
2210
+ std_score: _std(scores, true),
2211
+ });
2212
+ };
2213
+
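+ // usage sketch (editorial illustration): cross_validate does a plain k-fold
+ // split in the given row order (no shuffling), scores regression folds with r2
+ // and classification folds with accuracy, and reports the mean and standard
+ // deviation of the fold scores. shuffle rows beforehand if the data is ordered,
+ // and note that an unsupported model_type leaves the score list empty.
+ //
+ //   const cv_text = cross_validate(X, y, "random_forest_classifier", {
+ //     k_folds: 5,
+ //     n_estimators: 10,
+ //   });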
2214
+ // =========================
2215
+ // FEATURE IMPORTANCE (for tree-based models)
2216
+ // =========================
2217
+
2218
+ const _tree_feature_importance = (tree, n_features) => {
2219
+ const importance = Array(n_features).fill(0);
2220
+
2221
+ const traverse = (node, n_samples) => {
2222
+ if (node.leaf) return;
2223
+ importance[node.feature] += n_samples;
2224
+ traverse(node.left, n_samples);
2225
+ traverse(node.right, n_samples);
2226
+ };
2227
+
2228
+ traverse(tree, 1);
2229
+ const total = importance.reduce((a, b) => a + b, 0);
2230
+ return importance.map((v) => (total > 0 ? v / total : 0));
2231
+ };
2232
+
2233
+ const feature_importance_tree = (model_text) => {
2234
+ try {
2235
+ const m = _lowerJson(model_text);
2236
+ if (
2237
+ m.type === "decision_tree_classifier" ||
2238
+ m.type === "decision_tree_regressor"
2239
+ ) {
2240
+ const importance = _tree_feature_importance(m.tree, m.p);
2241
+ return _text({ type: "feature_importance", model: m.type, importance });
2242
+ } else if (
2243
+ m.type === "random_forest_classifier" ||
2244
+ m.type === "random_forest_regressor"
2245
+ ) {
2246
+ const all_importance = m.trees.map((tree) =>
2247
+ _tree_feature_importance(tree, m.p)
2248
+ );
2249
+ const avg_importance = Array.from({ length: m.p }, (_, i) =>
2250
+ _mean(all_importance.map((imp) => imp[i]))
2251
+ );
2252
+ return _text({
2253
+ type: "feature_importance",
2254
+ model: m.type,
2255
+ n_trees: m.n_trees,
2256
+ importance: avg_importance,
2257
+ });
2258
+ } else {
2259
+ return _err("feature_importance_tree", "model must be tree-based");
2260
+ }
2261
+ } catch {
2262
+ return _err("feature_importance_tree", "invalid model text");
2263
+ }
2264
+ };
2265
+
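+ // usage sketch (editorial illustration): the importance reported here is the
+ // normalized count of splits made on each feature (the n_samples weight passed
+ // into the traversal is constant), averaged over trees for forests — a rough
+ // proxy rather than an impurity-decrease measure.
+ //
+ //   const rf_text = train_random_forest_classifier(X, y, { n_estimators: 20 });
+ //   const importance_text = feature_importance_tree(rf_text);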
2266
+ // =========================
2267
+ // OUTLIER DETECTION
2268
+ // =========================
2269
+
2270
+ const outliers_iqr = (arr) => {
2271
+ const x = _numeric(arr);
2272
+ const q1 = _quantile(x, 0.25);
2273
+ const q3 = _quantile(x, 0.75);
2274
+ const iqr = q3 - q1;
2275
+ const lower = q1 - 1.5 * iqr;
2276
+ const upper = q3 + 1.5 * iqr;
2277
+ const outliers = x.filter((v) => v < lower || v > upper);
2278
+ const indices = arr
2279
+ .map((v, i) => (_isNumber(v) && (v < lower || v > upper) ? i : -1))
2280
+ .filter((i) => i >= 0);
2281
+ return _text({
2282
+ type: "outlier_detection",
2283
+ method: "iqr",
2284
+ lower_bound: lower,
2285
+ upper_bound: upper,
2286
+ n_outliers: outliers.length,
2287
+ outlier_indices: indices,
2288
+ outlier_values: outliers,
2289
+ });
2290
+ };
2291
+
2292
+ const outliers_zscore = (arr, threshold = 3) => {
2293
+ const x = _numeric(arr);
2294
+ const m = _mean(x);
2295
+ const s = _std(x, true);
2296
+ const zscores = x.map((v) => Math.abs((v - m) / s));
2297
+ const outliers = x.filter((_, i) => zscores[i] > threshold);
2298
+ const indices = arr
2299
+ .map((v, i) => (_isNumber(v) && Math.abs((v - m) / s) > threshold ? i : -1))
2300
+ .filter((i) => i >= 0);
2301
+ return _text({
2302
+ type: "outlier_detection",
2303
+ method: "zscore",
2304
+ threshold,
2305
+ n_outliers: outliers.length,
2306
+ outlier_indices: indices,
2307
+ outlier_values: outliers,
2308
+ });
2309
+ };
2310
+
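+ // usage sketch (editorial illustration): both detectors work on a single
+ // numeric column. iqr flags values outside [q1 - 1.5*iqr, q3 + 1.5*iqr];
+ // zscore flags values more than `threshold` standard deviations from the mean
+ // and is itself sensitive to the outliers it is trying to find, so iqr is the
+ // safer default on heavy-tailed data.
+ //
+ //   const iqr_report = outliers_iqr(values);         // values: hypothetical numeric array
+ //   const z_report = outliers_zscore(values, 2.5);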
2311
+ // =========================
2312
+ // TIME SERIES BASICS
2313
+ // =========================
2314
+
2315
+ const moving_average = (arr, window = 3) => {
2316
+ const x = _numeric(arr);
2317
+ const ma = [];
2318
+ for (let i = 0; i < x.length; i++) {
2319
+ const start = Math.max(0, i - window + 1);
2320
+ const slice = x.slice(start, i + 1);
2321
+ ma.push(_mean(slice));
2322
+ }
2323
+ return _text({
2324
+ type: "time_series",
2325
+ method: "moving_average",
2326
+ window,
2327
+ values: ma,
2328
+ });
2329
+ };
2330
+
2331
+ const exponential_smoothing = (arr, alpha = 0.3) => {
2332
+ const x = _numeric(arr);
2333
+ if (x.length === 0) return _err("exponential_smoothing", "empty data");
2334
+ const smoothed = [x[0]];
2335
+ for (let i = 1; i < x.length; i++) {
2336
+ smoothed.push(alpha * x[i] + (1 - alpha) * smoothed[i - 1]);
2337
+ }
2338
+ return _text({
2339
+ type: "time_series",
2340
+ method: "exponential_smoothing",
2341
+ alpha,
2342
+ values: smoothed,
2343
+ });
2344
+ };
2345
+
2346
+ const autocorrelation = (arr, lag = 1) => {
2347
+ const x = _numeric(arr);
2348
+ const n = x.length;
2349
+ if (n < lag + 1) return _err("autocorrelation", "insufficient data for lag");
2350
+ const m = _mean(x);
2351
+ let num = 0;
2352
+ let den = 0;
2353
+ for (let i = 0; i < n; i++) {
2354
+ den += (x[i] - m) ** 2;
2355
+ if (i >= lag) num += (x[i] - m) * (x[i - lag] - m);
2356
+ }
2357
+ const acf = num / den;
2358
+ return _ok("statistic", { name: "autocorrelation", lag, value: acf });
2359
+ };
2360
+
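+ // usage sketch (editorial illustration): moving_average uses a trailing window
+ // (shorter at the start of the series), exponential_smoothing weights recent
+ // points by alpha, and autocorrelation returns the lag-k correlation of the
+ // series with itself — values near 1 at small lags suggest strong persistence.
+ //
+ //   const ma_text = moving_average(series, 7);             // series: hypothetical numeric array
+ //   const smoothed_text = exponential_smoothing(series, 0.3);
+ //   const acf1_text = autocorrelation(series, 1);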
2361
+ // =========================
2362
+ // ADDITIONAL EXPORTS
2363
+ // =========================
2364
+
2365
+ export {
2366
+ // dataframe
2367
+ dataframe_from_json,
2368
+ df_describe,
2369
+ df_missing_report,
2370
+ df_corr,
2371
+ eda_overview,
2372
+ // stats
2373
+ mean,
2374
+ stddeviation,
2375
+ variance,
2376
+ median,
2377
+ quantile,
2378
+ minv,
2379
+ maxv,
2380
+ skewness,
2381
+ kurtosis,
2382
+ corr_pearson,
2383
+ corr_spearman,
2384
+ // distributions
2385
+ normal_pdf,
2386
+ normal_cdf,
2387
+ normal_ppf,
2388
+ binomial_pmf,
2389
+ binomial_cdf,
2390
+ poisson_pmf,
2391
+ poisson_cdf,
2392
+ // hypothesis tests
2393
+ t_test_independent,
2394
+ z_test_one_sample,
2395
+ chi_square_independence,
2396
+ anova_oneway,
2397
+ // ml
2398
+ train_test_split,
2399
+ train_linear_regression,
2400
+ train_logistic_regression,
2401
+ predict_linear,
2402
+ predict_logistic,
2403
+ metrics_classification,
2404
+ metrics_regression,
2405
+ // additional statistical tests
2406
+ t_test_paired,
2407
+ t_test_one_sample,
2408
+ shapiro_wilk,
2409
+ jarque_bera,
2410
+ levene_test,
2411
+ kruskal_wallis,
2412
+ mann_whitney,
2413
+ wilcoxon_signed_rank,
2414
+ chi_square_goodness,
2415
+ // confidence intervals
2416
+ confidence_interval_mean,
2417
+ confidence_interval_proportion,
2418
+ confidence_interval_variance,
2419
+ confidence_interval_difference,
2420
+ // additional correlations
2421
+ corr_kendall,
2422
+ corr_partial,
2423
+ corr_matrix_all,
2424
+ // knn
2425
+ train_knn_classifier,
2426
+ predict_knn_classifier,
2427
+ train_knn_regressor,
2428
+ predict_knn_regressor,
2429
+ // decision trees
2430
+ train_decision_tree_classifier,
2431
+ train_decision_tree_regressor,
2432
+ predict_decision_tree,
2433
+ // random forest
2434
+ train_random_forest_classifier,
2435
+ train_random_forest_regressor,
2436
+ predict_random_forest_classifier,
2437
+ predict_random_forest_regressor,
2438
+ // naive bayes
2439
+ train_naive_bayes,
2440
+ predict_naive_bayes,
2441
+ // feature scaling
2442
+ standard_scaler_fit,
2443
+ standard_scaler_transform,
2444
+ minmax_scaler_fit,
2445
+ minmax_scaler_transform,
2446
+ // dimensionality reduction
2447
+ train_pca,
2448
+ transform_pca,
2449
+ // clustering
2450
+ train_kmeans,
2451
+ predict_kmeans,
2452
+ // ensemble
2453
+ ensemble_voting_classifier,
2454
+ ensemble_voting_regressor,
2455
+ // cross-validation
2456
+ cross_validate,
2457
+ // feature importance
2458
+ feature_importance_tree,
2459
+ // outlier detection
2460
+ outliers_iqr,
2461
+ outliers_zscore,
2462
+ // time series
2463
+ moving_average,
2464
+ exponential_smoothing,
2465
+ autocorrelation,
2466
+ };