@fuzdev/fuz_util 0.42.0 → 0.44.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +19 -12
- package/dist/async.d.ts +2 -2
- package/dist/async.d.ts.map +1 -1
- package/dist/async.js +2 -2
- package/dist/benchmark.d.ts +179 -0
- package/dist/benchmark.d.ts.map +1 -0
- package/dist/benchmark.js +400 -0
- package/dist/benchmark_baseline.d.ts +195 -0
- package/dist/benchmark_baseline.d.ts.map +1 -0
- package/dist/benchmark_baseline.js +388 -0
- package/dist/benchmark_format.d.ts +87 -0
- package/dist/benchmark_format.d.ts.map +1 -0
- package/dist/benchmark_format.js +266 -0
- package/dist/benchmark_stats.d.ts +112 -0
- package/dist/benchmark_stats.d.ts.map +1 -0
- package/dist/benchmark_stats.js +219 -0
- package/dist/benchmark_types.d.ts +174 -0
- package/dist/benchmark_types.d.ts.map +1 -0
- package/dist/benchmark_types.js +1 -0
- package/dist/git.d.ts +12 -0
- package/dist/git.d.ts.map +1 -1
- package/dist/git.js +14 -0
- package/dist/library_json.d.ts +3 -3
- package/dist/library_json.d.ts.map +1 -1
- package/dist/library_json.js +1 -1
- package/dist/maths.d.ts +4 -0
- package/dist/maths.d.ts.map +1 -1
- package/dist/maths.js +8 -0
- package/dist/object.js +1 -1
- package/dist/source_json.d.ts +4 -4
- package/dist/stats.d.ts +180 -0
- package/dist/stats.d.ts.map +1 -0
- package/dist/stats.js +402 -0
- package/dist/string.d.ts +13 -0
- package/dist/string.d.ts.map +1 -1
- package/dist/string.js +58 -0
- package/dist/time.d.ts +165 -0
- package/dist/time.d.ts.map +1 -0
- package/dist/time.js +264 -0
- package/dist/timings.d.ts +1 -7
- package/dist/timings.d.ts.map +1 -1
- package/dist/timings.js +16 -16
- package/package.json +21 -19
- package/src/lib/async.ts +3 -3
- package/src/lib/benchmark.ts +498 -0
- package/src/lib/benchmark_baseline.ts +538 -0
- package/src/lib/benchmark_format.ts +314 -0
- package/src/lib/benchmark_stats.ts +311 -0
- package/src/lib/benchmark_types.ts +197 -0
- package/src/lib/git.ts +24 -0
- package/src/lib/library_json.ts +3 -3
- package/src/lib/maths.ts +8 -0
- package/src/lib/object.ts +1 -1
- package/src/lib/stats.ts +534 -0
- package/src/lib/string.ts +66 -0
- package/src/lib/time.ts +319 -0
- package/src/lib/timings.ts +17 -17
- package/src/lib/types.ts +2 -2
package/src/lib/stats.ts
ADDED
|
@@ -0,0 +1,534 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Statistical analysis utilities.
|
|
3
|
+
* Pure functions with zero dependencies - can be used standalone for any data analysis.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
// Default tuning values used by the helpers below when an option is omitted.
const DEFAULT_IQR_MULTIPLIER = 1.5; // IQR fence multiplier
const DEFAULT_MAD_Z_SCORE_THRESHOLD = 3.5; // first-pass modified z-score cutoff
const DEFAULT_MAD_Z_SCORE_EXTREME = 5.0; // second-pass (relaxed) modified z-score cutoff
const DEFAULT_MAD_CONSTANT = 0.6745; // scales MAD for a normal distribution approximation
const DEFAULT_OUTLIER_RATIO_HIGH = 0.3; // outlier share that triggers the relaxed cutoff
const DEFAULT_OUTLIER_RATIO_EXTREME = 0.4; // outlier share that triggers keep-closest mode
const DEFAULT_OUTLIER_KEEP_RATIO = 0.8; // share of values kept in keep-closest mode
const DEFAULT_CONFIDENCE_Z = 1.96; // z-score for 95% confidence
const DEFAULT_MIN_SAMPLE_SIZE = 3; // below this many values, outlier detection is skipped
|
|
16
|
+
|
|
17
|
+
/**
 * Arithmetic mean of `values`.
 * @returns the average, or `NaN` when `values` is empty
 */
export const stats_mean = (values: Array<number>): number => {
	const count = values.length;
	if (count === 0) return NaN;

	let total = 0;
	for (const value of values) {
		total += value;
	}
	return total / count;
};
|
|
24
|
+
|
|
25
|
+
/**
 * Median of `values`, computed on a sorted copy (the input is not mutated).
 * For an even count, the two middle values are averaged.
 * @returns the median, or `NaN` when `values` is empty
 */
export const stats_median = (values: Array<number>): number => {
	const count = values.length;
	if (count === 0) return NaN;

	const ordered = [...values].sort((x, y) => x - y);
	const upper_middle = Math.floor(count / 2);

	if (count % 2 !== 0) {
		return ordered[upper_middle]!;
	}
	return (ordered[upper_middle - 1]! + ordered[upper_middle]!) / 2;
};
|
|
34
|
+
|
|
35
|
+
/**
 * Population standard deviation of `values` (divides by n, not n-1).
 * @param values - Array of numbers
 * @param mean - Precomputed mean; computed with `stats_mean` when omitted
 * @returns the standard deviation, or `NaN` when `values` is empty
 */
export const stats_std_dev = (values: Array<number>, mean?: number): number => {
	const count = values.length;
	if (count === 0) return NaN;

	const center = mean ?? stats_mean(values);
	let squared_total = 0;
	for (const value of values) {
		squared_total += (value - center) ** 2;
	}
	return Math.sqrt(squared_total / count);
};
|
|
46
|
+
|
|
47
|
+
/**
 * Population variance of `values` (divides by n).
 * @param values - Array of numbers
 * @param mean - Precomputed mean; computed with `stats_mean` when omitted
 * @returns the variance, or `NaN` when `values` is empty
 */
export const stats_variance = (values: Array<number>, mean?: number): number => {
	const count = values.length;
	if (count === 0) return NaN;

	const center = mean ?? stats_mean(values);
	let squared_total = 0;
	for (const value of values) {
		squared_total += (value - center) ** 2;
	}
	return squared_total / count;
};
|
|
55
|
+
|
|
56
|
+
/**
 * Percentile of `values` using linear interpolation between neighbouring
 * sorted samples (the "R-7" method: fractional index = (n - 1) * p).
 * The input is not mutated.
 * @param values - Array of numbers
 * @param p - Percentile as a fraction (0-1, e.g., 0.95 for the 95th percentile)
 * @returns the interpolated percentile, or `NaN` when `values` is empty
 */
export const stats_percentile = (values: Array<number>, p: number): number => {
	const n = values.length;
	if (n === 0) return NaN;
	if (n === 1) return values[0]!;

	const ordered = [...values].sort((x, y) => x - y);

	// Fractional position of the requested percentile within the sorted samples
	const position = (n - 1) * p;
	const below = Math.floor(position);
	const above = Math.ceil(position);

	const low_value = ordered[below]!;
	// The position landed exactly on a sample, so no interpolation is needed
	if (below === above) return low_value;

	const high_value = ordered[above]!;
	return low_value + (position - below) * (high_value - low_value);
};
|
|
83
|
+
|
|
84
|
+
/**
 * Coefficient of variation: `std_dev / mean`, expressed as a ratio.
 * @param mean - Mean of the data
 * @param std_dev - Standard deviation of the data
 * @returns the ratio, or `NaN` when `mean` is exactly 0
 */
export const stats_cv = (mean: number, std_dev: number): number =>
	mean === 0 ? NaN : std_dev / mean;
|
|
93
|
+
|
|
94
|
+
/**
 * Find the smallest and largest value in a single pass.
 * @returns `{min, max}`, both `NaN` when `values` is empty
 */
export const stats_min_max = (values: Array<number>): {min: number; max: number} => {
	if (values.length === 0) return {min: NaN, max: NaN};

	const result = {min: values[0]!, max: values[0]!};
	for (const value of values) {
		if (value < result.min) result.min = value;
		if (value > result.max) result.max = value;
	}
	return result;
};
|
|
108
|
+
|
|
109
|
+
/**
 * Shape returned by the outlier detection functions:
 * the input values split into kept values and rejected values.
 */
export interface StatsOutlierResult {
	/** Values that were kept (not classified as outliers) */
	cleaned: Array<number>;
	/** Values that were classified as outliers */
	outliers: Array<number>;
}
|
|
118
|
+
|
|
119
|
+
/**
 * Options accepted by `stats_outliers_iqr`.
 */
export interface StatsOutliersIqrOptions {
	/** How many IQRs beyond Q1/Q3 the outlier fences sit (default: 1.5) */
	iqr_multiplier?: number;
	/** Inputs shorter than this are returned untouched (default: 3) */
	min_sample_size?: number;
}
|
|
128
|
+
|
|
129
|
+
/**
 * Detect outliers with the IQR (interquartile range) method.
 * A value is an outlier when it lies outside
 * [Q1 - multiplier*IQR, Q3 + multiplier*IQR].
 *
 * When the input has fewer than `min_sample_size` values, or when the IQR is 0,
 * no detection is performed and `cleaned` is the input array itself (not a copy).
 *
 * @param values - Array of numbers
 * @param options - Configuration options
 * @returns the kept values and the outliers, each in input order
 */
export const stats_outliers_iqr = (
	values: Array<number>,
	options?: StatsOutliersIqrOptions,
): StatsOutlierResult => {
	const iqr_multiplier = options?.iqr_multiplier ?? DEFAULT_IQR_MULTIPLIER;
	const min_sample_size = options?.min_sample_size ?? DEFAULT_MIN_SAMPLE_SIZE;

	const untouched = (): StatsOutlierResult => ({cleaned: values, outliers: []});

	if (values.length < min_sample_size) return untouched();

	// Quartiles are taken as the sorted elements at floor(n * 0.25) and floor(n * 0.75)
	const ordered = [...values].sort((x, y) => x - y);
	const q1 = ordered[Math.floor(ordered.length * 0.25)]!;
	const q3 = ordered[Math.floor(ordered.length * 0.75)]!;
	const iqr = q3 - q1;

	if (iqr === 0) return untouched();

	const lower_bound = q1 - iqr_multiplier * iqr;
	const upper_bound = q3 + iqr_multiplier * iqr;
	const is_outlier = (value: number): boolean => value < lower_bound || value > upper_bound;

	return {
		cleaned: values.filter((value) => !is_outlier(value)),
		outliers: values.filter(is_outlier),
	};
};
|
|
169
|
+
|
|
170
|
+
/**
 * Options accepted by `stats_outliers_mad`.
 */
export interface StatsOutliersMadOptions {
	/** First-pass modified z-score cutoff; larger magnitudes are outliers (default: 3.5) */
	z_score_threshold?: number;
	/** Relaxed cutoff used when the first pass flags too many values (default: 5.0) */
	z_score_extreme?: number;
	/** Scale factor applied to the deviation before dividing by MAD (default: 0.6745) */
	mad_constant?: number;
	/** Outlier share above which the relaxed cutoff is used (default: 0.3) */
	outlier_ratio_high?: number;
	/** Outlier share above which keep-closest mode is used (default: 0.4) */
	outlier_ratio_extreme?: number;
	/** Share of values kept in keep-closest mode (default: 0.8) */
	outlier_keep_ratio?: number;
	/** Inputs shorter than this are returned untouched (default: 3) */
	min_sample_size?: number;
	/** Forwarded to the IQR fallback that runs when MAD is zero */
	iqr_options?: StatsOutliersIqrOptions;
}
|
|
191
|
+
|
|
192
|
+
/**
 * Detect outliers with the MAD (median absolute deviation) method.
 * Each value gets a modified z-score `mad_constant * (x - median) / MAD`;
 * values whose score magnitude exceeds the threshold are outliers.
 *
 * The detection escalates in up to three stages:
 * 1. split using `z_score_threshold`;
 * 2. if the outlier share exceeds `outlier_ratio_high`, split again using `z_score_extreme`;
 * 3. if the share still exceeds `outlier_ratio_extreme`, keep the
 *    `outlier_keep_ratio` share of values closest to the median.
 *
 * Inputs shorter than `min_sample_size` are returned untouched, and a MAD of
 * zero falls back to `stats_outliers_iqr`.
 *
 * @param values - Array of numbers
 * @param options - Configuration options
 */
export const stats_outliers_mad = (
	values: Array<number>,
	options?: StatsOutliersMadOptions,
): StatsOutlierResult => {
	const z_score_threshold = options?.z_score_threshold ?? DEFAULT_MAD_Z_SCORE_THRESHOLD;
	const z_score_extreme = options?.z_score_extreme ?? DEFAULT_MAD_Z_SCORE_EXTREME;
	const mad_constant = options?.mad_constant ?? DEFAULT_MAD_CONSTANT;
	const outlier_ratio_high = options?.outlier_ratio_high ?? DEFAULT_OUTLIER_RATIO_HIGH;
	const outlier_ratio_extreme = options?.outlier_ratio_extreme ?? DEFAULT_OUTLIER_RATIO_EXTREME;
	const outlier_keep_ratio = options?.outlier_keep_ratio ?? DEFAULT_OUTLIER_KEEP_RATIO;
	const min_sample_size = options?.min_sample_size ?? DEFAULT_MIN_SAMPLE_SIZE;

	if (values.length < min_sample_size) {
		return {cleaned: values, outliers: []};
	}

	const ascending = (x: number, y: number): number => x - y;
	const median = stats_median([...values].sort(ascending));

	// MAD: the median of the absolute deviations from the median
	const mad = stats_median(values.map((v) => Math.abs(v - median)).sort(ascending));

	// A zero MAD would divide by zero below, so use the IQR method instead
	if (mad === 0) {
		return stats_outliers_iqr(values, options?.iqr_options);
	}

	// Partition `values` by whether the modified z-score magnitude exceeds `limit`
	const split_by_z_score = (limit: number): StatsOutlierResult => {
		const kept: Array<number> = [];
		const rejected: Array<number> = [];
		for (const value of values) {
			const modified_z_score = (mad_constant * (value - median)) / mad;
			(Math.abs(modified_z_score) > limit ? rejected : kept).push(value);
		}
		return {cleaned: kept, outliers: rejected};
	};

	const first_pass = split_by_z_score(z_score_threshold);
	if (!(first_pass.outliers.length > values.length * outlier_ratio_high)) {
		return first_pass;
	}

	// Too many outliers: retry with the relaxed cutoff
	const second_pass = split_by_z_score(z_score_extreme);
	if (!(second_pass.outliers.length > values.length * outlier_ratio_extreme)) {
		return second_pass;
	}

	// Still too many: keep the values closest to the median
	const by_distance = values
		.map((v) => ({value: v, distance: Math.abs(v - median)}))
		.sort((x, y) => x.distance - y.distance);
	const keep_count = Math.floor(values.length * outlier_keep_ratio);

	return {
		cleaned: by_distance.slice(0, keep_count).map((d) => d.value),
		outliers: by_distance.slice(keep_count).map((d) => d.value),
	};
};
|
|
271
|
+
|
|
272
|
+
/**
 * Lookup table from common two-sided confidence levels (0-1) to their z-scores.
 */
export const STATS_CONFIDENCE_Z_SCORES: Record<number, number> = {
	0.8: 1.282, // 80%
	0.9: 1.645, // 90%
	0.95: 1.96, // 95%
	0.99: 2.576, // 99%
	0.999: 3.291, // 99.9%
};
|
|
282
|
+
|
|
283
|
+
/**
 * Convert a confidence level (0-1, exclusive) to a two-sided z-score.
 * Uses the `STATS_CONFIDENCE_Z_SCORES` lookup table for common values
 * and approximates all others.
 *
 * @param level - Confidence level, strictly between 0 and 1
 * @returns the z-score `z` such that P(-z < Z < z) is approximately `level`
 * @throws Error if `level` is not strictly between 0 and 1 (including `NaN`)
 *
 * @example
 * ```ts
 * stats_confidence_level_to_z_score(0.95); // 1.96
 * stats_confidence_level_to_z_score(0.99); // 2.576
 * ```
 */
export const stats_confidence_level_to_z_score = (level: number): number => {
	// Written as a negated in-range check so that NaN is rejected too
	// (`NaN <= 0 || NaN >= 1` is false and would let NaN through).
	if (!(level > 0 && level < 1)) {
		throw new Error('Confidence level must be between 0 and 1 (exclusive)');
	}

	// Check lookup table first
	if (level in STATS_CONFIDENCE_Z_SCORES) {
		return STATS_CONFIDENCE_Z_SCORES[level]!;
	}

	// For confidence level c, we want z such that P(-z < Z < z) = c
	// This means Φ(z) = (1 + c) / 2, so z = Φ⁻¹((1 + c) / 2)
	// Using Φ⁻¹(p) = √2 * erfinv(2p - 1)
	const p = (1 + level) / 2; // e.g., 0.95 -> 0.975
	const x = 2 * p - 1; // Argument for erfinv, e.g., 0.975 -> 0.95

	// Winitzki approximation for erfinv
	const a = 0.147;
	const ln_term = Math.log(1 - x * x);
	const term1 = 2 / (Math.PI * a) + ln_term / 2;
	const erfinv = Math.sign(x) * Math.sqrt(Math.sqrt(term1 * term1 - ln_term / a) - term1);

	return Math.SQRT2 * erfinv;
};
|
|
317
|
+
|
|
318
|
+
/**
 * Options accepted by the confidence interval functions.
 */
export interface StatsConfidenceIntervalOptions {
	/** Z-score to use directly (default: 1.96, i.e. a 95% interval) */
	z_score?: number;
	/** Confidence level (0-1), converted to a z-score. Ignored when `z_score` is provided. */
	confidence_level?: number;
}
|
|
327
|
+
|
|
328
|
+
/**
 * Confidence interval for the mean of `values`.
 * Computes the mean and standard deviation, then delegates to
 * `stats_confidence_interval_from_summary`.
 * @param values - Array of numbers
 * @param options - Configuration options
 * @returns [lower_bound, upper_bound], or [NaN, NaN] when `values` is empty
 */
export const stats_confidence_interval = (
	values: Array<number>,
	options?: StatsConfidenceIntervalOptions,
): [number, number] => {
	const sample_size = values.length;
	if (sample_size === 0) {
		return [NaN, NaN];
	}

	const mean = stats_mean(values);
	return stats_confidence_interval_from_summary(
		mean,
		stats_std_dev(values, mean),
		sample_size,
		options,
	);
};
|
|
345
|
+
|
|
346
|
+
/**
 * Confidence interval for a mean, computed from summary statistics.
 * Useful when the raw data is not available.
 *
 * The margin is `z * std_dev / sqrt(sample_size)`. The z-score is resolved in
 * this order: `options.z_score`, then `options.confidence_level` (only when
 * truthy, so 0 falls through to the default), then 1.96.
 *
 * @param mean - Mean of the data
 * @param std_dev - Standard deviation of the data
 * @param sample_size - Number of samples
 * @param options - Configuration options
 * @returns [lower_bound, upper_bound], or [NaN, NaN] when `sample_size` is 0
 */
export const stats_confidence_interval_from_summary = (
	mean: number,
	std_dev: number,
	sample_size: number,
	options?: StatsConfidenceIntervalOptions,
): [number, number] => {
	// Resolved before the sample-size guard, so an invalid confidence level
	// still throws even when there are no samples.
	let z_score: number;
	if (options?.z_score != null) {
		z_score = options.z_score;
	} else if (options?.confidence_level) {
		z_score = stats_confidence_level_to_z_score(options.confidence_level);
	} else {
		z_score = DEFAULT_CONFIDENCE_Z;
	}

	if (sample_size === 0) {
		return [NaN, NaN];
	}

	const standard_error = std_dev / Math.sqrt(sample_size);
	const margin = z_score * standard_error;

	return [mean - margin, mean + margin];
};
|
|
376
|
+
|
|
377
|
+
// Hypothesis Testing Utilities
|
|
378
|
+
// These functions support statistical significance testing (t-tests, p-values, etc.)
|
|
379
|
+
|
|
380
|
+
/**
 * Shape returned by `stats_welch_t_test`.
 */
export interface StatsWelchTTestResult {
	/** The computed t-statistic */
	t_statistic: number;
	/** Degrees of freedom from the Welch-Satterthwaite equation */
	degrees_of_freedom: number;
}
|
|
389
|
+
|
|
390
|
+
/**
 * Welch's t-test statistic and degrees of freedom, computed from the summary
 * statistics of two samples. Welch's test does not assume equal variances.
 *
 * @param mean1 - Mean of first sample
 * @param std1 - Standard deviation of first sample
 * @param n1 - Size of first sample
 * @param mean2 - Mean of second sample
 * @param std2 - Standard deviation of second sample
 * @param n2 - Size of second sample
 * @returns the t-statistic and the Welch-Satterthwaite degrees of freedom
 */
export const stats_welch_t_test = (
	mean1: number,
	std1: number,
	n1: number,
	mean2: number,
	std2: number,
	n2: number,
): StatsWelchTTestResult => {
	// Variance of each sample mean (variance / n)
	const mean_variance1 = std1 ** 2 / n1;
	const mean_variance2 = std2 ** 2 / n2;
	const combined = mean_variance1 + mean_variance2;

	return {
		t_statistic: (mean1 - mean2) / Math.sqrt(combined),
		// Welch-Satterthwaite equation
		degrees_of_freedom:
			combined ** 2 / (mean_variance1 ** 2 / (n1 - 1) + mean_variance2 ** 2 / (n2 - 1)),
	};
};
|
|
424
|
+
|
|
425
|
+
/**
 * Standard normal CDF, approximated with a five-term polynomial in
 * `t = 1 / (1 + 0.2316419 * |x|)` multiplied by the normal density
 * (Abramowitz and Stegun formula 26.2.17).
 * @param x - Value to evaluate the CDF at
 * @returns the approximate probability P(Z <= x)
 */
export const stats_normal_cdf = (x: number): number => {
	const t = 1 / (1 + 0.2316419 * Math.abs(x));
	// Standard normal density at x (0.3989423 approximates 1 / sqrt(2 * PI))
	const density = 0.3989423 * Math.exp((-x * x) / 2);
	// Approximate tail probability beyond |x|
	const tail =
		density * t * (0.3193815 + t * (-0.3565638 + t * (1.781478 + t * (-1.821256 + t * 1.330274))));

	if (x > 0) {
		return 1 - tail;
	}
	return tail;
};
|
|
435
|
+
|
|
436
|
+
/**
 * Natural log of the absolute value of the gamma function, ln|Γ(z)|,
 * using the Lanczos approximation (g = 7, 9 coefficients).
 * Arguments below 0.5 are handled with the reflection formula
 * Γ(z)Γ(1 - z) = π / sin(πz).
 * @param z - Argument of the gamma function
 */
export const stats_ln_gamma = (z: number): number => {
	const g = 7;
	const c = [
		0.99999999999980993, 676.5203681218851, -1259.1392167224028, 771.32342877765313,
		-176.61502916214059, 12.507343278686905, -0.13857109526572012, 9.9843695780195716e-6,
		1.5056327351493116e-7,
	];

	if (z < 0.5) {
		// sin(πz) is negative wherever Γ(z) is negative; taking the absolute value
		// yields ln|Γ(z)| instead of the log of a negative number (NaN).
		return Math.log(Math.PI / Math.abs(Math.sin(Math.PI * z))) - stats_ln_gamma(1 - z);
	}

	const z_adj = z - 1;
	let x = c[0]!;
	for (let i = 1; i < g + 2; i++) {
		x += c[i]! / (z_adj + i);
	}
	const t = z_adj + g + 0.5;
	return 0.5 * Math.log(2 * Math.PI) + (z_adj + 0.5) * Math.log(t) - t + Math.log(x);
};

/**
 * Regularized incomplete beta function I_x(a, b), evaluated with a continued
 * fraction (Lentz's algorithm).
 *
 * @param x - Upper integration limit; values <= 0 return 0 and values >= 1 return 1
 * @param a - First shape parameter
 * @param b - Second shape parameter
 * @returns I_x(a, b), a value between 0 and 1
 */
export const stats_incomplete_beta = (x: number, a: number, b: number): number => {
	if (x <= 0) return 0;
	if (x >= 1) return 1;

	// The continued fraction converges quickly only for x < (a + 1) / (a + b + 2);
	// otherwise use the symmetry I_x(a, b) = 1 - I_(1-x)(b, a).
	if (x > (a + 1) / (a + b + 2)) {
		return 1 - stats_incomplete_beta(1 - x, b, a);
	}

	// Factor in front of the continued fraction: x^a * (1 - x)^b / (a * B(a, b))
	const ln_beta = stats_ln_gamma(a) + stats_ln_gamma(b) - stats_ln_gamma(a + b);
	const front = Math.exp(Math.log(x) * a + Math.log(1 - x) * b - ln_beta) / a;

	const tiny = 1e-30; // guards against division by zero inside Lentz's algorithm
	const stop = 1e-8; // convergence tolerance on the per-term factor
	const max_terms = 200;

	let f = 1;
	let c = 1;
	let d = 0;

	for (let i = 0; i <= max_terms; i++) {
		const m = Math.floor(i / 2);

		// Numerators of the continued fraction: a leading 1, then alternating
		// odd and even terms. The odd term with m = 0 must not be skipped.
		let numerator: number;
		if (i === 0) {
			numerator = 1;
		} else if (i % 2 === 0) {
			numerator = (m * (b - m) * x) / ((a + 2 * m - 1) * (a + 2 * m));
		} else {
			numerator = (-(a + m) * (a + b + m) * x) / ((a + 2 * m) * (a + 2 * m + 1));
		}

		// One iteration of Lentz's algorithm
		d = 1 + numerator * d;
		if (Math.abs(d) < tiny) d = tiny;
		d = 1 / d;

		c = 1 + numerator / c;
		if (Math.abs(c) < tiny) c = tiny;

		const delta = c * d;
		f *= delta;

		if (Math.abs(1 - delta) < stop) break;
	}

	// With these starting values f accumulates 1 + (continued fraction), hence the -1.
	// If the loop ran out of terms, this is the best estimate reached.
	return front * (f - 1);
};
|
|
511
|
+
|
|
512
|
+
/**
 * Approximate two-tailed p-value from the t-distribution.
 * For large df (>100), uses the normal approximation.
 * For smaller df, uses the regularized incomplete beta function:
 * p = I_x(df / 2, 1 / 2) with x = df / (df + t^2).
 *
 * @param t - The t-statistic; only its magnitude is used, so the sign does not matter
 * @param df - Degrees of freedom
 * @returns Two-tailed p-value
 */
export const stats_t_distribution_p_value = (t: number, df: number): number => {
	// Take the magnitude so a negative t cannot produce p > 1 in the normal
	// branch; the beta branch only ever uses t squared.
	const abs_t = Math.abs(t);

	// Use normal approximation for large df
	if (df > 100) {
		return 2 * (1 - stats_normal_cdf(abs_t));
	}

	const x = df / (df + abs_t * abs_t);
	const a = df / 2;
	const b = 0.5;

	return stats_incomplete_beta(x, a, b);
};
|
package/src/lib/string.ts
CHANGED
|
@@ -97,3 +97,69 @@ export const strip_ansi = (str: string): string => str.replaceAll(/\x1B\[[0-9;]*
|
|
|
97
97
|
*/
|
|
98
98
|
export const stringify = (value: unknown): string =>
|
|
99
99
|
typeof value === 'bigint' ? value + 'n' : (JSON.stringify(value) ?? String(value)); // eslint-disable-line @typescript-eslint/no-unnecessary-condition
|
|
100
|
+
|
|
101
|
+
// Matches a grapheme made of one non-mark code point followed only by combining marks.
const COMBINING_SEQUENCE_MATCHER = /^\P{M}\p{M}+$/u;

/**
 * Calculate the display width of a string in terminal columns.
 * - Strips ANSI escape codes (they have 0 width)
 * - Emojis and other wide characters take 2 columns
 * - Tab characters take 4 columns
 * - Newlines and other control characters take 0 columns
 * - A narrow base character followed only by combining marks (e.g., decomposed "é") takes 1 column
 * - Uses `Intl.Segmenter` to properly handle grapheme clusters (e.g., family emoji "👨‍👩‍👧‍👦")
 *
 * The wide-character detection is a rough heuristic, not a full East Asian Width table.
 */
export const string_display_width = (str: string): number => {
	// Strip ANSI codes first (they have 0 display width)
	const clean = strip_ansi(str);

	let width = 0;
	const segmenter = new Intl.Segmenter();
	for (const {segment} of segmenter.segment(clean)) {
		const code = segment.codePointAt(0)!;

		// Handle control characters
		if (code === 0x09) {
			// Tab = 4 columns
			width += 4;
			continue;
		}
		if (code < 0x20 || (code >= 0x7f && code < 0xa0)) {
			// Other control characters (including newline) = 0 width
			continue;
		}

		// Wide first code point (rough heuristic):
		// - Most emoji are in range 0x1F300-0x1FAFF
		// - Some are in 0x2600-0x27BF (misc symbols)
		// - CJK characters 0x4E00-0x9FFF are double-width
		const wide_base =
			(code >= 0x1f300 && code <= 0x1faff) ||
			(code >= 0x2600 && code <= 0x27bf) ||
			(code >= 0x4e00 && code <= 0x9fff);

		// A narrow BMP base followed only by combining marks still occupies one column.
		// Sequences containing U+FE0F (emoji presentation selector) are excluded
		// so that keycap and text-to-emoji sequences stay wide.
		const narrow_combining_sequence =
			!wide_base &&
			code <= 0xffff &&
			!segment.includes('\ufe0f') &&
			COMBINING_SEQUENCE_MATCHER.test(segment);

		// Other graphemes spanning several UTF-16 code units (ZWJ sequences,
		// astral characters, etc.) are treated as wide.
		if (wide_base || (segment.length > 1 && !narrow_combining_sequence)) {
			width += 2;
		} else {
			width += 1;
		}
	}
	return width;
};
|
|
149
|
+
|
|
150
|
+
/**
 * Pad `str` with spaces up to `target_width` terminal columns, measuring the
 * current width with `string_display_width` so wide characters are accounted for.
 * Strings already at or beyond the target width are returned unchanged.
 * @param str - String to pad
 * @param target_width - Desired display width in columns
 * @param align - `'left'` appends the padding, `'right'` prepends it (default: `'left'`)
 */
export const pad_width = (
	str: string,
	target_width: number,
	align: 'left' | 'right' = 'left',
): string => {
	const missing = Math.max(0, target_width - string_display_width(str));
	const spaces = ' '.repeat(missing);
	return align === 'left' ? str + spaces : spaces + str;
};
|