@winm2m/inferential-stats-js 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1126 @@
1
+ (function () {
2
+ 'use strict';
3
+
4
+ /**
5
+ * Python code for descriptive statistics functions.
6
+ * These are executed inside Pyodide in the Web Worker.
7
+ */
8
/**
 * Python source for a frequency table (counts, percentages, cumulative
 * percentages) of a single variable. Missing values are kept as a category
 * (dropna=False) but emitted as JSON null: json.dumps would otherwise render
 * them as the bare token NaN, which JSON.parse on the JS side rejects.
 */
const FREQUENCIES_PY = `
import json
import math
import pandas as pd

def run_frequencies(data_json, variable):
    df = pd.DataFrame(json.loads(data_json))
    series = df[variable]
    total = len(series)

    counts = series.value_counts(dropna=False)
    pcts = series.value_counts(normalize=True, dropna=False) * 100

    freqs = []
    cum_pct = 0
    # Iterate counts and percentages in lockstep: label-based lookup
    # (counts[val]) is unreliable when the index contains NaN.
    for (val, count), pct in zip(counts.items(), pcts):
        count = int(count)
        pct = float(pct)
        cum_pct += pct
        if isinstance(val, float) and math.isnan(val):
            out_val = None  # JSON-safe representation of a missing value
        elif isinstance(val, (int, float)):
            out_val = val
        else:
            out_val = str(val)
        freqs.append({
            'value': out_val,
            'count': count,
            'percentage': round(pct, 4),
            'cumulativePercentage': round(cum_pct, 4)
        })

    return json.dumps({
        'variable': variable,
        'totalCount': total,
        'frequencies': freqs
    })
`;
39
/**
 * Python source for descriptive statistics (count, mean, std, min/max,
 * quartiles, skewness, kurtosis) over a list of numeric variables.
 * Non-finite statistics (std of a single observation, skew of a constant
 * column, or an all-missing column) are emitted as JSON null instead of the
 * bare NaN token, which JSON.parse on the JS side would reject.
 */
const DESCRIPTIVES_PY = `
import json
import math
import pandas as pd
from scipy import stats as sp_stats

def _safe_round(x, digits=6):
    # json.dumps renders NaN/inf as non-standard tokens; map them to null.
    x = float(x)
    return round(x, digits) if math.isfinite(x) else None

def run_descriptives(data_json, variables_json):
    df = pd.DataFrame(json.loads(data_json))
    variables = json.loads(variables_json)

    results = []
    for var in variables:
        col = pd.to_numeric(df[var], errors='coerce').dropna()
        if len(col) == 0:
            # No usable observations: report count 0 and null statistics
            # rather than crashing or emitting invalid JSON.
            results.append({
                'variable': var, 'count': 0, 'mean': None, 'std': None,
                'min': None, 'max': None, 'q25': None, 'q50': None,
                'q75': None, 'skewness': None, 'kurtosis': None
            })
            continue
        desc = col.describe()
        results.append({
            'variable': var,
            'count': int(desc['count']),
            'mean': _safe_round(desc['mean']),
            'std': _safe_round(desc['std']),
            'min': _safe_round(desc['min']),
            'max': _safe_round(desc['max']),
            'q25': _safe_round(desc['25%']),
            'q50': _safe_round(desc['50%']),
            'q75': _safe_round(desc['75%']),
            'skewness': _safe_round(sp_stats.skew(col)),
            'kurtosis': _safe_round(sp_stats.kurtosis(col))
        })

    return json.dumps({'statistics': results})
`;
68
/**
 * Python source for a two-way contingency table with a chi-square test of
 * independence and Cramer's V effect size. Each cell reports observed and
 * expected counts plus row/column/total percentages.
 * NOTE(review): rows with NaN in either variable are silently excluded by
 * pd.crosstab's defaults — confirm that is the intended treatment.
 */
const CROSSTABS_PY = `
import json
import pandas as pd
from scipy.stats import chi2_contingency
import numpy as np

def run_crosstabs(data_json, row_variable, col_variable):
    df = pd.DataFrame(json.loads(data_json))

    ct = pd.crosstab(df[row_variable], df[col_variable])
    chi2, p, dof, expected = chi2_contingency(ct)

    n = ct.values.sum()
    k = min(ct.shape) - 1
    cramers_v = float(np.sqrt(chi2 / (n * k))) if k > 0 else 0

    row_labels = [str(x) for x in ct.index.tolist()]
    col_labels = [str(x) for x in ct.columns.tolist()]

    table = []
    row_sums = ct.sum(axis=1)
    col_sums = ct.sum(axis=0)
    total = ct.values.sum()

    for i, rl in enumerate(row_labels):
        for j, cl in enumerate(col_labels):
            obs = int(ct.iloc[i, j])
            exp = float(expected[i, j])
            table.append({
                'row': rl,
                'col': cl,
                'observed': obs,
                'expected': round(exp, 4),
                'rowPercentage': round(obs / float(row_sums.iloc[i]) * 100, 4) if row_sums.iloc[i] > 0 else 0,
                'colPercentage': round(obs / float(col_sums.iloc[j]) * 100, 4) if col_sums.iloc[j] > 0 else 0,
                'totalPercentage': round(obs / float(total) * 100, 4) if total > 0 else 0
            })

    return json.dumps({
        'rowVariable': row_variable,
        'colVariable': col_variable,
        'table': table,
        'rowLabels': row_labels,
        'colLabels': col_labels,
        'chiSquare': round(float(chi2), 6),
        'degreesOfFreedom': int(dof),
        'pValue': float(p),
        'cramersV': round(cramers_v, 6)
    })
`;
118
+
119
+ /**
120
+ * Python code for compare means functions.
121
+ */
122
/**
 * Python source for an independent-samples t-test. Runs Levene's test for
 * equality of variances, then reports BOTH the pooled-variance t-test and
 * Welch's unequal-variance t-test (with the Welch-Satterthwaite df), each
 * with a 95% confidence interval for the mean difference and per-group
 * descriptives. The consumer selects which result to display based on
 * leveneTest.equalVariance (Levene p > 0.05).
 */
const TTEST_INDEPENDENT_PY = `
import json
import pandas as pd
import numpy as np
from scipy import stats

def run_ttest_independent(data_json, variable, group_variable, group1_value, group2_value):
    df = pd.DataFrame(json.loads(data_json))

    g1 = pd.to_numeric(df[df[group_variable] == group1_value][variable], errors='coerce').dropna()
    g2 = pd.to_numeric(df[df[group_variable] == group2_value][variable], errors='coerce').dropna()

    # Levene's test for equality of variances
    levene_stat, levene_p = stats.levene(g1, g2)
    equal_var = levene_p > 0.05

    # T-test with equal variance
    t_eq, p_eq = stats.ttest_ind(g1, g2, equal_var=True)
    # T-test with unequal variance (Welch's)
    t_uneq, p_uneq = stats.ttest_ind(g1, g2, equal_var=False)

    mean_diff = float(g1.mean() - g2.mean())

    # Degrees of freedom
    df_eq = len(g1) + len(g2) - 2

    # Welch df
    s1_sq = g1.var(ddof=1)
    s2_sq = g2.var(ddof=1)
    n1, n2 = len(g1), len(g2)
    num = (s1_sq/n1 + s2_sq/n2)**2
    denom = (s1_sq/n1)**2/(n1-1) + (s2_sq/n2)**2/(n2-1)
    df_welch = float(num/denom)

    # Confidence intervals
    se_eq = float(np.sqrt(((n1-1)*s1_sq + (n2-1)*s2_sq)/(n1+n2-2) * (1/n1 + 1/n2)))
    se_uneq = float(np.sqrt(s1_sq/n1 + s2_sq/n2))

    ci_eq = stats.t.interval(0.95, df_eq, loc=mean_diff, scale=se_eq)
    ci_uneq = stats.t.interval(0.95, df_welch, loc=mean_diff, scale=se_uneq)

    def make_result(t_stat, df_val, p_val, ci):
        return {
            'tStatistic': round(float(t_stat), 6),
            'degreesOfFreedom': round(float(df_val), 6),
            'pValue': float(p_val),
            'meanDifference': round(mean_diff, 6),
            'confidenceInterval': [round(float(ci[0]), 6), round(float(ci[1]), 6)],
            'group1Mean': round(float(g1.mean()), 6),
            'group1Std': round(float(g1.std(ddof=1)), 6),
            'group1N': n1,
            'group2Mean': round(float(g2.mean()), 6),
            'group2Std': round(float(g2.std(ddof=1)), 6),
            'group2N': n2
        }

    return json.dumps({
        'leveneTest': {
            'statistic': round(float(levene_stat), 6),
            'pValue': float(levene_p),
            'equalVariance': bool(equal_var)
        },
        'equalVariance': make_result(t_eq, df_eq, p_eq, ci_eq),
        'unequalVariance': make_result(t_uneq, df_welch, p_uneq, ci_uneq)
    })
`;
188
/**
 * Python source for a paired-samples t-test on two variables. Each variable
 * is coerced to numeric and missing values dropped independently, then the
 * pairs are aligned on the DataFrame index — only indices present in both
 * series are compared. Reports the t statistic, df = n-1, mean/std of the
 * differences and a 95% confidence interval for the mean difference.
 */
const TTEST_PAIRED_PY = `
import json
import pandas as pd
import numpy as np
from scipy import stats

def run_ttest_paired(data_json, variable1, variable2):
    df = pd.DataFrame(json.loads(data_json))

    v1 = pd.to_numeric(df[variable1], errors='coerce').dropna()
    v2 = pd.to_numeric(df[variable2], errors='coerce').dropna()

    # Align by index
    common = v1.index.intersection(v2.index)
    v1 = v1.loc[common]
    v2 = v2.loc[common]

    diff = v1 - v2
    n = len(diff)

    t_stat, p_val = stats.ttest_rel(v1, v2)

    mean_diff = float(diff.mean())
    std_diff = float(diff.std(ddof=1))
    se = std_diff / np.sqrt(n)
    ci = stats.t.interval(0.95, n-1, loc=mean_diff, scale=se)

    return json.dumps({
        'tStatistic': round(float(t_stat), 6),
        'degreesOfFreedom': n - 1,
        'pValue': float(p_val),
        'meanDifference': round(mean_diff, 6),
        'stdDifference': round(std_diff, 6),
        'confidenceInterval': [round(float(ci[0]), 6), round(float(ci[1]), 6)],
        'mean1': round(float(v1.mean()), 6),
        'mean2': round(float(v2.mean()), 6),
        'n': n
    })
`;
227
/**
 * Python source for one-way ANOVA. Uses scipy's f_oneway for the omnibus
 * F test, then recomputes the full decomposition (between/within sums of
 * squares, mean squares) and eta-squared, plus per-group n/mean/std.
 * Groups with no valid numeric observations are excluded from the test.
 */
const ANOVA_ONEWAY_PY = `
import json
import pandas as pd
import numpy as np
from scipy import stats

def run_anova_oneway(data_json, variable, group_variable):
    df = pd.DataFrame(json.loads(data_json))

    groups = df.groupby(group_variable)[variable].apply(
        lambda x: pd.to_numeric(x, errors='coerce').dropna().tolist()
    )

    group_arrays = [np.array(g) for g in groups.values if len(g) > 0]
    group_names = [str(name) for name, g in zip(groups.index, groups.values) if len(g) > 0]

    f_stat, p_val = stats.f_oneway(*group_arrays)

    # Compute detailed ANOVA table
    grand_mean = np.concatenate(group_arrays).mean()
    n_total = sum(len(g) for g in group_arrays)
    k = len(group_arrays)

    ss_between = sum(len(g) * (g.mean() - grand_mean)**2 for g in group_arrays)
    ss_within = sum(((g - g.mean())**2).sum() for g in group_arrays)

    df_between = k - 1
    df_within = n_total - k

    ms_between = ss_between / df_between
    ms_within = ss_within / df_within

    ss_total = ss_between + ss_within
    eta_sq = ss_between / ss_total if ss_total > 0 else 0

    group_stats = []
    for name, arr in zip(group_names, group_arrays):
        group_stats.append({
            'group': name,
            'n': len(arr),
            'mean': round(float(arr.mean()), 6),
            'std': round(float(arr.std(ddof=1)), 6)
        })

    return json.dumps({
        'fStatistic': round(float(f_stat), 6),
        'pValue': float(p_val),
        'degreesOfFreedomBetween': df_between,
        'degreesOfFreedomWithin': df_within,
        'sumOfSquaresBetween': round(float(ss_between), 6),
        'sumOfSquaresWithin': round(float(ss_within), 6),
        'meanSquareBetween': round(float(ms_between), 6),
        'meanSquareWithin': round(float(ms_within), 6),
        'groupStats': group_stats,
        'etaSquared': round(float(eta_sq), 6)
    })
`;
284
/**
 * Python source for Tukey HSD post-hoc pairwise comparisons following an
 * ANOVA (statsmodels pairwise_tukeyhsd). Comparison rows are read from the
 * summary table; its first row is the header and is skipped by the i+1
 * offset. NOTE(review): summary column 3 is the adjusted p-value (p-adj),
 * reported here under 'pValue'.
 */
const POSTHOC_TUKEY_PY = `
import json
import pandas as pd
import numpy as np
from statsmodels.stats.multicomp import pairwise_tukeyhsd

def run_posthoc_tukey(data_json, variable, group_variable, alpha=0.05):
    df = pd.DataFrame(json.loads(data_json))

    df[variable] = pd.to_numeric(df[variable], errors='coerce')
    df = df.dropna(subset=[variable])

    result = pairwise_tukeyhsd(df[variable], df[group_variable], alpha=alpha)

    comparisons = []
    for i in range(len(result.summary().data) - 1):
        row = result.summary().data[i + 1]
        comparisons.append({
            'group1': str(row[0]),
            'group2': str(row[1]),
            'meanDifference': round(float(row[2]), 6),
            'pValue': round(float(row[3]), 6),
            'lowerCI': round(float(row[4]), 6),
            'upperCI': round(float(row[5]), 6),
            'reject': bool(row[6])
        })

    return json.dumps({
        'comparisons': comparisons,
        'alpha': alpha
    })
`;
316
+
317
+ /**
318
+ * Python code for regression analysis functions.
319
+ */
320
/**
 * Python source for OLS linear regression (statsmodels). Rows with a
 * missing value in the dependent or any independent variable are dropped
 * listwise; an intercept is added when add_constant is true. Reports each
 * coefficient with std error, t statistic, p-value and 95% CI, model fit
 * (R², adjusted R², F test), residual std error, and the Durbin-Watson
 * statistic of the residuals (autocorrelation diagnostic).
 */
const LINEAR_REGRESSION_PY = `
import json
import pandas as pd
import numpy as np
import statsmodels.api as sm

def run_linear_regression(data_json, dependent, independents_json, add_constant=True):
    df = pd.DataFrame(json.loads(data_json))
    independents = json.loads(independents_json)

    y = pd.to_numeric(df[dependent], errors='coerce')
    X = df[independents].apply(pd.to_numeric, errors='coerce')

    mask = y.notna() & X.notna().all(axis=1)
    y = y[mask]
    X = X[mask]

    if add_constant:
        X = sm.add_constant(X)

    model = sm.OLS(y, X).fit()

    coefficients = []
    for i, name in enumerate(model.params.index):
        ci = model.conf_int().iloc[i]
        coefficients.append({
            'variable': str(name),
            'coefficient': round(float(model.params.iloc[i]), 6),
            'stdError': round(float(model.bse.iloc[i]), 6),
            'tStatistic': round(float(model.tvalues.iloc[i]), 6),
            'pValue': float(model.pvalues.iloc[i]),
            'confidenceInterval': [round(float(ci[0]), 6), round(float(ci[1]), 6)]
        })

    dw = float(sm.stats.stattools.durbin_watson(model.resid))

    return json.dumps({
        'rSquared': round(float(model.rsquared), 6),
        'adjustedRSquared': round(float(model.rsquared_adj), 6),
        'fStatistic': round(float(model.fvalue), 6),
        'fPValue': float(model.f_pvalue),
        'coefficients': coefficients,
        'residualStdError': round(float(np.sqrt(model.mse_resid)), 6),
        'observations': int(model.nobs),
        'degreesOfFreedom': int(model.df_resid),
        'durbinWatson': round(dw, 6)
    })
`;
368
/**
 * Python source for binary logistic regression (statsmodels Logit).
 * Listwise deletion of missing values; optional intercept. Coefficients
 * are reported with odds ratios (exp(coef)) and 95% CIs, plus model-level
 * fit: McFadden pseudo-R², log-likelihood, LLR p-value, AIC/BIC, and
 * whether the MLE optimizer converged.
 * NOTE(review): the dependent variable is coerced to numeric and assumed
 * to be coded 0/1 — confirm upstream validation.
 */
const LOGISTIC_BINARY_PY = `
import json
import pandas as pd
import numpy as np
import statsmodels.api as sm

def run_logistic_binary(data_json, dependent, independents_json, add_constant=True):
    df = pd.DataFrame(json.loads(data_json))
    independents = json.loads(independents_json)

    y = pd.to_numeric(df[dependent], errors='coerce')
    X = df[independents].apply(pd.to_numeric, errors='coerce')

    mask = y.notna() & X.notna().all(axis=1)
    y = y[mask]
    X = X[mask]

    if add_constant:
        X = sm.add_constant(X)

    model = sm.Logit(y, X).fit(disp=0)

    coefficients = []
    ci = model.conf_int()
    for i, name in enumerate(model.params.index):
        coef = float(model.params.iloc[i])
        coefficients.append({
            'variable': str(name),
            'coefficient': round(coef, 6),
            'stdError': round(float(model.bse.iloc[i]), 6),
            'zStatistic': round(float(model.tvalues.iloc[i]), 6),
            'pValue': float(model.pvalues.iloc[i]),
            'oddsRatio': round(float(np.exp(coef)), 6),
            'confidenceInterval': [round(float(ci.iloc[i, 0]), 6), round(float(ci.iloc[i, 1]), 6)]
        })

    return json.dumps({
        'coefficients': coefficients,
        'pseudoRSquared': round(float(model.prsquared), 6),
        'logLikelihood': round(float(model.llf), 6),
        'llrPValue': float(model.llr_pvalue),
        'aic': round(float(model.aic), 6),
        'bic': round(float(model.bic), 6),
        'observations': int(model.nobs),
        'convergence': bool(model.mle_retvals['converged'])
    })
`;
415
/**
 * Python source for multinomial logistic regression via scikit-learn.
 * Coefficients are reported as log-odds relative to a reference category
 * (default: first sorted category). Note the known limitations preserved
 * from the original contract: stdError/zStatistic/pValue/confidenceInterval
 * are placeholder zeros (sklearn does not estimate them), llrPValue is 0.0,
 * and 'pseudoRSquared' is actually the mean classification accuracy.
 * Removed dead code from the previous version (unused statsmodels import,
 * y_coded/y_dummies/X_const were computed and never used).
 */
const LOGISTIC_MULTINOMIAL_PY = `
import json
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

def run_logistic_multinomial(data_json, dependent, independents_json, reference_category=None):
    df = pd.DataFrame(json.loads(data_json))
    independents = json.loads(independents_json)

    X = df[independents].apply(pd.to_numeric, errors='coerce')
    y = df[dependent]

    mask = X.notna().all(axis=1) & y.notna()
    X = X[mask]
    y = y[mask]

    # Encode categories; default reference is the first sorted category.
    categories = sorted(y.unique().tolist(), key=str)
    if reference_category is not None:
        ref = str(reference_category)
    else:
        ref = str(categories[0])

    le_map = {str(c): i for i, c in enumerate(categories)}
    y_numeric = y.map(lambda x: le_map[str(x)])

    model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
    model.fit(X, y_numeric)

    ref_idx = le_map[ref]
    non_ref_cats = [c for c in categories if str(c) != ref]

    coefficients = []
    for cat in non_ref_cats:
        cat_idx = le_map[str(cat)]
        for j, var_name in enumerate(independents):
            # Difference of per-class coefficients = log-odds vs. the reference.
            coef = float(model.coef_[cat_idx][j] - model.coef_[ref_idx][j])
            coefficients.append({
                'category': str(cat),
                'variable': var_name,
                'coefficient': round(coef, 6),
                'stdError': 0.0,
                'zStatistic': 0.0,
                'pValue': 0.0,
                'oddsRatio': round(float(np.exp(coef)), 6),
                'confidenceInterval': [0.0, 0.0]
            })

    # Log-likelihood of the fitted model (epsilon guards log(0)).
    proba = model.predict_proba(X)
    ll = float(np.sum(np.log(proba[np.arange(len(y_numeric)), y_numeric] + 1e-10)))
    n_params = len(non_ref_cats) * (len(independents) + 1)
    aic = -2 * ll + 2 * n_params
    bic_val = -2 * ll + n_params * np.log(len(y))

    return json.dumps({
        'coefficients': coefficients,
        'pseudoRSquared': round(float(model.score(X, y_numeric)), 6),
        'logLikelihood': round(ll, 6),
        'llrPValue': 0.0,
        'aic': round(float(aic), 6),
        'bic': round(float(bic_val), 6),
        'categories': [str(c) for c in categories],
        'referenceCategory': ref,
        'observations': int(len(y))
    })
`;
490
+
491
+ /**
492
+ * Python code for classification/clustering functions.
493
+ */
494
/**
 * Python source for k-means clustering. Variables are standardized
 * (z-scores) before fitting; cluster centers are transformed back to the
 * original scale for reporting. Returns per-row labels, centers, inertia,
 * iteration count, and cluster sizes.
 * NOTE(review): rows with missing values are dropped before fitting, so
 * 'labels' aligns with the retained rows only, not the original row order —
 * verify the consumer accounts for this.
 */
const KMEANS_PY = `
import json
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

def run_kmeans(data_json, variables_json, k, max_iterations=300, random_state=42):
    df = pd.DataFrame(json.loads(data_json))
    variables = json.loads(variables_json)

    X = df[variables].apply(pd.to_numeric, errors='coerce').dropna()

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    model = KMeans(n_clusters=k, max_iter=max_iterations, random_state=random_state, n_init=10)
    labels = model.fit_predict(X_scaled)

    # Transform centers back to original scale
    centers_original = scaler.inverse_transform(model.cluster_centers_)

    centers = []
    for i in range(k):
        center = {}
        for j, var in enumerate(variables):
            center[var] = round(float(centers_original[i, j]), 6)
        centers.append({'cluster': i, 'center': center})

    unique, counts = np.unique(labels, return_counts=True)
    cluster_sizes = {int(u): int(c) for u, c in zip(unique, counts)}

    return json.dumps({
        'labels': [int(l) for l in labels],
        'centers': centers,
        'inertia': round(float(model.inertia_), 6),
        'iterations': int(model.n_iter_),
        'clusterSizes': cluster_sizes
    })
`;
534
/**
 * Python source for agglomerative hierarchical clustering (scipy linkage)
 * on standardized variables. Flat clusters are cut either at a fixed
 * cluster count (maxclust) or a distance threshold; defaults to 3 clusters
 * when neither is given. Labels are shifted to be 0-indexed. Also returns
 * the full linkage matrix and dendrogram plot coordinates truncated to at
 * most 30 leaves.
 */
const HIERARCHICAL_CLUSTER_PY = `
import json
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from sklearn.preprocessing import StandardScaler

def run_hierarchical_cluster(data_json, variables_json, method='ward', metric='euclidean', n_clusters=None, distance_threshold=None):
    df = pd.DataFrame(json.loads(data_json))
    variables = json.loads(variables_json)

    X = df[variables].apply(pd.to_numeric, errors='coerce').dropna()

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    Z = linkage(X_scaled, method=method, metric=metric)

    if n_clusters is not None:
        labels = fcluster(Z, t=n_clusters, criterion='maxclust')
    elif distance_threshold is not None:
        labels = fcluster(Z, t=distance_threshold, criterion='distance')
    else:
        labels = fcluster(Z, t=3, criterion='maxclust')

    labels = labels - 1  # 0-indexed

    unique, counts = np.unique(labels, return_counts=True)
    cluster_sizes = {int(u): int(c) for u, c in zip(unique, counts)}

    # Dendrogram data (truncated for large datasets)
    trunc = min(30, len(X_scaled))
    dend = dendrogram(Z, truncate_mode='lastp', p=trunc, no_plot=True)

    return json.dumps({
        'labels': [int(l) for l in labels],
        'nClusters': len(unique),
        'linkageMatrix': [[round(float(x), 6) for x in row] for row in Z.tolist()],
        'clusterSizes': cluster_sizes,
        'dendrogramData': {
            'icoord': [[round(float(x), 4) for x in row] for row in dend['icoord']],
            'dcoord': [[round(float(x), 4) for x in row] for row in dend['dcoord']],
            'leaves': [int(x) for x in dend['leaves']]
        }
    })
`;
580
+
581
+ /**
582
+ * Python code for dimension reduction functions.
583
+ */
584
/**
 * Python source for exploratory factor analysis (factor_analyzer) with KMO
 * and Bartlett sphericity adequacy tests. Fixes a runtime crash: the
 * previous version unpacked FactorAnalyzer.get_factor_variance() into two
 * names, but it returns a 3-tuple (SS loadings, proportion of variance,
 * cumulative variance), so the unpack raised ValueError on every call.
 * 'variance' now reports the per-factor proportion of variance and
 * 'cumulativeVariance' the library's cumulative series.
 */
const EFA_PY = `
import json
import pandas as pd
import numpy as np
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo

def run_efa(data_json, variables_json, n_factors, rotation='varimax', method='minres'):
    df = pd.DataFrame(json.loads(data_json))
    variables = json.loads(variables_json)

    X = df[variables].apply(pd.to_numeric, errors='coerce').dropna()

    # Sampling adequacy (KMO) and Bartlett's test of sphericity
    kmo_all, kmo_model = calculate_kmo(X)
    chi2, p_value = calculate_bartlett_sphericity(X)

    fa = FactorAnalyzer(n_factors=n_factors, rotation=rotation, method=method)
    fa.fit(X)

    loadings = fa.loadings_
    loadings_dict = {}
    for i, var in enumerate(variables):
        loadings_dict[var] = [round(float(x), 6) for x in loadings[i]]

    # get_factor_variance() -> (SS loadings, proportional variance, cumulative variance)
    ss_loadings, prop_var, cum_var = fa.get_factor_variance()

    communalities = fa.get_communalities()
    uniquenesses = fa.get_uniquenesses()

    comm_dict = {}
    uniq_dict = {}
    for i, var in enumerate(variables):
        comm_dict[var] = round(float(communalities[i]), 6)
        uniq_dict[var] = round(float(uniquenesses[i]), 6)

    # Eigenvalues of the original correlation matrix
    eigenvalues = fa.get_eigenvalues()[0]

    return json.dumps({
        'loadings': loadings_dict,
        'eigenvalues': [round(float(x), 6) for x in eigenvalues],
        'variance': [round(float(x), 6) for x in prop_var],
        'cumulativeVariance': [round(float(x), 6) for x in cum_var],
        'communalities': comm_dict,
        'uniquenesses': uniq_dict,
        'nFactors': n_factors,
        'rotation': rotation,
        'kmo': round(float(kmo_model), 6),
        'bartlettChi2': round(float(chi2), 6),
        'bartlettPValue': float(p_value)
    })
`;
636
/**
 * Python source for principal component analysis (scikit-learn). Optional
 * standardization; defaults to min(n_variables, n_rows) components.
 * 'loadings' maps each variable to its weight in every component — since
 * components_ is (n_components, n_features), column i is variable i.
 */
const PCA_PY = `
import json
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def run_pca(data_json, variables_json, n_components=None, standardize=True):
    df = pd.DataFrame(json.loads(data_json))
    variables = json.loads(variables_json)

    X = df[variables].apply(pd.to_numeric, errors='coerce').dropna()

    if standardize:
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
    else:
        X_scaled = X.values

    if n_components is None:
        n_components = min(len(variables), len(X_scaled))

    pca = PCA(n_components=n_components)
    transformed = pca.fit_transform(X_scaled)

    loadings = {}
    for i, var in enumerate(variables):
        loadings[var] = [round(float(x), 6) for x in pca.components_[:, i]]

    cum_var = np.cumsum(pca.explained_variance_ratio_)

    return json.dumps({
        'components': [[round(float(x), 6) for x in row] for row in transformed.tolist()],
        'explainedVariance': [round(float(x), 6) for x in pca.explained_variance_],
        'explainedVarianceRatio': [round(float(x), 6) for x in pca.explained_variance_ratio_],
        'cumulativeVarianceRatio': [round(float(x), 6) for x in cum_var],
        'loadings': loadings,
        'singularValues': [round(float(x), 6) for x in pca.singular_values_],
        'nComponents': n_components
    })
`;
677
/**
 * Python source for multidimensional scaling (scikit-learn MDS) on
 * standardized variables. Returns the embedded coordinates and the final
 * stress value; metric=False gives non-metric MDS.
 * NOTE(review): normalized_stress='auto' requires a recent scikit-learn —
 * confirm the version shipped with the targeted Pyodide release supports it.
 */
const MDS_PY = `
import json
import pandas as pd
import numpy as np
from sklearn.manifold import MDS
from sklearn.preprocessing import StandardScaler

def run_mds(data_json, variables_json, n_components=2, metric=True, max_iterations=300, random_state=42):
    df = pd.DataFrame(json.loads(data_json))
    variables = json.loads(variables_json)

    X = df[variables].apply(pd.to_numeric, errors='coerce').dropna()

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    mds = MDS(n_components=n_components, metric=metric, max_iter=max_iterations, random_state=random_state, normalized_stress='auto')
    coords = mds.fit_transform(X_scaled)

    return json.dumps({
        'coordinates': [[round(float(x), 6) for x in row] for row in coords.tolist()],
        'stress': round(float(mds.stress_), 6),
        'nComponents': n_components
    })
`;
702
+
703
+ /**
704
+ * Python code for scale/reliability functions.
705
+ */
706
/**
 * Python source for Cronbach's alpha reliability analysis. Computes raw
 * alpha from item/total variances, standardized alpha from the mean
 * inter-item correlation, and per-item diagnostics (mean, std, corrected
 * item-total correlation, alpha-if-item-deleted).
 * NOTE(review): assumes at least two items and non-zero total-score
 * variance — a zero total variance would divide by zero; confirm upstream
 * validation guards this.
 */
const CRONBACH_ALPHA_PY = `
import json
import pandas as pd
import numpy as np

def run_cronbach_alpha(data_json, items_json):
    df = pd.DataFrame(json.loads(data_json))
    items = json.loads(items_json)

    X = df[items].apply(pd.to_numeric, errors='coerce').dropna()

    n_items = len(items)
    n_obs = len(X)

    # Compute Cronbach's Alpha
    item_vars = X.var(ddof=1)
    total_var = X.sum(axis=1).var(ddof=1)
    alpha = (n_items / (n_items - 1)) * (1 - item_vars.sum() / total_var)

    # Standardized alpha (using correlation matrix)
    corr_matrix = X.corr()
    mean_r = (corr_matrix.sum().sum() - n_items) / (n_items * (n_items - 1))
    std_alpha = (n_items * mean_r) / (1 + (n_items - 1) * mean_r)

    # Item analysis
    item_analysis = []
    total_score = X.sum(axis=1)

    for item in items:
        item_col = X[item]
        other_items = [i for i in items if i != item]
        other_sum = X[other_items].sum(axis=1)

        # Corrected item-total correlation
        citc = float(item_col.corr(other_sum))

        # Alpha if item deleted
        if len(other_items) > 1:
            sub_X = X[other_items]
            sub_vars = sub_X.var(ddof=1)
            sub_total_var = sub_X.sum(axis=1).var(ddof=1)
            k = len(other_items)
            alpha_deleted = (k / (k - 1)) * (1 - sub_vars.sum() / sub_total_var)
        else:
            alpha_deleted = 0.0

        item_analysis.append({
            'item': item,
            'itemMean': round(float(item_col.mean()), 6),
            'itemStd': round(float(item_col.std(ddof=1)), 6),
            'correctedItemTotalCorrelation': round(citc, 6),
            'alphaIfItemDeleted': round(float(alpha_deleted), 6)
        })

    return json.dumps({
        'alpha': round(float(alpha), 6),
        'standardizedAlpha': round(float(std_alpha), 6),
        'nItems': n_items,
        'nObservations': n_obs,
        'itemAnalysis': item_analysis,
        'interItemCorrelationMean': round(float(mean_r), 6)
    })
`;
769
+
770
+ /**
771
+ * Web Worker for Pyodide-based statistical analysis.
772
+ * Runs Python code in a WASM sandbox for browser-based computations.
773
+ */
774
// Import Python code strings - these will be inlined by the bundler
// For the worker bundle, we import them directly
// Module-level Pyodide interpreter handle; stays null until initPyodide()
// has completed successfully.
let pyodide = null;
777
/**
 * Post a progress update for a request back to the main thread.
 * @param {string} id - Correlation id of the originating request.
 * @param {string} stage - Pipeline stage, e.g. 'init'.
 * @param {number} progress - Percentage complete (0-100).
 * @param {string} message - Human-readable status line.
 */
function sendProgress(id, stage, progress, message) {
  self.postMessage({
    id,
    type: 'progress',
    progress: { stage, progress, message },
  });
}
788
/**
 * Post a successful analysis result back to the main thread.
 * @param {string} id - Correlation id of the originating request.
 * @param {*} data - Parsed result payload.
 */
function sendResult(id, data) {
  self.postMessage({
    id,
    type: 'result',
    data,
  });
}
799
/**
 * Post an error message back to the main thread.
 * @param {string} id - Correlation id of the originating request.
 * @param {string} error - Human-readable error description.
 */
function sendError(id, error) {
  self.postMessage({
    id,
    type: 'error',
    error,
  });
}
810
/**
 * Deserialize ArrayBuffer to JSON string for Python consumption.
 * Matches the format produced by serializeToBuffer in bridge/serializer.ts:
 * [uint32 LE header length][UTF-8 JSON header][column payloads in header
 * order]. String columns are int32 indices into the column's stringTable;
 * every other column is raw float64.
 * @param {ArrayBuffer} buffer - Serialized column-oriented table.
 * @returns {string} Row-oriented JSON array string ('[]' when empty).
 */
function bufferToJsonString(buffer) {
  const dv = new DataView(buffer);
  const headerLen = dv.getUint32(0, true);
  const headerText = new TextDecoder().decode(new Uint8Array(buffer, 4, headerLen));
  const header = JSON.parse(headerText);
  if (header.rowCount === 0) {
    return '[]';
  }
  const { rowCount, columns } = header;

  // Decode each column into a plain JS array, advancing a byte cursor.
  let cursor = 4 + headerLen;
  const colValues = new Map();
  for (const col of columns) {
    if (col.dtype === 'string') {
      const nBytes = rowCount * 4;
      // .slice() copies onto an aligned buffer before creating the view.
      const idx = new Int32Array(new Uint8Array(buffer, cursor, nBytes).slice().buffer);
      const decoded = [];
      for (let i = 0; i < rowCount; i++) {
        decoded.push(col.stringTable[idx[i]]);
      }
      colValues.set(col.name, decoded);
      cursor += nBytes;
    } else {
      const nBytes = rowCount * 8;
      const floats = new Float64Array(new Uint8Array(buffer, cursor, nBytes).slice().buffer);
      colValues.set(col.name, Array.from(floats));
      cursor += nBytes;
    }
  }

  // Pivot the column arrays into row objects, preserving column order.
  const rows = Array.from({ length: rowCount }, (_, i) => {
    const row = {};
    for (const col of columns) {
      row[col.name] = colValues.get(col.name)[i];
    }
    return row;
  });
  return JSON.stringify(rows);
}
855
/**
 * Initialize Pyodide with required packages.
 * Loads the Pyodide WASM runtime (from pyodideUrl or the pinned jsDelivr
 * CDN build) and installs pandas/scipy, statsmodels, scikit-learn and
 * factor_analyzer via micropip, reporting coarse progress (6 steps) back
 * to the main thread. Posts { initialized: true } on success or an error
 * message on failure; never throws to the caller.
 * @param {string} id - Correlation id of the init request.
 * @param {string} [pyodideUrl] - Optional base URL of a Pyodide distribution.
 */
async function initPyodide(id, pyodideUrl) {
  const totalSteps = 6;
  let currentStep = 0;
  // Advance the step counter and report percent-complete with a message.
  const reportStep = (message) => {
    currentStep++;
    sendProgress(id, 'init', Math.round((currentStep / totalSteps) * 100), message);
  };
  try {
    // Step 1: Load Pyodide core
    sendProgress(id, 'init', 0, 'Loading Pyodide WASM runtime...');
    // Try to load Pyodide - it should be available via importScripts or already loaded
    if (typeof loadPyodide === 'undefined') {
      const pyodideCdnUrl = pyodideUrl || 'https://cdn.jsdelivr.net/pyodide/v0.27.5/full/';
      importScripts(pyodideCdnUrl + 'pyodide.js');
    }
    pyodide = await loadPyodide({
      indexURL: pyodideUrl || 'https://cdn.jsdelivr.net/pyodide/v0.27.5/full/',
    });
    reportStep('Pyodide runtime loaded successfully');
    // Step 2: Install micropip
    await pyodide.loadPackagesFromImports('import micropip', {
      messageCallback: (msg) => {
        sendProgress(id, 'init', Math.round((currentStep / totalSteps) * 100), `micropip: ${msg}`);
      }
    });
    reportStep('micropip package manager ready');
    // Step 3: Install pandas and scipy
    await pyodide.runPythonAsync(`
import micropip
await micropip.install(['pandas', 'scipy'])
`);
    reportStep('pandas and scipy installed');
    // Step 4: Install statsmodels
    await pyodide.runPythonAsync(`
import micropip
await micropip.install('statsmodels')
`);
    reportStep('statsmodels installed');
    // Step 5: Install scikit-learn
    await pyodide.runPythonAsync(`
import micropip
await micropip.install('scikit-learn')
`);
    reportStep('scikit-learn installed');
    // Step 6: Install factor_analyzer
    await pyodide.runPythonAsync(`
import micropip
await micropip.install('factor_analyzer')
`);
    reportStep('factor_analyzer installed - all packages ready');
    sendResult(id, { initialized: true });
  }
  catch (err) {
    sendError(id, `Initialization failed: ${err instanceof Error ? err.message : String(err)}`);
  }
}
914
/**
 * Run a Python analysis function.
 * Handles proxy cleanup to prevent memory leaks.
 *
 * Executes `pythonCode` (which defines `functionName`) in the interpreter,
 * builds a Python call expression from `args`, JSON-parses the returned
 * string and posts it to the main thread. Python-side temporaries are
 * deleted and garbage-collected afterwards, even on failure.
 *
 * @param {string} id - Correlation id of the request.
 * @param {string} pythonCode - Python source defining the target function.
 * @param {string} functionName - Name of the Python function to call.
 * @param {string[]} args - Arguments as strings; numeric-looking strings
 *   and True/False/None are inlined as raw Python literals, everything
 *   else is escaped and single-quoted as a Python string.
 */
async function runAnalysis(id, pythonCode, functionName, args) {
  if (!pyodide) {
    sendError(id, 'Pyodide is not initialized. Call init() first.');
    return;
  }
  try {
    // Load the Python function
    await pyodide.runPythonAsync(pythonCode);
    // Build the function call
    const argsStr = args.map(a => {
      // If it looks like a raw Python expression (number, bool), pass as-is
      if (/^[-+]?\d+(\.\d+)?$/.test(a) || a === 'True' || a === 'False' || a === 'None') {
        return a;
      }
      // Otherwise, wrap as a Python string
      // Escape backslashes and single quotes
      // NOTE(review): an argument that is itself a purely numeric string
      // (e.g. a variable named "123") would be inlined unquoted — confirm
      // callers never pass such names.
      const escaped = a.replace(/\\/g, '\\\\').replace(/'/g, "\\'");
      return `'${escaped}'`;
    }).join(', ');
    const callCode = `
import gc
_result = ${functionName}(${argsStr})
_result
`;
    const result = await pyodide.runPythonAsync(callCode);
    // Parse the JSON result from Python
    const resultStr = String(result);
    const parsed = JSON.parse(resultStr);
    // Cleanup Python memory
    await pyodide.runPythonAsync(`
del _result
gc.collect()
`);
    sendResult(id, parsed);
  }
  catch (err) {
    // Attempt cleanup even on error
    try {
      await pyodide.runPythonAsync('import gc; gc.collect()');
    }
    catch {
      // Ignore cleanup errors
    }
    sendError(id, `Analysis failed: ${err instanceof Error ? err.message : String(err)}`);
  }
}
964
/**
 * Main-thread request dispatcher.
 *
 * Every message carries a request id, an analysis type, optional params
 * and an optional ArrayBuffer payload holding the dataset as JSON text.
 * The dataset is decoded once up front; 'init' boots Pyodide, every other
 * known type is looked up in a builder table that lazily produces the
 * (python source, entry point, argument list) triple for runAnalysis.
 * Unknown types and unexpected failures are reported back with sendError.
 */
self.onmessage = async (event) => {
  const { id, type, payload, params } = event.data;
  try {
    // Decode the dataset: prefer the transferred ArrayBuffer, otherwise
    // fall back to inline params.data; default is an empty JSON array.
    let dataJson = '[]';
    if (payload && payload instanceof ArrayBuffer && payload.byteLength > 0) {
      dataJson = bufferToJsonString(payload);
    }
    else if (params?.data) {
      dataJson = JSON.stringify(params.data);
    }
    if (type === 'init') {
      await initPyodide(id, params?.pyodideUrl);
      return;
    }
    // Small renderers for the argument conventions used below.
    const str = (v) => String(v ?? '');                   // required string; '' when absent
    const list = (v) => JSON.stringify(v ?? []);          // JSON array of variable names
    const flag = (v) => (v !== false ? 'True' : 'False'); // default-true boolean
    const opt = (v) => (v != null ? String(v) : 'None');  // optional value -> Python None
    const p = params;
    // type -> () => [python source, function name, argument strings];
    // thunks keep per-case argument evaluation lazy, like the old switch.
    const builders = {
      // === Descriptive Statistics ===
      frequencies: () => [FREQUENCIES_PY, 'run_frequencies',
        [dataJson, str(p?.variable)]],
      descriptives: () => [DESCRIPTIVES_PY, 'run_descriptives',
        [dataJson, list(p?.variables)]],
      crosstabs: () => [CROSSTABS_PY, 'run_crosstabs',
        [dataJson, str(p?.rowVariable), str(p?.colVariable)]],
      // === Compare Means ===
      ttest_independent: () => [TTEST_INDEPENDENT_PY, 'run_ttest_independent',
        [dataJson, str(p?.variable), str(p?.groupVariable), str(p?.group1Value), str(p?.group2Value)]],
      ttest_paired: () => [TTEST_PAIRED_PY, 'run_ttest_paired',
        [dataJson, str(p?.variable1), str(p?.variable2)]],
      anova_oneway: () => [ANOVA_ONEWAY_PY, 'run_anova_oneway',
        [dataJson, str(p?.variable), str(p?.groupVariable)]],
      posthoc_tukey: () => [POSTHOC_TUKEY_PY, 'run_posthoc_tukey',
        [dataJson, str(p?.variable), str(p?.groupVariable), String(p?.alpha ?? 0.05)]],
      // === Regression ===
      linear_regression: () => [LINEAR_REGRESSION_PY, 'run_linear_regression',
        [dataJson, str(p?.dependentVariable), list(p?.independentVariables), flag(p?.addConstant)]],
      logistic_binary: () => [LOGISTIC_BINARY_PY, 'run_logistic_binary',
        [dataJson, str(p?.dependentVariable), list(p?.independentVariables), flag(p?.addConstant)]],
      logistic_multinomial: () => [LOGISTIC_MULTINOMIAL_PY, 'run_logistic_multinomial',
        [dataJson, str(p?.dependentVariable), list(p?.independentVariables), opt(p?.referenceCategory)]],
      // === Classify ===
      kmeans: () => [KMEANS_PY, 'run_kmeans',
        [dataJson, list(p?.variables), String(p?.k ?? 3), String(p?.maxIterations ?? 300), String(p?.randomState ?? 42)]],
      hierarchical_cluster: () => [HIERARCHICAL_CLUSTER_PY, 'run_hierarchical_cluster',
        [dataJson, list(p?.variables), String(p?.method ?? 'ward'), String(p?.metric ?? 'euclidean'), opt(p?.nClusters), opt(p?.distanceThreshold)]],
      // === Dimension Reduction ===
      efa: () => [EFA_PY, 'run_efa',
        [dataJson, list(p?.variables), String(p?.nFactors ?? 2), String(p?.rotation ?? 'varimax'), String(p?.method ?? 'minres')]],
      pca: () => [PCA_PY, 'run_pca',
        [dataJson, list(p?.variables), opt(p?.nComponents), flag(p?.standardize)]],
      mds: () => [MDS_PY, 'run_mds',
        [dataJson, list(p?.variables), String(p?.nComponents ?? 2), flag(p?.metric), String(p?.maxIterations ?? 300), String(p?.randomState ?? 42)]],
      // === Scale ===
      cronbach_alpha: () => [CRONBACH_ALPHA_PY, 'run_cronbach_alpha',
        [dataJson, list(p?.items)]],
    };
    // Own-property check mirrors the switch's default branch (keys
    // inherited from Object.prototype must not dispatch).
    if (!Object.prototype.hasOwnProperty.call(builders, type)) {
      sendError(id, `Unknown analysis type: ${type}`);
      return;
    }
    const [pythonCode, functionName, args] = builders[type]();
    await runAnalysis(id, pythonCode, functionName, args);
  }
  catch (err) {
    sendError(id, `Worker error: ${err instanceof Error ? err.message : String(err)}`);
  }
};
1122
// Announce to the main thread that the worker script finished loading and
// can accept messages (the main thread should still send 'init' next).
const READY_MESSAGE = { id: '__worker_ready__', type: 'result', data: { ready: true } };
self.postMessage(READY_MESSAGE);
1124
+
1125
+ })();
1126
+ //# sourceMappingURL=stats-worker.js.map