@winm2m/inferential-stats-js 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +712 -0
- package/dist/InferentialStats.d.ts +90 -0
- package/dist/bridge/deserializer.d.ts +15 -0
- package/dist/bridge/index.d.ts +2 -0
- package/dist/bridge/serializer.d.ts +17 -0
- package/dist/index.cjs +453 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.ts +21 -0
- package/dist/index.js +446 -0
- package/dist/index.js.map +1 -0
- package/dist/python/classify.d.ts +5 -0
- package/dist/python/compare-means.d.ts +7 -0
- package/dist/python/descriptive.d.ts +7 -0
- package/dist/python/dimension.d.ts +6 -0
- package/dist/python/index.d.ts +6 -0
- package/dist/python/regression.d.ts +6 -0
- package/dist/python/scale.d.ts +4 -0
- package/dist/stats-worker.js +1126 -0
- package/dist/stats-worker.js.map +1 -0
- package/dist/types/classify.d.ts +37 -0
- package/dist/types/common.d.ts +39 -0
- package/dist/types/compare-means.d.ts +89 -0
- package/dist/types/descriptive.d.ts +60 -0
- package/dist/types/dimension.d.ts +49 -0
- package/dist/types/index.d.ts +7 -0
- package/dist/types/regression.d.ts +77 -0
- package/dist/types/scale.d.ts +19 -0
- package/dist/worker/stats-worker.d.ts +5 -0
- package/package.json +79 -0
|
@@ -0,0 +1,1126 @@
|
|
|
1
|
+
(function () {
|
|
2
|
+
'use strict';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Python code for descriptive statistics functions.
|
|
6
|
+
* These are executed inside Pyodide in the Web Worker.
|
|
7
|
+
*/
|
|
8
|
+
/**
 * Python source for a one-variable frequency table, executed inside Pyodide.
 * The caller invokes run_frequencies(data_json, variable) and receives a JSON
 * string of shape:
 *   { variable, totalCount,
 *     frequencies: [{ value, count, percentage, cumulativePercentage }] }
 *
 * Fix: missing values are included (dropna=False) and NaN is a float, so the
 * original `str(val) if not isinstance(val, (int, float)) else val` kept NaN
 * as-is; json.dumps then emitted bare `NaN`, which JSON.parse on the JS side
 * rejects. Missing categories are now serialized as null.
 */
const FREQUENCIES_PY = `
import json
import pandas as pd

def run_frequencies(data_json, variable):
    df = pd.DataFrame(json.loads(data_json))
    series = df[variable]
    total = len(series)

    counts = series.value_counts(dropna=False)
    pcts = series.value_counts(normalize=True, dropna=False) * 100

    freqs = []
    cum_pct = 0
    for val in counts.index:
        count = int(counts[val])
        pct = float(pcts[val])
        cum_pct += pct
        # NaN must be checked before the numeric branch: it is a float, and
        # json.dumps would emit bare NaN (invalid for JSON.parse). Missing
        # values are reported as null instead.
        freqs.append({
            'value': None if pd.isna(val) else (val if isinstance(val, (int, float)) else str(val)),
            'count': count,
            'percentage': round(pct, 4),
            'cumulativePercentage': round(cum_pct, 4)
        })

    return json.dumps({
        'variable': variable,
        'totalCount': total,
        'frequencies': freqs
    })
`;
|
|
39
|
+
const DESCRIPTIVES_PY = `
|
|
40
|
+
import json
|
|
41
|
+
import pandas as pd
|
|
42
|
+
from scipy import stats as sp_stats
|
|
43
|
+
|
|
44
|
+
def run_descriptives(data_json, variables_json):
|
|
45
|
+
df = pd.DataFrame(json.loads(data_json))
|
|
46
|
+
variables = json.loads(variables_json)
|
|
47
|
+
|
|
48
|
+
results = []
|
|
49
|
+
for var in variables:
|
|
50
|
+
col = pd.to_numeric(df[var], errors='coerce').dropna()
|
|
51
|
+
desc = col.describe()
|
|
52
|
+
results.append({
|
|
53
|
+
'variable': var,
|
|
54
|
+
'count': int(desc['count']),
|
|
55
|
+
'mean': round(float(desc['mean']), 6),
|
|
56
|
+
'std': round(float(desc['std']), 6),
|
|
57
|
+
'min': round(float(desc['min']), 6),
|
|
58
|
+
'max': round(float(desc['max']), 6),
|
|
59
|
+
'q25': round(float(desc['25%']), 6),
|
|
60
|
+
'q50': round(float(desc['50%']), 6),
|
|
61
|
+
'q75': round(float(desc['75%']), 6),
|
|
62
|
+
'skewness': round(float(sp_stats.skew(col)), 6),
|
|
63
|
+
'kurtosis': round(float(sp_stats.kurtosis(col)), 6)
|
|
64
|
+
})
|
|
65
|
+
|
|
66
|
+
return json.dumps({'statistics': results})
|
|
67
|
+
`;
|
|
68
|
+
const CROSSTABS_PY = `
|
|
69
|
+
import json
|
|
70
|
+
import pandas as pd
|
|
71
|
+
from scipy.stats import chi2_contingency
|
|
72
|
+
import numpy as np
|
|
73
|
+
|
|
74
|
+
def run_crosstabs(data_json, row_variable, col_variable):
|
|
75
|
+
df = pd.DataFrame(json.loads(data_json))
|
|
76
|
+
|
|
77
|
+
ct = pd.crosstab(df[row_variable], df[col_variable])
|
|
78
|
+
chi2, p, dof, expected = chi2_contingency(ct)
|
|
79
|
+
|
|
80
|
+
n = ct.values.sum()
|
|
81
|
+
k = min(ct.shape) - 1
|
|
82
|
+
cramers_v = float(np.sqrt(chi2 / (n * k))) if k > 0 else 0
|
|
83
|
+
|
|
84
|
+
row_labels = [str(x) for x in ct.index.tolist()]
|
|
85
|
+
col_labels = [str(x) for x in ct.columns.tolist()]
|
|
86
|
+
|
|
87
|
+
table = []
|
|
88
|
+
row_sums = ct.sum(axis=1)
|
|
89
|
+
col_sums = ct.sum(axis=0)
|
|
90
|
+
total = ct.values.sum()
|
|
91
|
+
|
|
92
|
+
for i, rl in enumerate(row_labels):
|
|
93
|
+
for j, cl in enumerate(col_labels):
|
|
94
|
+
obs = int(ct.iloc[i, j])
|
|
95
|
+
exp = float(expected[i, j])
|
|
96
|
+
table.append({
|
|
97
|
+
'row': rl,
|
|
98
|
+
'col': cl,
|
|
99
|
+
'observed': obs,
|
|
100
|
+
'expected': round(exp, 4),
|
|
101
|
+
'rowPercentage': round(obs / float(row_sums.iloc[i]) * 100, 4) if row_sums.iloc[i] > 0 else 0,
|
|
102
|
+
'colPercentage': round(obs / float(col_sums.iloc[j]) * 100, 4) if col_sums.iloc[j] > 0 else 0,
|
|
103
|
+
'totalPercentage': round(obs / float(total) * 100, 4) if total > 0 else 0
|
|
104
|
+
})
|
|
105
|
+
|
|
106
|
+
return json.dumps({
|
|
107
|
+
'rowVariable': row_variable,
|
|
108
|
+
'colVariable': col_variable,
|
|
109
|
+
'table': table,
|
|
110
|
+
'rowLabels': row_labels,
|
|
111
|
+
'colLabels': col_labels,
|
|
112
|
+
'chiSquare': round(float(chi2), 6),
|
|
113
|
+
'degreesOfFreedom': int(dof),
|
|
114
|
+
'pValue': float(p),
|
|
115
|
+
'cramersV': round(cramers_v, 6)
|
|
116
|
+
})
|
|
117
|
+
`;
|
|
118
|
+
|
|
119
|
+
/**
|
|
120
|
+
* Python code for compare means functions.
|
|
121
|
+
*/
|
|
122
|
+
const TTEST_INDEPENDENT_PY = `
|
|
123
|
+
import json
|
|
124
|
+
import pandas as pd
|
|
125
|
+
import numpy as np
|
|
126
|
+
from scipy import stats
|
|
127
|
+
|
|
128
|
+
def run_ttest_independent(data_json, variable, group_variable, group1_value, group2_value):
|
|
129
|
+
df = pd.DataFrame(json.loads(data_json))
|
|
130
|
+
|
|
131
|
+
g1 = pd.to_numeric(df[df[group_variable] == group1_value][variable], errors='coerce').dropna()
|
|
132
|
+
g2 = pd.to_numeric(df[df[group_variable] == group2_value][variable], errors='coerce').dropna()
|
|
133
|
+
|
|
134
|
+
# Levene's test for equality of variances
|
|
135
|
+
levene_stat, levene_p = stats.levene(g1, g2)
|
|
136
|
+
equal_var = levene_p > 0.05
|
|
137
|
+
|
|
138
|
+
# T-test with equal variance
|
|
139
|
+
t_eq, p_eq = stats.ttest_ind(g1, g2, equal_var=True)
|
|
140
|
+
# T-test with unequal variance (Welch's)
|
|
141
|
+
t_uneq, p_uneq = stats.ttest_ind(g1, g2, equal_var=False)
|
|
142
|
+
|
|
143
|
+
mean_diff = float(g1.mean() - g2.mean())
|
|
144
|
+
|
|
145
|
+
# Degrees of freedom
|
|
146
|
+
df_eq = len(g1) + len(g2) - 2
|
|
147
|
+
|
|
148
|
+
# Welch df
|
|
149
|
+
s1_sq = g1.var(ddof=1)
|
|
150
|
+
s2_sq = g2.var(ddof=1)
|
|
151
|
+
n1, n2 = len(g1), len(g2)
|
|
152
|
+
num = (s1_sq/n1 + s2_sq/n2)**2
|
|
153
|
+
denom = (s1_sq/n1)**2/(n1-1) + (s2_sq/n2)**2/(n2-1)
|
|
154
|
+
df_welch = float(num/denom)
|
|
155
|
+
|
|
156
|
+
# Confidence intervals
|
|
157
|
+
se_eq = float(np.sqrt(((n1-1)*s1_sq + (n2-1)*s2_sq)/(n1+n2-2) * (1/n1 + 1/n2)))
|
|
158
|
+
se_uneq = float(np.sqrt(s1_sq/n1 + s2_sq/n2))
|
|
159
|
+
|
|
160
|
+
ci_eq = stats.t.interval(0.95, df_eq, loc=mean_diff, scale=se_eq)
|
|
161
|
+
ci_uneq = stats.t.interval(0.95, df_welch, loc=mean_diff, scale=se_uneq)
|
|
162
|
+
|
|
163
|
+
def make_result(t_stat, df_val, p_val, ci):
|
|
164
|
+
return {
|
|
165
|
+
'tStatistic': round(float(t_stat), 6),
|
|
166
|
+
'degreesOfFreedom': round(float(df_val), 6),
|
|
167
|
+
'pValue': float(p_val),
|
|
168
|
+
'meanDifference': round(mean_diff, 6),
|
|
169
|
+
'confidenceInterval': [round(float(ci[0]), 6), round(float(ci[1]), 6)],
|
|
170
|
+
'group1Mean': round(float(g1.mean()), 6),
|
|
171
|
+
'group1Std': round(float(g1.std(ddof=1)), 6),
|
|
172
|
+
'group1N': n1,
|
|
173
|
+
'group2Mean': round(float(g2.mean()), 6),
|
|
174
|
+
'group2Std': round(float(g2.std(ddof=1)), 6),
|
|
175
|
+
'group2N': n2
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
return json.dumps({
|
|
179
|
+
'leveneTest': {
|
|
180
|
+
'statistic': round(float(levene_stat), 6),
|
|
181
|
+
'pValue': float(levene_p),
|
|
182
|
+
'equalVariance': bool(equal_var)
|
|
183
|
+
},
|
|
184
|
+
'equalVariance': make_result(t_eq, df_eq, p_eq, ci_eq),
|
|
185
|
+
'unequalVariance': make_result(t_uneq, df_welch, p_uneq, ci_uneq)
|
|
186
|
+
})
|
|
187
|
+
`;
|
|
188
|
+
const TTEST_PAIRED_PY = `
|
|
189
|
+
import json
|
|
190
|
+
import pandas as pd
|
|
191
|
+
import numpy as np
|
|
192
|
+
from scipy import stats
|
|
193
|
+
|
|
194
|
+
def run_ttest_paired(data_json, variable1, variable2):
|
|
195
|
+
df = pd.DataFrame(json.loads(data_json))
|
|
196
|
+
|
|
197
|
+
v1 = pd.to_numeric(df[variable1], errors='coerce').dropna()
|
|
198
|
+
v2 = pd.to_numeric(df[variable2], errors='coerce').dropna()
|
|
199
|
+
|
|
200
|
+
# Align by index
|
|
201
|
+
common = v1.index.intersection(v2.index)
|
|
202
|
+
v1 = v1.loc[common]
|
|
203
|
+
v2 = v2.loc[common]
|
|
204
|
+
|
|
205
|
+
diff = v1 - v2
|
|
206
|
+
n = len(diff)
|
|
207
|
+
|
|
208
|
+
t_stat, p_val = stats.ttest_rel(v1, v2)
|
|
209
|
+
|
|
210
|
+
mean_diff = float(diff.mean())
|
|
211
|
+
std_diff = float(diff.std(ddof=1))
|
|
212
|
+
se = std_diff / np.sqrt(n)
|
|
213
|
+
ci = stats.t.interval(0.95, n-1, loc=mean_diff, scale=se)
|
|
214
|
+
|
|
215
|
+
return json.dumps({
|
|
216
|
+
'tStatistic': round(float(t_stat), 6),
|
|
217
|
+
'degreesOfFreedom': n - 1,
|
|
218
|
+
'pValue': float(p_val),
|
|
219
|
+
'meanDifference': round(mean_diff, 6),
|
|
220
|
+
'stdDifference': round(std_diff, 6),
|
|
221
|
+
'confidenceInterval': [round(float(ci[0]), 6), round(float(ci[1]), 6)],
|
|
222
|
+
'mean1': round(float(v1.mean()), 6),
|
|
223
|
+
'mean2': round(float(v2.mean()), 6),
|
|
224
|
+
'n': n
|
|
225
|
+
})
|
|
226
|
+
`;
|
|
227
|
+
const ANOVA_ONEWAY_PY = `
|
|
228
|
+
import json
|
|
229
|
+
import pandas as pd
|
|
230
|
+
import numpy as np
|
|
231
|
+
from scipy import stats
|
|
232
|
+
|
|
233
|
+
def run_anova_oneway(data_json, variable, group_variable):
|
|
234
|
+
df = pd.DataFrame(json.loads(data_json))
|
|
235
|
+
|
|
236
|
+
groups = df.groupby(group_variable)[variable].apply(
|
|
237
|
+
lambda x: pd.to_numeric(x, errors='coerce').dropna().tolist()
|
|
238
|
+
)
|
|
239
|
+
|
|
240
|
+
group_arrays = [np.array(g) for g in groups.values if len(g) > 0]
|
|
241
|
+
group_names = [str(name) for name, g in zip(groups.index, groups.values) if len(g) > 0]
|
|
242
|
+
|
|
243
|
+
f_stat, p_val = stats.f_oneway(*group_arrays)
|
|
244
|
+
|
|
245
|
+
# Compute detailed ANOVA table
|
|
246
|
+
grand_mean = np.concatenate(group_arrays).mean()
|
|
247
|
+
n_total = sum(len(g) for g in group_arrays)
|
|
248
|
+
k = len(group_arrays)
|
|
249
|
+
|
|
250
|
+
ss_between = sum(len(g) * (g.mean() - grand_mean)**2 for g in group_arrays)
|
|
251
|
+
ss_within = sum(((g - g.mean())**2).sum() for g in group_arrays)
|
|
252
|
+
|
|
253
|
+
df_between = k - 1
|
|
254
|
+
df_within = n_total - k
|
|
255
|
+
|
|
256
|
+
ms_between = ss_between / df_between
|
|
257
|
+
ms_within = ss_within / df_within
|
|
258
|
+
|
|
259
|
+
ss_total = ss_between + ss_within
|
|
260
|
+
eta_sq = ss_between / ss_total if ss_total > 0 else 0
|
|
261
|
+
|
|
262
|
+
group_stats = []
|
|
263
|
+
for name, arr in zip(group_names, group_arrays):
|
|
264
|
+
group_stats.append({
|
|
265
|
+
'group': name,
|
|
266
|
+
'n': len(arr),
|
|
267
|
+
'mean': round(float(arr.mean()), 6),
|
|
268
|
+
'std': round(float(arr.std(ddof=1)), 6)
|
|
269
|
+
})
|
|
270
|
+
|
|
271
|
+
return json.dumps({
|
|
272
|
+
'fStatistic': round(float(f_stat), 6),
|
|
273
|
+
'pValue': float(p_val),
|
|
274
|
+
'degreesOfFreedomBetween': df_between,
|
|
275
|
+
'degreesOfFreedomWithin': df_within,
|
|
276
|
+
'sumOfSquaresBetween': round(float(ss_between), 6),
|
|
277
|
+
'sumOfSquaresWithin': round(float(ss_within), 6),
|
|
278
|
+
'meanSquareBetween': round(float(ms_between), 6),
|
|
279
|
+
'meanSquareWithin': round(float(ms_within), 6),
|
|
280
|
+
'groupStats': group_stats,
|
|
281
|
+
'etaSquared': round(float(eta_sq), 6)
|
|
282
|
+
})
|
|
283
|
+
`;
|
|
284
|
+
const POSTHOC_TUKEY_PY = `
|
|
285
|
+
import json
|
|
286
|
+
import pandas as pd
|
|
287
|
+
import numpy as np
|
|
288
|
+
from statsmodels.stats.multicomp import pairwise_tukeyhsd
|
|
289
|
+
|
|
290
|
+
def run_posthoc_tukey(data_json, variable, group_variable, alpha=0.05):
|
|
291
|
+
df = pd.DataFrame(json.loads(data_json))
|
|
292
|
+
|
|
293
|
+
df[variable] = pd.to_numeric(df[variable], errors='coerce')
|
|
294
|
+
df = df.dropna(subset=[variable])
|
|
295
|
+
|
|
296
|
+
result = pairwise_tukeyhsd(df[variable], df[group_variable], alpha=alpha)
|
|
297
|
+
|
|
298
|
+
comparisons = []
|
|
299
|
+
for i in range(len(result.summary().data) - 1):
|
|
300
|
+
row = result.summary().data[i + 1]
|
|
301
|
+
comparisons.append({
|
|
302
|
+
'group1': str(row[0]),
|
|
303
|
+
'group2': str(row[1]),
|
|
304
|
+
'meanDifference': round(float(row[2]), 6),
|
|
305
|
+
'pValue': round(float(row[3]), 6),
|
|
306
|
+
'lowerCI': round(float(row[4]), 6),
|
|
307
|
+
'upperCI': round(float(row[5]), 6),
|
|
308
|
+
'reject': bool(row[6])
|
|
309
|
+
})
|
|
310
|
+
|
|
311
|
+
return json.dumps({
|
|
312
|
+
'comparisons': comparisons,
|
|
313
|
+
'alpha': alpha
|
|
314
|
+
})
|
|
315
|
+
`;
|
|
316
|
+
|
|
317
|
+
/**
|
|
318
|
+
* Python code for regression analysis functions.
|
|
319
|
+
*/
|
|
320
|
+
const LINEAR_REGRESSION_PY = `
|
|
321
|
+
import json
|
|
322
|
+
import pandas as pd
|
|
323
|
+
import numpy as np
|
|
324
|
+
import statsmodels.api as sm
|
|
325
|
+
|
|
326
|
+
def run_linear_regression(data_json, dependent, independents_json, add_constant=True):
|
|
327
|
+
df = pd.DataFrame(json.loads(data_json))
|
|
328
|
+
independents = json.loads(independents_json)
|
|
329
|
+
|
|
330
|
+
y = pd.to_numeric(df[dependent], errors='coerce')
|
|
331
|
+
X = df[independents].apply(pd.to_numeric, errors='coerce')
|
|
332
|
+
|
|
333
|
+
mask = y.notna() & X.notna().all(axis=1)
|
|
334
|
+
y = y[mask]
|
|
335
|
+
X = X[mask]
|
|
336
|
+
|
|
337
|
+
if add_constant:
|
|
338
|
+
X = sm.add_constant(X)
|
|
339
|
+
|
|
340
|
+
model = sm.OLS(y, X).fit()
|
|
341
|
+
|
|
342
|
+
coefficients = []
|
|
343
|
+
for i, name in enumerate(model.params.index):
|
|
344
|
+
ci = model.conf_int().iloc[i]
|
|
345
|
+
coefficients.append({
|
|
346
|
+
'variable': str(name),
|
|
347
|
+
'coefficient': round(float(model.params.iloc[i]), 6),
|
|
348
|
+
'stdError': round(float(model.bse.iloc[i]), 6),
|
|
349
|
+
'tStatistic': round(float(model.tvalues.iloc[i]), 6),
|
|
350
|
+
'pValue': float(model.pvalues.iloc[i]),
|
|
351
|
+
'confidenceInterval': [round(float(ci[0]), 6), round(float(ci[1]), 6)]
|
|
352
|
+
})
|
|
353
|
+
|
|
354
|
+
dw = float(sm.stats.stattools.durbin_watson(model.resid))
|
|
355
|
+
|
|
356
|
+
return json.dumps({
|
|
357
|
+
'rSquared': round(float(model.rsquared), 6),
|
|
358
|
+
'adjustedRSquared': round(float(model.rsquared_adj), 6),
|
|
359
|
+
'fStatistic': round(float(model.fvalue), 6),
|
|
360
|
+
'fPValue': float(model.f_pvalue),
|
|
361
|
+
'coefficients': coefficients,
|
|
362
|
+
'residualStdError': round(float(np.sqrt(model.mse_resid)), 6),
|
|
363
|
+
'observations': int(model.nobs),
|
|
364
|
+
'degreesOfFreedom': int(model.df_resid),
|
|
365
|
+
'durbinWatson': round(dw, 6)
|
|
366
|
+
})
|
|
367
|
+
`;
|
|
368
|
+
const LOGISTIC_BINARY_PY = `
|
|
369
|
+
import json
|
|
370
|
+
import pandas as pd
|
|
371
|
+
import numpy as np
|
|
372
|
+
import statsmodels.api as sm
|
|
373
|
+
|
|
374
|
+
def run_logistic_binary(data_json, dependent, independents_json, add_constant=True):
|
|
375
|
+
df = pd.DataFrame(json.loads(data_json))
|
|
376
|
+
independents = json.loads(independents_json)
|
|
377
|
+
|
|
378
|
+
y = pd.to_numeric(df[dependent], errors='coerce')
|
|
379
|
+
X = df[independents].apply(pd.to_numeric, errors='coerce')
|
|
380
|
+
|
|
381
|
+
mask = y.notna() & X.notna().all(axis=1)
|
|
382
|
+
y = y[mask]
|
|
383
|
+
X = X[mask]
|
|
384
|
+
|
|
385
|
+
if add_constant:
|
|
386
|
+
X = sm.add_constant(X)
|
|
387
|
+
|
|
388
|
+
model = sm.Logit(y, X).fit(disp=0)
|
|
389
|
+
|
|
390
|
+
coefficients = []
|
|
391
|
+
ci = model.conf_int()
|
|
392
|
+
for i, name in enumerate(model.params.index):
|
|
393
|
+
coef = float(model.params.iloc[i])
|
|
394
|
+
coefficients.append({
|
|
395
|
+
'variable': str(name),
|
|
396
|
+
'coefficient': round(coef, 6),
|
|
397
|
+
'stdError': round(float(model.bse.iloc[i]), 6),
|
|
398
|
+
'zStatistic': round(float(model.tvalues.iloc[i]), 6),
|
|
399
|
+
'pValue': float(model.pvalues.iloc[i]),
|
|
400
|
+
'oddsRatio': round(float(np.exp(coef)), 6),
|
|
401
|
+
'confidenceInterval': [round(float(ci.iloc[i, 0]), 6), round(float(ci.iloc[i, 1]), 6)]
|
|
402
|
+
})
|
|
403
|
+
|
|
404
|
+
return json.dumps({
|
|
405
|
+
'coefficients': coefficients,
|
|
406
|
+
'pseudoRSquared': round(float(model.prsquared), 6),
|
|
407
|
+
'logLikelihood': round(float(model.llf), 6),
|
|
408
|
+
'llrPValue': float(model.llr_pvalue),
|
|
409
|
+
'aic': round(float(model.aic), 6),
|
|
410
|
+
'bic': round(float(model.bic), 6),
|
|
411
|
+
'observations': int(model.nobs),
|
|
412
|
+
'convergence': bool(model.mle_retvals['converged'])
|
|
413
|
+
})
|
|
414
|
+
`;
|
|
415
|
+
/**
 * Python source for multinomial logistic regression (scikit-learn), executed
 * inside Pyodide. run_logistic_multinomial(data_json, dependent,
 * independents_json, reference_category) returns a JSON string with
 * per-(category, variable) coefficients relative to the reference category,
 * plus log-likelihood, AIC/BIC, the category list, and n.
 *
 * Fixes versus the original:
 * - Removed dead code (y_coded / y_dummies / X_const and the unused
 *   statsmodels import, which forced an unnecessary Pyodide package load).
 * - A reference_category that is absent from the data no longer raises
 *   KeyError; it falls back to the first sorted category.
 * - Documented that stdError/zStatistic/pValue/confidenceInterval are
 *   placeholders (sklearn provides no standard errors) and that
 *   'pseudoRSquared' is classification accuracy, not a true pseudo R^2
 *   (key names kept for interface compatibility).
 */
const LOGISTIC_MULTINOMIAL_PY = `
import json
import pandas as pd
import numpy as np

def run_logistic_multinomial(data_json, dependent, independents_json, reference_category=None):
    df = pd.DataFrame(json.loads(data_json))
    independents = json.loads(independents_json)

    X = df[independents].apply(pd.to_numeric, errors='coerce')
    y = df[dependent]

    mask = X.notna().all(axis=1) & y.notna()
    X = X[mask]
    y = y[mask]

    # Encode categories; fall back to the first (string-sorted) category when
    # the requested reference is absent from the data.
    categories = sorted(y.unique().tolist(), key=str)
    le_map = {str(c): i for i, c in enumerate(categories)}
    if reference_category is not None and str(reference_category) in le_map:
        ref = str(reference_category)
    else:
        ref = str(categories[0])

    y_numeric = y.map(lambda x: le_map[str(x)])

    from sklearn.linear_model import LogisticRegression

    # multi_class='multinomial' is deprecated in newer scikit-learn (it is
    # already the lbfgs default); kept for the pinned Pyodide version.
    model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
    model.fit(X, y_numeric)

    ref_idx = le_map[ref]
    non_ref_cats = [c for c in categories if str(c) != ref]

    # scikit-learn's symmetric multinomial coefficients are re-expressed
    # relative to the reference category. sklearn provides no standard
    # errors, so stdError / zStatistic / pValue / confidenceInterval are
    # zero-valued placeholders.
    coefficients = []
    for cat in non_ref_cats:
        cat_idx = le_map[str(cat)]
        for j, var_name in enumerate(independents):
            coef = float(model.coef_[cat_idx][j] - model.coef_[ref_idx][j])
            coefficients.append({
                'category': str(cat),
                'variable': var_name,
                'coefficient': round(coef, 6),
                'stdError': 0.0,
                'zStatistic': 0.0,
                'pValue': 0.0,
                'oddsRatio': round(float(np.exp(coef)), 6),
                'confidenceInterval': [0.0, 0.0]
            })

    # Log-likelihood of the fitted model; the epsilon guards log(0).
    proba = model.predict_proba(X)
    ll = float(np.sum(np.log(proba[np.arange(len(y_numeric)), y_numeric] + 1e-10)))
    n_params = len(non_ref_cats) * (len(independents) + 1)
    aic = -2 * ll + 2 * n_params
    bic_val = -2 * ll + n_params * np.log(len(y))

    return json.dumps({
        'coefficients': coefficients,
        # NOTE: model.score is classification accuracy, not a true pseudo
        # R-squared; key name kept for backwards compatibility.
        'pseudoRSquared': round(float(model.score(X, y_numeric)), 6),
        'logLikelihood': round(ll, 6),
        'llrPValue': 0.0,
        'aic': round(float(aic), 6),
        'bic': round(float(bic_val), 6),
        'categories': [str(c) for c in categories],
        'referenceCategory': ref,
        'observations': int(len(y))
    })
`;
|
|
490
|
+
|
|
491
|
+
/**
|
|
492
|
+
* Python code for classification/clustering functions.
|
|
493
|
+
*/
|
|
494
|
+
const KMEANS_PY = `
|
|
495
|
+
import json
|
|
496
|
+
import pandas as pd
|
|
497
|
+
import numpy as np
|
|
498
|
+
from sklearn.cluster import KMeans
|
|
499
|
+
from sklearn.preprocessing import StandardScaler
|
|
500
|
+
|
|
501
|
+
def run_kmeans(data_json, variables_json, k, max_iterations=300, random_state=42):
|
|
502
|
+
df = pd.DataFrame(json.loads(data_json))
|
|
503
|
+
variables = json.loads(variables_json)
|
|
504
|
+
|
|
505
|
+
X = df[variables].apply(pd.to_numeric, errors='coerce').dropna()
|
|
506
|
+
|
|
507
|
+
scaler = StandardScaler()
|
|
508
|
+
X_scaled = scaler.fit_transform(X)
|
|
509
|
+
|
|
510
|
+
model = KMeans(n_clusters=k, max_iter=max_iterations, random_state=random_state, n_init=10)
|
|
511
|
+
labels = model.fit_predict(X_scaled)
|
|
512
|
+
|
|
513
|
+
# Transform centers back to original scale
|
|
514
|
+
centers_original = scaler.inverse_transform(model.cluster_centers_)
|
|
515
|
+
|
|
516
|
+
centers = []
|
|
517
|
+
for i in range(k):
|
|
518
|
+
center = {}
|
|
519
|
+
for j, var in enumerate(variables):
|
|
520
|
+
center[var] = round(float(centers_original[i, j]), 6)
|
|
521
|
+
centers.append({'cluster': i, 'center': center})
|
|
522
|
+
|
|
523
|
+
unique, counts = np.unique(labels, return_counts=True)
|
|
524
|
+
cluster_sizes = {int(u): int(c) for u, c in zip(unique, counts)}
|
|
525
|
+
|
|
526
|
+
return json.dumps({
|
|
527
|
+
'labels': [int(l) for l in labels],
|
|
528
|
+
'centers': centers,
|
|
529
|
+
'inertia': round(float(model.inertia_), 6),
|
|
530
|
+
'iterations': int(model.n_iter_),
|
|
531
|
+
'clusterSizes': cluster_sizes
|
|
532
|
+
})
|
|
533
|
+
`;
|
|
534
|
+
const HIERARCHICAL_CLUSTER_PY = `
|
|
535
|
+
import json
|
|
536
|
+
import pandas as pd
|
|
537
|
+
import numpy as np
|
|
538
|
+
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
|
|
539
|
+
from sklearn.preprocessing import StandardScaler
|
|
540
|
+
|
|
541
|
+
def run_hierarchical_cluster(data_json, variables_json, method='ward', metric='euclidean', n_clusters=None, distance_threshold=None):
|
|
542
|
+
df = pd.DataFrame(json.loads(data_json))
|
|
543
|
+
variables = json.loads(variables_json)
|
|
544
|
+
|
|
545
|
+
X = df[variables].apply(pd.to_numeric, errors='coerce').dropna()
|
|
546
|
+
|
|
547
|
+
scaler = StandardScaler()
|
|
548
|
+
X_scaled = scaler.fit_transform(X)
|
|
549
|
+
|
|
550
|
+
Z = linkage(X_scaled, method=method, metric=metric)
|
|
551
|
+
|
|
552
|
+
if n_clusters is not None:
|
|
553
|
+
labels = fcluster(Z, t=n_clusters, criterion='maxclust')
|
|
554
|
+
elif distance_threshold is not None:
|
|
555
|
+
labels = fcluster(Z, t=distance_threshold, criterion='distance')
|
|
556
|
+
else:
|
|
557
|
+
labels = fcluster(Z, t=3, criterion='maxclust')
|
|
558
|
+
|
|
559
|
+
labels = labels - 1 # 0-indexed
|
|
560
|
+
|
|
561
|
+
unique, counts = np.unique(labels, return_counts=True)
|
|
562
|
+
cluster_sizes = {int(u): int(c) for u, c in zip(unique, counts)}
|
|
563
|
+
|
|
564
|
+
# Dendrogram data (truncated for large datasets)
|
|
565
|
+
trunc = min(30, len(X_scaled))
|
|
566
|
+
dend = dendrogram(Z, truncate_mode='lastp', p=trunc, no_plot=True)
|
|
567
|
+
|
|
568
|
+
return json.dumps({
|
|
569
|
+
'labels': [int(l) for l in labels],
|
|
570
|
+
'nClusters': len(unique),
|
|
571
|
+
'linkageMatrix': [[round(float(x), 6) for x in row] for row in Z.tolist()],
|
|
572
|
+
'clusterSizes': cluster_sizes,
|
|
573
|
+
'dendrogramData': {
|
|
574
|
+
'icoord': [[round(float(x), 4) for x in row] for row in dend['icoord']],
|
|
575
|
+
'dcoord': [[round(float(x), 4) for x in row] for row in dend['dcoord']],
|
|
576
|
+
'leaves': [int(x) for x in dend['leaves']]
|
|
577
|
+
}
|
|
578
|
+
})
|
|
579
|
+
`;
|
|
580
|
+
|
|
581
|
+
/**
|
|
582
|
+
* Python code for dimension reduction functions.
|
|
583
|
+
*/
|
|
584
|
+
/**
 * Python source for exploratory factor analysis (factor_analyzer package),
 * executed inside Pyodide. run_efa(data_json, variables_json, n_factors,
 * rotation, method) computes KMO and Bartlett sphericity diagnostics, fits
 * the factor model, and returns a JSON string with loadings per variable,
 * eigenvalues, variance breakdown, communalities and uniquenesses.
 *
 * Fix: FactorAnalyzer.get_factor_variance() returns a 3-tuple
 * (SS loadings, proportion of variance, cumulative proportion); the original
 * 2-way unpack `ev, v = ...` raised "too many values to unpack" at runtime.
 * The cumulative array is now also used directly for 'cumulativeVariance'.
 */
const EFA_PY = `
import json
import pandas as pd
import numpy as np
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo

def run_efa(data_json, variables_json, n_factors, rotation='varimax', method='minres'):
    df = pd.DataFrame(json.loads(data_json))
    variables = json.loads(variables_json)

    X = df[variables].apply(pd.to_numeric, errors='coerce').dropna()

    # KMO and Bartlett tests
    kmo_all, kmo_model = calculate_kmo(X)
    chi2, p_value = calculate_bartlett_sphericity(X)

    fa = FactorAnalyzer(n_factors=n_factors, rotation=rotation, method=method)
    fa.fit(X)

    loadings = fa.loadings_
    loadings_dict = {}
    for i, var in enumerate(variables):
        loadings_dict[var] = [round(float(x), 6) for x in loadings[i]]

    # (SS loadings, proportion of variance, cumulative proportion of variance)
    ev, v, cum_v = fa.get_factor_variance()

    communalities = fa.get_communalities()
    uniquenesses = fa.get_uniquenesses()

    comm_dict = {}
    uniq_dict = {}
    for i, var in enumerate(variables):
        comm_dict[var] = round(float(communalities[i]), 6)
        uniq_dict[var] = round(float(uniquenesses[i]), 6)

    eigenvalues = fa.get_eigenvalues()[0]

    return json.dumps({
        'loadings': loadings_dict,
        'eigenvalues': [round(float(x), 6) for x in eigenvalues],
        'variance': [round(float(x), 6) for x in ev],
        'cumulativeVariance': [round(float(x), 6) for x in cum_v],
        'communalities': comm_dict,
        'uniquenesses': uniq_dict,
        'nFactors': n_factors,
        'rotation': rotation,
        'kmo': round(float(kmo_model), 6),
        'bartlettChi2': round(float(chi2), 6),
        'bartlettPValue': float(p_value)
    })
`;
|
|
636
|
+
const PCA_PY = `
|
|
637
|
+
import json
|
|
638
|
+
import pandas as pd
|
|
639
|
+
import numpy as np
|
|
640
|
+
from sklearn.decomposition import PCA
|
|
641
|
+
from sklearn.preprocessing import StandardScaler
|
|
642
|
+
|
|
643
|
+
def run_pca(data_json, variables_json, n_components=None, standardize=True):
|
|
644
|
+
df = pd.DataFrame(json.loads(data_json))
|
|
645
|
+
variables = json.loads(variables_json)
|
|
646
|
+
|
|
647
|
+
X = df[variables].apply(pd.to_numeric, errors='coerce').dropna()
|
|
648
|
+
|
|
649
|
+
if standardize:
|
|
650
|
+
scaler = StandardScaler()
|
|
651
|
+
X_scaled = scaler.fit_transform(X)
|
|
652
|
+
else:
|
|
653
|
+
X_scaled = X.values
|
|
654
|
+
|
|
655
|
+
if n_components is None:
|
|
656
|
+
n_components = min(len(variables), len(X_scaled))
|
|
657
|
+
|
|
658
|
+
pca = PCA(n_components=n_components)
|
|
659
|
+
transformed = pca.fit_transform(X_scaled)
|
|
660
|
+
|
|
661
|
+
loadings = {}
|
|
662
|
+
for i, var in enumerate(variables):
|
|
663
|
+
loadings[var] = [round(float(x), 6) for x in pca.components_[:, i]]
|
|
664
|
+
|
|
665
|
+
cum_var = np.cumsum(pca.explained_variance_ratio_)
|
|
666
|
+
|
|
667
|
+
return json.dumps({
|
|
668
|
+
'components': [[round(float(x), 6) for x in row] for row in transformed.tolist()],
|
|
669
|
+
'explainedVariance': [round(float(x), 6) for x in pca.explained_variance_],
|
|
670
|
+
'explainedVarianceRatio': [round(float(x), 6) for x in pca.explained_variance_ratio_],
|
|
671
|
+
'cumulativeVarianceRatio': [round(float(x), 6) for x in cum_var],
|
|
672
|
+
'loadings': loadings,
|
|
673
|
+
'singularValues': [round(float(x), 6) for x in pca.singular_values_],
|
|
674
|
+
'nComponents': n_components
|
|
675
|
+
})
|
|
676
|
+
`;
|
|
677
|
+
const MDS_PY = `
|
|
678
|
+
import json
|
|
679
|
+
import pandas as pd
|
|
680
|
+
import numpy as np
|
|
681
|
+
from sklearn.manifold import MDS
|
|
682
|
+
from sklearn.preprocessing import StandardScaler
|
|
683
|
+
|
|
684
|
+
def run_mds(data_json, variables_json, n_components=2, metric=True, max_iterations=300, random_state=42):
|
|
685
|
+
df = pd.DataFrame(json.loads(data_json))
|
|
686
|
+
variables = json.loads(variables_json)
|
|
687
|
+
|
|
688
|
+
X = df[variables].apply(pd.to_numeric, errors='coerce').dropna()
|
|
689
|
+
|
|
690
|
+
scaler = StandardScaler()
|
|
691
|
+
X_scaled = scaler.fit_transform(X)
|
|
692
|
+
|
|
693
|
+
mds = MDS(n_components=n_components, metric=metric, max_iter=max_iterations, random_state=random_state, normalized_stress='auto')
|
|
694
|
+
coords = mds.fit_transform(X_scaled)
|
|
695
|
+
|
|
696
|
+
return json.dumps({
|
|
697
|
+
'coordinates': [[round(float(x), 6) for x in row] for row in coords.tolist()],
|
|
698
|
+
'stress': round(float(mds.stress_), 6),
|
|
699
|
+
'nComponents': n_components
|
|
700
|
+
})
|
|
701
|
+
`;
|
|
702
|
+
|
|
703
|
+
/**
|
|
704
|
+
* Python code for scale/reliability functions.
|
|
705
|
+
*/
|
|
706
|
+
const CRONBACH_ALPHA_PY = `
|
|
707
|
+
import json
|
|
708
|
+
import pandas as pd
|
|
709
|
+
import numpy as np
|
|
710
|
+
|
|
711
|
+
def run_cronbach_alpha(data_json, items_json):
|
|
712
|
+
df = pd.DataFrame(json.loads(data_json))
|
|
713
|
+
items = json.loads(items_json)
|
|
714
|
+
|
|
715
|
+
X = df[items].apply(pd.to_numeric, errors='coerce').dropna()
|
|
716
|
+
|
|
717
|
+
n_items = len(items)
|
|
718
|
+
n_obs = len(X)
|
|
719
|
+
|
|
720
|
+
# Compute Cronbach's Alpha
|
|
721
|
+
item_vars = X.var(ddof=1)
|
|
722
|
+
total_var = X.sum(axis=1).var(ddof=1)
|
|
723
|
+
alpha = (n_items / (n_items - 1)) * (1 - item_vars.sum() / total_var)
|
|
724
|
+
|
|
725
|
+
# Standardized alpha (using correlation matrix)
|
|
726
|
+
corr_matrix = X.corr()
|
|
727
|
+
mean_r = (corr_matrix.sum().sum() - n_items) / (n_items * (n_items - 1))
|
|
728
|
+
std_alpha = (n_items * mean_r) / (1 + (n_items - 1) * mean_r)
|
|
729
|
+
|
|
730
|
+
# Item analysis
|
|
731
|
+
item_analysis = []
|
|
732
|
+
total_score = X.sum(axis=1)
|
|
733
|
+
|
|
734
|
+
for item in items:
|
|
735
|
+
item_col = X[item]
|
|
736
|
+
other_items = [i for i in items if i != item]
|
|
737
|
+
other_sum = X[other_items].sum(axis=1)
|
|
738
|
+
|
|
739
|
+
# Corrected item-total correlation
|
|
740
|
+
citc = float(item_col.corr(other_sum))
|
|
741
|
+
|
|
742
|
+
# Alpha if item deleted
|
|
743
|
+
if len(other_items) > 1:
|
|
744
|
+
sub_X = X[other_items]
|
|
745
|
+
sub_vars = sub_X.var(ddof=1)
|
|
746
|
+
sub_total_var = sub_X.sum(axis=1).var(ddof=1)
|
|
747
|
+
k = len(other_items)
|
|
748
|
+
alpha_deleted = (k / (k - 1)) * (1 - sub_vars.sum() / sub_total_var)
|
|
749
|
+
else:
|
|
750
|
+
alpha_deleted = 0.0
|
|
751
|
+
|
|
752
|
+
item_analysis.append({
|
|
753
|
+
'item': item,
|
|
754
|
+
'itemMean': round(float(item_col.mean()), 6),
|
|
755
|
+
'itemStd': round(float(item_col.std(ddof=1)), 6),
|
|
756
|
+
'correctedItemTotalCorrelation': round(citc, 6),
|
|
757
|
+
'alphaIfItemDeleted': round(float(alpha_deleted), 6)
|
|
758
|
+
})
|
|
759
|
+
|
|
760
|
+
return json.dumps({
|
|
761
|
+
'alpha': round(float(alpha), 6),
|
|
762
|
+
'standardizedAlpha': round(float(std_alpha), 6),
|
|
763
|
+
'nItems': n_items,
|
|
764
|
+
'nObservations': n_obs,
|
|
765
|
+
'itemAnalysis': item_analysis,
|
|
766
|
+
'interItemCorrelationMean': round(float(mean_r), 6)
|
|
767
|
+
})
|
|
768
|
+
`;
|
|
769
|
+
|
|
770
|
+
/**
|
|
771
|
+
* Web Worker for Pyodide-based statistical analysis.
|
|
772
|
+
* Runs Python code in a WASM sandbox for browser-based computations.
|
|
773
|
+
*/
|
|
774
|
+
// Import Python code strings - these will be inlined by the bundler
// For the worker bundle, we import them directly
// Lazily-initialized Pyodide interpreter instance. Set exactly once by
// initPyodide() and shared by every subsequent analysis request; null
// until the main thread sends an 'init' message.
let pyodide = null;
|
|
777
|
+
/**
 * Post a progress update message back to the main thread.
 *
 * @param {string} id - Request id the progress report belongs to.
 * @param {string} stage - Pipeline stage label (e.g. 'init').
 * @param {number} progress - Percentage complete, 0-100.
 * @param {string} message - Human-readable status text.
 */
function sendProgress(id, stage, progress, message) {
  self.postMessage({
    id,
    type: 'progress',
    progress: { stage, progress, message },
  });
}
|
|
788
|
+
/**
 * Post a successful result message back to the main thread.
 *
 * @param {string} id - Request id the result answers.
 * @param {*} data - Parsed result payload (plain JSON-compatible value).
 */
function sendResult(id, data) {
  self.postMessage({
    id,
    type: 'result',
    data,
  });
}
|
|
799
|
+
/**
 * Post an error message back to the main thread.
 *
 * @param {string} id - Request id the failure answers.
 * @param {string} error - Human-readable failure description.
 */
function sendError(id, error) {
  self.postMessage({
    id,
    type: 'error',
    error,
  });
}
|
|
810
|
+
/**
 * Decode the binary column-oriented payload produced by serializeToBuffer
 * (bridge/serializer.ts) into a row-oriented JSON string that the Python
 * side can feed to pandas via json.loads.
 *
 * Wire layout: [uint32 LE header length][JSON header][column data ...].
 * Numeric columns are Float64 values; string columns are Int32 indices
 * into that column's `stringTable` from the header.
 *
 * @param {ArrayBuffer} buffer - Serialized dataset.
 * @returns {string} JSON array of row objects ('[]' for an empty dataset).
 */
function bufferToJsonString(buffer) {
  const headerLength = new DataView(buffer).getUint32(0, true);
  const headerText = new TextDecoder().decode(new Uint8Array(buffer, 4, headerLength));
  const header = JSON.parse(headerText);
  if (header.rowCount === 0) {
    return '[]';
  }
  const { rowCount, columns } = header;
  let cursor = 4 + headerLength;
  // Decode every column into [name, values[]] pairs, in declaration order.
  const decoded = [];
  for (const col of columns) {
    if (col.dtype === 'string') {
      // Int32 indices into this column's string table. The .slice() copy
      // also sidesteps any alignment issues with `cursor`.
      const byteLen = rowCount * 4;
      const indices = new Int32Array(new Uint8Array(buffer, cursor, byteLen).slice().buffer);
      const values = [];
      for (let i = 0; i < rowCount; i++) {
        values.push(col.stringTable[indices[i]]);
      }
      decoded.push([col.name, values]);
      cursor += byteLen;
    } else {
      const byteLen = rowCount * 8;
      const floats = new Float64Array(new Uint8Array(buffer, cursor, byteLen).slice().buffer);
      decoded.push([col.name, Array.from(floats)]);
      cursor += byteLen;
    }
  }
  // Pivot the column-oriented data into an array of row objects.
  const rows = [];
  for (let i = 0; i < rowCount; i++) {
    const row = {};
    for (const [name, values] of decoded) {
      row[name] = values[i];
    }
    rows.push(row);
  }
  return JSON.stringify(rows);
}
|
|
855
|
+
/**
 * Boot the Pyodide runtime inside the worker and install every Python
 * package the analyses rely on (pandas, scipy, statsmodels, scikit-learn,
 * factor_analyzer).
 *
 * Progress is streamed to the main thread as six 'init' stage updates;
 * a final 'result' message with { initialized: true } signals readiness.
 *
 * @param {string} id - Request id used for all progress/result messages.
 * @param {string|undefined} pyodideUrl - Optional base URL for the Pyodide
 *   distribution; defaults to the jsdelivr CDN.
 */
async function initPyodide(id, pyodideUrl) {
  const totalSteps = 6;
  let completed = 0;
  const reportStep = (message) => {
    completed += 1;
    sendProgress(id, 'init', Math.round((completed / totalSteps) * 100), message);
  };
  try {
    // Step 1: load the Pyodide WASM core.
    sendProgress(id, 'init', 0, 'Loading Pyodide WASM runtime...');
    const indexURL = pyodideUrl || 'https://cdn.jsdelivr.net/pyodide/v0.27.5/full/';
    // loadPyodide may already be in scope (pre-bundled); otherwise fetch it.
    if (typeof loadPyodide === 'undefined') {
      importScripts(indexURL + 'pyodide.js');
    }
    pyodide = await loadPyodide({
      indexURL,
    });
    reportStep('Pyodide runtime loaded successfully');
    // Step 2: make the micropip package manager available.
    await pyodide.loadPackagesFromImports('import micropip', {
      messageCallback: (msg) => {
        sendProgress(id, 'init', Math.round((completed / totalSteps) * 100), `micropip: ${msg}`);
      },
    });
    reportStep('micropip package manager ready');
    // Steps 3-6: install the scientific stack one step at a time so the
    // main thread sees granular progress for each (potentially slow) download.
    const installSteps = [
      ["await micropip.install(['pandas', 'scipy'])", 'pandas and scipy installed'],
      ["await micropip.install('statsmodels')", 'statsmodels installed'],
      ["await micropip.install('scikit-learn')", 'scikit-learn installed'],
      ["await micropip.install('factor_analyzer')", 'factor_analyzer installed - all packages ready'],
    ];
    for (const [installLine, doneMessage] of installSteps) {
      await pyodide.runPythonAsync(`
import micropip
${installLine}
`);
      reportStep(doneMessage);
    }
    sendResult(id, { initialized: true });
  } catch (err) {
    sendError(id, `Initialization failed: ${err instanceof Error ? err.message : String(err)}`);
  }
}
|
|
914
|
+
/**
 * Execute one Python analysis function inside Pyodide and post its JSON
 * result back to the main thread.
 *
 * Arguments are spliced into a Python call expression: values that look
 * like Python literals (integers/decimals, True/False/None) pass through
 * verbatim; everything else is quoted as a Python string with backslashes
 * and single quotes escaped. The Python function is expected to return a
 * JSON string, which is parsed here before being posted.
 *
 * The Python-side `_result` binding is deleted and garbage-collected after
 * each call to avoid accumulating interpreter memory across analyses.
 *
 * @param {string} id - Request id for the response message.
 * @param {string} pythonCode - Python source defining `functionName`.
 * @param {string} functionName - Name of the Python entry point to call.
 * @param {string[]} args - Pre-stringified arguments for the call.
 */
async function runAnalysis(id, pythonCode, functionName, args) {
  if (!pyodide) {
    sendError(id, 'Pyodide is not initialized. Call init() first.');
    return;
  }
  // Render one JS string as a Python expression token.
  const toPyExpr = (value) => {
    const isLiteral =
      /^[-+]?\d+(\.\d+)?$/.test(value) ||
      value === 'True' ||
      value === 'False' ||
      value === 'None';
    if (isLiteral) {
      return value;
    }
    // Quote as a Python string literal, escaping backslashes and single quotes.
    return `'${value.replace(/\\/g, '\\\\').replace(/'/g, "\\'")}'`;
  };
  try {
    // Define (or redefine) the analysis function in the Python globals.
    await pyodide.runPythonAsync(pythonCode);
    const argList = args.map(toPyExpr).join(', ');
    const result = await pyodide.runPythonAsync(`
import gc
_result = ${functionName}(${argList})
_result
`);
    // The Python side returns a JSON string; parse it for the main thread.
    const parsed = JSON.parse(String(result));
    // Release the Python-side reference before reporting back.
    await pyodide.runPythonAsync(`
del _result
gc.collect()
`);
    sendResult(id, parsed);
  } catch (err) {
    try {
      await pyodide.runPythonAsync('import gc; gc.collect()');
    } catch {
      // Cleanup is best-effort; the original failure is what gets reported.
    }
    sendError(id, `Analysis failed: ${err instanceof Error ? err.message : String(err)}`);
  }
}
|
|
964
|
+
/**
 * Handle incoming messages from main thread.
 *
 * Each request carries: `id` (echoed on every response), `type` (selects
 * the analysis), an optional binary `payload` (column-oriented ArrayBuffer
 * from the bridge serializer), and `params` with per-analysis options.
 * Responses go out via sendResult/sendError; initialization additionally
 * streams sendProgress updates.
 */
self.onmessage = async (event) => {
  const { id, type, payload, params } = event.data;
  try {
    // Convert ArrayBuffer payload to JSON string if present; otherwise fall
    // back to row-oriented params.data, else an empty dataset.
    let dataJson = '[]';
    if (payload && payload instanceof ArrayBuffer && payload.byteLength > 0) {
      dataJson = bufferToJsonString(payload);
    }
    else if (params?.data) {
      dataJson = JSON.stringify(params.data);
    }
    // Dispatch by analysis type. Every argument is pre-stringified here
    // because runAnalysis splices them textually into a Python call;
    // 'True'/'False'/'None' and bare numbers pass through as Python
    // literals, everything else becomes a quoted Python string.
    switch (type) {
      case 'init':
        await initPyodide(id, params?.pyodideUrl);
        break;
      // === Descriptive Statistics ===
      case 'frequencies':
        await runAnalysis(id, FREQUENCIES_PY, 'run_frequencies', [
          dataJson,
          String(params?.variable ?? '')
        ]);
        break;
      case 'descriptives':
        await runAnalysis(id, DESCRIPTIVES_PY, 'run_descriptives', [
          dataJson,
          JSON.stringify(params?.variables ?? [])
        ]);
        break;
      case 'crosstabs':
        await runAnalysis(id, CROSSTABS_PY, 'run_crosstabs', [
          dataJson,
          String(params?.rowVariable ?? ''),
          String(params?.colVariable ?? '')
        ]);
        break;
      // === Compare Means ===
      case 'ttest_independent':
        await runAnalysis(id, TTEST_INDEPENDENT_PY, 'run_ttest_independent', [
          dataJson,
          String(params?.variable ?? ''),
          String(params?.groupVariable ?? ''),
          String(params?.group1Value ?? ''),
          String(params?.group2Value ?? '')
        ]);
        break;
      case 'ttest_paired':
        await runAnalysis(id, TTEST_PAIRED_PY, 'run_ttest_paired', [
          dataJson,
          String(params?.variable1 ?? ''),
          String(params?.variable2 ?? '')
        ]);
        break;
      case 'anova_oneway':
        await runAnalysis(id, ANOVA_ONEWAY_PY, 'run_anova_oneway', [
          dataJson,
          String(params?.variable ?? ''),
          String(params?.groupVariable ?? '')
        ]);
        break;
      case 'posthoc_tukey':
        await runAnalysis(id, POSTHOC_TUKEY_PY, 'run_posthoc_tukey', [
          dataJson,
          String(params?.variable ?? ''),
          String(params?.groupVariable ?? ''),
          // Significance level; '0.05' travels as a raw Python number.
          String(params?.alpha ?? 0.05)
        ]);
        break;
      // === Regression ===
      case 'linear_regression':
        await runAnalysis(id, LINEAR_REGRESSION_PY, 'run_linear_regression', [
          dataJson,
          String(params?.dependentVariable ?? ''),
          JSON.stringify(params?.independentVariables ?? []),
          // addConstant defaults to true; only an explicit false disables it.
          String(params?.addConstant !== false ? 'True' : 'False')
        ]);
        break;
      case 'logistic_binary':
        await runAnalysis(id, LOGISTIC_BINARY_PY, 'run_logistic_binary', [
          dataJson,
          String(params?.dependentVariable ?? ''),
          JSON.stringify(params?.independentVariables ?? []),
          String(params?.addConstant !== false ? 'True' : 'False')
        ]);
        break;
      case 'logistic_multinomial':
        await runAnalysis(id, LOGISTIC_MULTINOMIAL_PY, 'run_logistic_multinomial', [
          dataJson,
          String(params?.dependentVariable ?? ''),
          JSON.stringify(params?.independentVariables ?? []),
          // Omitted reference category becomes Python None.
          params?.referenceCategory != null ? String(params.referenceCategory) : 'None'
        ]);
        break;
      // === Classify ===
      case 'kmeans':
        await runAnalysis(id, KMEANS_PY, 'run_kmeans', [
          dataJson,
          JSON.stringify(params?.variables ?? []),
          String(params?.k ?? 3),
          String(params?.maxIterations ?? 300),
          String(params?.randomState ?? 42)
        ]);
        break;
      case 'hierarchical_cluster':
        await runAnalysis(id, HIERARCHICAL_CLUSTER_PY, 'run_hierarchical_cluster', [
          dataJson,
          JSON.stringify(params?.variables ?? []),
          String(params?.method ?? 'ward'),
          String(params?.metric ?? 'euclidean'),
          params?.nClusters != null ? String(params.nClusters) : 'None',
          params?.distanceThreshold != null ? String(params.distanceThreshold) : 'None'
        ]);
        break;
      // === Dimension Reduction ===
      case 'efa':
        await runAnalysis(id, EFA_PY, 'run_efa', [
          dataJson,
          JSON.stringify(params?.variables ?? []),
          String(params?.nFactors ?? 2),
          String(params?.rotation ?? 'varimax'),
          String(params?.method ?? 'minres')
        ]);
        break;
      case 'pca':
        await runAnalysis(id, PCA_PY, 'run_pca', [
          dataJson,
          JSON.stringify(params?.variables ?? []),
          params?.nComponents != null ? String(params.nComponents) : 'None',
          String(params?.standardize !== false ? 'True' : 'False')
        ]);
        break;
      case 'mds':
        await runAnalysis(id, MDS_PY, 'run_mds', [
          dataJson,
          JSON.stringify(params?.variables ?? []),
          String(params?.nComponents ?? 2),
          // Metric MDS by default; only an explicit false selects non-metric.
          String(params?.metric !== false ? 'True' : 'False'),
          String(params?.maxIterations ?? 300),
          String(params?.randomState ?? 42)
        ]);
        break;
      // === Scale ===
      case 'cronbach_alpha':
        await runAnalysis(id, CRONBACH_ALPHA_PY, 'run_cronbach_alpha', [
          dataJson,
          JSON.stringify(params?.items ?? [])
        ]);
        break;
      default:
        sendError(id, `Unknown analysis type: ${type}`);
    }
  }
  catch (err) {
    // Catch-all so a payload-decoding or dispatch failure still yields a
    // well-formed error response for the pending request id.
    sendError(id, `Worker error: ${err instanceof Error ? err.message : String(err)}`);
  }
};
|
|
1122
|
+
// Signal that the worker is ready. The sentinel id '__worker_ready__' is
// emitted once at load time so the main thread can await this message
// before posting any requests.
self.postMessage({ id: '__worker_ready__', type: 'result', data: { ready: true } });
|
|
1124
|
+
|
|
1125
|
+
})();
|
|
1126
|
+
//# sourceMappingURL=stats-worker.js.map
|