py2ls 0.1.9.1__py3-none-any.whl → 0.1.9.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/stats.py
CHANGED
@@ -2,251 +2,362 @@ from scipy.ndimage import convolve1d
|
|
2
2
|
from scipy.signal import savgol_filter
|
3
3
|
import pingouin as pg
|
4
4
|
from scipy import stats
|
5
|
-
|
6
5
|
import numpy as np
|
7
6
|
import pandas as pd
|
8
7
|
import matplotlib.pyplot as plt
|
8
|
+
import warnings
|
9
9
|
|
10
|
+
warnings.filterwarnings("ignore", category=RuntimeWarning)
|
10
11
|
|
11
|
-
# ==============FuncStars(ax,x1=1,x2=2, yscale=0.9, pval=0.01)====================================================
|
12
|
-
# Usage:
|
13
|
-
# FuncStars(ax, x1=2, x2=3, yscale=0.99, pval=0.02)
|
14
|
-
# =============================================================================
|
15
12
|
|
16
13
|
# FuncStars --v 0.1.1
|
17
|
-
def FuncStars(
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
14
|
+
def FuncStars(
    ax,
    pval=None,
    ylim=None,
    xlim=None,
    symbol="*",
    yscale=0.95,
    y_loc=None,
    x1=0,
    x2=1,
    alpha=0.05,
    fontsize=14,
    fontsize_note=12,
    rotation=0,
    fontname="Arial",
    values_below=None,
    linego=True,
    linestyle="-",
    linecolor="k",
    linewidth=0.8,
    nsshow="off",
    symbolcolor="k",
    tailindicator=(0.06, 0.06),
    report=None,
    report_scale=-0.1,
    report_loc=None,
):
    """Annotate a comparison between x-positions ``x1`` and ``x2`` with stars.

    Parameters
    ----------
    ax : axes-like or None
        Object providing ``get_xlim``, ``get_ylim``, ``text`` and ``plot``.
        ``None`` falls back to ``plt.gca()``.
    pval : float or None
        P-value of the comparison. ``None`` draws no significance marker
        or bracket (``values_below`` / ``report`` are still drawn).
    ylim, xlim : pair of float, optional
        Axis limits used for scaling; read from ``ax`` when omitted.
    symbol : str
        Marker repeated 1-3 times for ``pval`` <= ``alpha`` / 0.01 / 0.001.
    yscale : float
        Fraction of the y-range at which the marker is placed when
        ``y_loc`` is not given.
    y_loc : float, optional
        Explicit y-coordinate of the marker.
    x1, x2 : float
        X-coordinates of the two compared groups.
    alpha : float
        Significance threshold for both the marker and the bracket.
    fontsize, fontsize_note, rotation, fontname
        Text styling; ``fontsize_note`` is used for ``values_below`` and
        ``report``, ``rotation`` only for the non-significant label.
    values_below : str, optional
        Extra text drawn at ``y_loc * -0.1``.
    linego : bool
        Whether to draw the bracket for significant results.
    linestyle, linecolor, linewidth
        Bracket line styling.
    nsshow : str
        ``"on"`` to label non-significant results ("p=..." or "ns").
    symbolcolor : str
        Colour of the star marker.
    tailindicator : sequence of two floats
        Length of the left/right bracket tails as fractions of the y-range.
    report : str, optional
        Free text drawn at ``report_loc``.
    report_scale : float
        Offset of the report from the lower y-limit, as a fraction of the
        y-range. Positive values are negated so the offset always points
        downwards.
    report_loc : float, optional
        Explicit y-coordinate of the report text.
    """
    if ax is None:
        ax = plt.gca()
    if ylim is None:
        ylim = ax.get_ylim()
    if xlim is None:
        xlim = ax.get_xlim()

    # Scalar extents: avoids passing 1-element arrays as plot coordinates.
    y_min = float(np.min(ylim))
    y_span = float(np.max(ylim) - np.min(ylim))
    x_span = float(np.max(xlim) - np.min(xlim))

    # Normalise the sign first so that the derived location honours it.
    if report_scale > 0:
        report_scale = -np.abs(report_scale)
    if report_loc is None and report is not None:
        report_loc = y_min + report_scale * y_span

    yscale = np.float64(yscale)
    if y_loc is None:
        y_loc = y_min + yscale * y_span
    xcenter = np.mean([x1, x2])

    def _draw_stars(count):
        # Significance marker: ``symbol`` repeated ``count`` times.
        ax.text(
            xcenter,
            y_loc,
            symbol * count,
            ha="center",
            va="center_baseline",
            fontsize=fontsize,
            fontname=fontname,
            color=symbolcolor,
        )

    # ns / *
    if pval is not None:
        if alpha < pval:
            if nsshow == "on":
                ns_str = f"p={round(pval, 3)}" if pval < 0.9 else "ns"
                color = "m" if pval < 0.1 else "k"
                ax.text(
                    xcenter,
                    y_loc,
                    ns_str,
                    ha="center",
                    va="bottom",
                    fontsize=fontsize - 6 if fontsize > 6 else fontsize,
                    fontname=fontname,
                    color=color,
                    rotation=rotation,
                )
        elif 0.01 < pval <= alpha:
            _draw_stars(1)
        elif 0.001 < pval <= 0.01:
            _draw_stars(2)
        elif 0 < pval <= 0.001:
            _draw_stars(3)

    # lines indicators: use the same threshold as the marker above so that
    # stars and bracket always appear together.
    if linego and pval is not None and 0 < pval <= alpha:
        inset = x_span * 0.01
        left, right = x1 + inset, x2 - inset
        if yscale <= 0.99:
            anchor, drop = y_loc, 0.03
        else:
            # Marker sits at the very top: pin the bracket at 95% height.
            anchor, drop = y_min + 0.95 * y_span, 0.002
        bar_y = anchor - y_span * drop
        line_kw = dict(linestyle=linestyle, color=linecolor, linewidth=linewidth)
        # horizontal line
        ax.plot([left, right], [bar_y, bar_y], **line_kw)
        # vertical lines (tails)
        ax.plot([left, left], [anchor - y_span * tailindicator[0], bar_y], **line_kw)
        ax.plot([right, right], [anchor - y_span * tailindicator[1], bar_y], **line_kw)

    if values_below is not None:
        ax.text(
            xcenter,
            y_loc * (-0.1),
            values_below,
            ha="center",
            va="bottom",
            fontsize=fontsize_note,
            fontname=fontname,
            color="k",
        )

    # report / comments
    if report is not None:
        ax.text(
            xcenter,
            report_loc,
            report,
            ha="left",
            va="bottom",
            fontsize=fontsize_note,
            fontname=fontname,
            color=".7",
        )
|
207
|
+
|
208
|
+
|
209
|
+
def FuncCmpt(x1, x2, pmc="auto", pair="unpaired", verbose=True):
    """Compare two samples and return ``(pval, output)``.

    Parameters
    ----------
    x1, x2 : array-like
        The two samples; NaNs are excluded from the sample-size counts and
        omitted by the t-tests / rank tests.
    pmc : str
        Requested test family; normalised by ``corr_pmc`` to
        ``"parametric"``, ``"non-parametric"`` or ``"auto"``.
    pair : str
        Pairing of the samples; normalised by ``corr_pair``.
    verbose : bool
        Print the chosen method and its APA-style summary.

    Returns
    -------
    p : float
        P-value of the selected test.
    output : dict
        Keys ``"stat"``, ``"pval"``, ``"method"`` and ``"APA"``.

    Raises
    ------
    ValueError
        If the normalised ``pmc`` / ``pair`` combination selects no test.
    """
    # NOTE: corr_pmc, corr_pair and check_normality are not defined inside
    # this function; they are resolved from the enclosing module scope.

    def sub_cmpt_2group(x1, x2, normality, cfg_pmc="pmc", pair="unpaired", verbose=True):
        # Select and run the two-sample test for the normalised settings.
        output = {}
        nX1 = np.sum(~np.isnan(x1))
        nX2 = np.sum(~np.isnan(x2))
        is_unpaired = "np" in pair  # "unpaired" contains "np"
        is_paired = "pa" in pair and not is_unpaired
        notes_stat = None

        if cfg_pmc == "parametric" or cfg_pmc == "auto":
            # VarType correction by checking variance Type via "levene"
            _, pval_lev = stats.levene(x1, x2, center="median", proportiontocut=0.05)
            VarType = bool(pval_lev > 0.05 and nX1 == nX2)
            if is_unpaired:
                if VarType and normality:
                    # The independent t-test requires that the dependent
                    # variable is approximately normally distributed
                    # within each group.
                    stat_value, pval = stats.ttest_ind(
                        x1,
                        x2,
                        axis=0,
                        equal_var=True,
                        nan_policy="omit",
                        alternative="two-sided",
                    )
                    notes_stat = "unpaired t test"
                    notes_APA = (
                        f"t({nX1+nX2-2})={round(stat_value, 5)},p={round(pval, 5)}"
                    )
                else:
                    # Unequal variances: do not pool the error term; adjust
                    # the degrees of freedom with the Welch-Satterthwaite
                    # method instead.
                    stat_value, pval = stats.ttest_ind(
                        x1,
                        x2,
                        axis=0,
                        equal_var=False,
                        nan_policy="omit",
                        alternative="two-sided",
                    )
                    notes_stat = "Welchs t-test"
                    # Report the Welch-Satterthwaite df, not the pooled
                    # n1+n2-2, which does not apply to this test.
                    se1 = np.nanvar(x1, ddof=1) / nX1
                    se2 = np.nanvar(x2, ddof=1) / nX2
                    df_welch = (se1 + se2) ** 2 / (
                        se1**2 / (nX1 - 1) + se2**2 / (nX2 - 1)
                    )
                    notes_APA = (
                        f"t({round(df_welch, 2)})={round(stat_value, 5)},p={round(pval, 5)}"
                    )
            elif is_paired:
                # The paired-samples t-test is considered robust to moderate
                # violations of normality, so it is applied directly.
                stat_value, pval = stats.ttest_rel(
                    x1, x2, axis=0, nan_policy="omit", alternative="two-sided"
                )
                notes_stat = "paired t test"
                notes_APA = f"t({nX1-1})={round(stat_value, 5)},p={round(pval, 5)}"
        elif cfg_pmc == "non-parametric":
            if is_unpaired:  # Perform Mann-Whitney
                stat_value, pval = stats.mannwhitneyu(
                    x1, x2, method="exact", nan_policy="omit"
                )
                notes_stat = "Mann-Whitney U"
                if nX1 == nX2:
                    notes_APA = f"U(n={nX1})={round(stat_value, 5)},p={round(pval, 5)}"
                else:
                    notes_APA = f"U(n1={nX1},n2={nX2})={round(stat_value, 5)},p={round(pval, 5)}"
            elif is_paired:  # Wilcoxon signed-rank test
                stat_value, pval = stats.wilcoxon(
                    x1, x2, method="exact", nan_policy="omit"
                )
                notes_stat = "Wilcoxon signed-rank"
                if nX1 == nX2:
                    notes_APA = f"Z(n={nX1})={round(stat_value, 5)},p={round(pval, 5)}"
                else:
                    notes_APA = f"Z(n1={nX1},n2={nX2})={round(stat_value, 5)},p={round(pval, 5)}"

        if notes_stat is None:
            raise ValueError(
                f"no two-sample test for pmc={cfg_pmc!r}, pair={pair!r}"
            )

        # filling output
        output["stat"] = stat_value
        output["pval"] = pval
        output["method"] = notes_stat
        output["APA"] = notes_APA
        if verbose:
            print(f"{output['method']}\n {notes_APA}\n\n")

        return output, pval

    Normality1 = check_normality(x1, verbose=verbose)
    Normality2 = check_normality(x2, verbose=verbose)
    Normality = bool(Normality1 and Normality2)

    cfg_pmc = corr_pmc(pmc)
    cfg_pair = corr_pair(pair)
    if cfg_pair is None:
        # Fail with a clear message instead of an opaque TypeError below.
        raise ValueError(f"unrecognised pair setting: {pair!r}")

    output, p = sub_cmpt_2group(
        x1, x2, Normality, cfg_pmc=cfg_pmc, pair=cfg_pair, verbose=verbose
    )
    return p, output
|
243
353
|
|
354
|
+
|
244
355
|
# ======compare 2 group test===================================================
|
245
356
|
# # Example
|
246
|
-
#
|
247
|
-
#
|
357
|
+
# x1 = [19, 22, 16, 29, 24]
|
358
|
+
# x2 = [20, 11, 17, 12, 22]
|
248
359
|
|
249
|
-
# p, res= FuncCmpt(
|
360
|
+
# p, res= FuncCmpt(x1, x2, pmc='pmc', pair='unpaired')
|
250
361
|
|
251
362
|
# =============================================================================
|
252
363
|
|
@@ -270,192 +381,40 @@ def FuncCmpt(X1, X2, pmc='auto', pair='unpaired'):
|
|
270
381
|
# # 'friedman', # Non-parametric one-way repeated measures ANOVA
|
271
382
|
# # ]
|
272
383
|
# =============================================================================
|
273
|
-
def df_wide_long(df):
|
274
|
-
rows, columns = df.shape
|
275
|
-
if columns > rows:
|
276
|
-
return "Wide"
|
277
|
-
elif rows > columns:
|
278
|
-
return "Long"
|
279
384
|
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
f'\n NOT normally distributed\n')
|
304
|
-
return Normality
|
305
|
-
|
306
|
-
def corr_pmc(pmc):
|
307
|
-
cfg_pmc = None
|
308
|
-
if pmc.lower() in {'pmc', 'parametric'} and pmc.lower() not in {'upmc', 'npmc', 'nonparametric', 'non-parametric'}:
|
309
|
-
cfg_pmc = 'parametric'
|
310
|
-
elif pmc.lower() in {'upmc', 'npmc', 'nonparametric', 'non-parametric'} and pmc.lower() not in {'pmc', 'parametric'}:
|
311
|
-
cfg_pmc = 'non-parametric'
|
312
|
-
else:
|
313
|
-
cfg_pmc = 'auto'
|
314
|
-
return cfg_pmc
|
315
|
-
|
316
|
-
def extract_apa(res_tab):
|
317
|
-
notes_APA = []
|
318
|
-
if "ddof1" in res_tab:
|
319
|
-
for irow in range(res_tab.shape[0]):
|
320
|
-
note_tmp = f'{res_tab.Source[irow]}:F{round(res_tab.ddof1[irow]),round(res_tab.ddof2[irow])}={round(res_tab.F[irow], 5)},p={round(res_tab["p-unc"][irow], 5)}'
|
321
|
-
notes_APA.append([note_tmp])
|
322
|
-
elif "DF" in res_tab:
|
323
|
-
print(res_tab.shape[0])
|
324
|
-
for irow in range(res_tab.shape[0]-1):
|
325
|
-
note_tmp = f'{res_tab.Source[irow]}:F{round(res_tab.DF[irow]),round(res_tab.DF[res_tab.shape[0]-1])}={round(res_tab.F[irow], 5)},p={round(res_tab["p-unc"][irow], 5)}'
|
326
|
-
notes_APA.append([note_tmp])
|
327
|
-
notes_APA.append(['NaN'])
|
328
|
-
elif "DF1" in res_tab: # in 'mix' case
|
329
|
-
for irow in range(res_tab.shape[0]):
|
330
|
-
note_tmp = f'{res_tab.Source[irow]}:F{round(res_tab.DF1[irow]),round(res_tab.DF2[irow])}={round(res_tab.F[irow], 5)},p={round(res_tab["p-unc"][irow], 5)}'
|
331
|
-
notes_APA.append([note_tmp])
|
332
|
-
return notes_APA
|
333
|
-
|
334
|
-
def anovatable(res_tab):
|
335
|
-
if 'df' in res_tab: # statsmodels
|
336
|
-
res_tab['mean_sq'] = res_tab[:]['sum_sq']/res_tab[:]['df']
|
337
|
-
res_tab['est_sq'] = res_tab[:-1]['sum_sq'] / \
|
338
|
-
sum(res_tab['sum_sq'])
|
339
|
-
res_tab['omega_sq'] = (res_tab[:-1]['sum_sq']-(res_tab[:-1]['df'] *
|
340
|
-
res_tab['mean_sq'][-1]))/(sum(res_tab['sum_sq'])+res_tab['mean_sq'][-1])
|
341
|
-
elif 'DF' in res_tab:
|
342
|
-
res_tab['MS'] = res_tab[:]['SS']/res_tab[:]['DF']
|
343
|
-
res_tab['est_sq'] = res_tab[:-1]['SS']/sum(res_tab['SS'])
|
344
|
-
res_tab['omega_sq'] = (res_tab[:-1]['SS']-(res_tab[:-1]['DF'] *
|
345
|
-
res_tab['MS'][1]))/(sum(res_tab['SS'])+res_tab['MS'][1])
|
346
|
-
if 'p-unc' in res_tab:
|
347
|
-
if 'np2' in res_tab:
|
348
|
-
res_tab['est_sq'] = res_tab['np2']
|
349
|
-
if 'p-unc' in res_tab:
|
350
|
-
res_tab['PR(>F)'] = res_tab['p-unc']
|
351
|
-
return res_tab
|
352
|
-
|
353
|
-
def run_anova(data, dv, factor, ss_type=2, detailed=True, effsize='np2'):
|
354
|
-
# perform ANOVA
|
355
|
-
# =============================================================================
|
356
|
-
# # # ANOVA (input: formula, dataset)
|
357
|
-
# =============================================================================
|
358
|
-
# # note: if the data is balanced (equal sample size for each group), Type 1, 2, and 3 sums of squares
|
359
|
-
# # (typ parameter) will produce similar results.
|
360
|
-
# lm = ols("values ~ C(group)", data=df).fit()
|
361
|
-
# res_tab = anova_lm(lm, typ=ss_type)
|
362
|
-
|
363
|
-
# # however, it does not provide any effect size measures to tell if the
|
364
|
-
# # statistical significance is meaningful. The function below calculates
|
365
|
-
# # eta-squared () and omega-squared (). A quick note, is the exact same
|
366
|
-
# # thing as except when coming from the ANOVA framework people call it ;
|
367
|
-
# # is considered a better measure of effect size since it is unbiased in
|
368
|
-
# # it's calculation by accounting for the degrees of freedom in the model.
|
369
|
-
# # note: No effect sizes are calculated when using statsmodels.
|
370
|
-
# # to calculate eta squared, use the sum of squares from the table
|
371
|
-
# res_tab = anovatable(res_tab)
|
372
|
-
|
373
|
-
# =============================================================================
|
374
|
-
# # alternativ for ANOVA
|
375
|
-
# =============================================================================
|
376
|
-
res_tab = pg.anova(dv=dv, between=factor, data=data,
|
377
|
-
detailed=detailed, ss_type=ss_type, effsize=effsize)
|
378
|
-
res_tab = anovatable(res_tab)
|
379
|
-
return res_tab
|
380
|
-
|
381
|
-
def run_rmanova(data, dv, factor, subject, correction='auto', detailed=True, effsize='ng2'):
|
382
|
-
# One-way repeated-measures ANOVA using a long-format dataset.
|
383
|
-
res_tab = pg.rm_anova(data=data, dv=dv, within=factor,
|
384
|
-
subject=subject, detailed=detailed, effsize=effsize)
|
385
|
-
return res_tab
|
386
|
-
|
387
|
-
def run_welchanova(data, dv, factor):
|
388
|
-
# When the groups are balanced and have equal variances, the optimal
|
389
|
-
# post-hoc test is the Tukey-HSD test (pingouin.pairwise_tukey()). If the
|
390
|
-
# groups have unequal variances, the Games-Howell test is more adequate
|
391
|
-
# (pingouin.pairwise_gameshowell()). Results have been tested against R.
|
392
|
-
res_tab = pg.welch_anova(data=data, dv=dv, between=factor)
|
393
|
-
res_tab = anovatable(res_tab)
|
394
|
-
return res_tab
|
395
|
-
|
396
|
-
def run_mixedanova(data, dv, between, within, subject, correction='auto', effsize='np2'):
|
397
|
-
# Notes
|
398
|
-
# Data are expected to be in long-format (even the repeated measures).
|
399
|
-
# If your data is in wide-format, you can use the pandas.melt() function
|
400
|
-
# to convert from wide to long format.
|
401
|
-
|
402
|
-
# Warning
|
403
|
-
# If the between-subject groups are unbalanced(=unequal sample sizes), a
|
404
|
-
# type II ANOVA will be computed. Note however that SPSS, JAMOVI and JASP
|
405
|
-
# by default return a type III ANOVA, which may lead to slightly different
|
406
|
-
# results.
|
407
|
-
res_tab = pg.mixed_anova(data=data, dv=dv, within=within, subject=subject,
|
408
|
-
between=between, correction=correction, effsize=effsize)
|
409
|
-
res_tab = anovatable(res_tab)
|
410
|
-
return res_tab
|
411
|
-
|
412
|
-
def run_friedman(data, dv, factor, subject, method='chisq'):
|
413
|
-
# Friedman test for repeated measurements
|
414
|
-
# The Friedman test is used for non-parametric (rank-based) one-way
|
415
|
-
# repeated measures ANOVA
|
416
|
-
|
417
|
-
# check df form ('long' or 'wide')
|
418
|
-
# df_long = data.melt(ignore_index=False).reset_index()
|
419
|
-
# if data.describe().shape[1] >= df_long.describe().shape[1]:
|
420
|
-
# res_tab = pg.friedman(data, method=method)
|
421
|
-
# else:
|
422
|
-
# res_tab = pg.friedman(data=df_long, dv='value',
|
423
|
-
# within="variable", subject="index", method=method)
|
424
|
-
if "Wide" in df_wide_long(data):
|
425
|
-
df_long = data.melt(ignore_index=False).reset_index()
|
426
|
-
res_tab = pg.friedman(data=df_long, dv='value',
|
427
|
-
within="variable", subject="index", method=method)
|
428
|
-
else:
|
429
|
-
res_tab = pg.friedman(data, dv=dv, within=factor, subject=subject,method=method)
|
430
|
-
res_tab = anovatable(res_tab)
|
431
|
-
return res_tab
|
432
|
-
|
433
|
-
def run_kruskal(data, dv, factor):
|
434
|
-
# Kruskal-Wallis H-test for independent samples
|
435
|
-
res_tab = pg.kruskal(data=data, dv=dv, between=factor)
|
436
|
-
res_tab = anovatable(res_tab)
|
437
|
-
return res_tab
|
438
|
-
|
439
|
-
# Normality Check:
|
440
|
-
# Conduct normality tests (Shapiro-Wilk) for each group.
|
441
|
-
# If the data is approximately normally distributed, ANOVA is robust to
|
442
|
-
# moderate departures from normality, especially with larger sample sizes.
|
443
|
-
|
444
|
-
# print(data[factor])
|
445
|
-
# print(type(data[factor]))
|
446
|
-
# print(len(data[factor].columns))
|
447
|
-
# print(data[factor].nunique())
|
448
|
-
# print(data[factor[0]])
|
449
|
-
# print(data[factor[0]].unique())
|
385
|
+
|
386
|
+
def str_mean_sem(data: list, delimit=5):
    """Return ``"mean±sem"`` for *data*, ignoring NaNs.

    Parameters
    ----------
    data : array-like
        Numeric values of any shape; all non-NaN entries are pooled.
    delimit : int
        Number of decimals both numbers are rounded to.

    Returns
    -------
    str
        The rounded mean and standard error of the mean (sample standard
        deviation, ``ddof=1``, divided by the square root of the number of
        non-NaN values), joined by ``"±"``.
    """
    values = np.asarray(data, dtype=float)
    mean_ = np.nanmean(values)
    # np.sum reduces over every axis; the builtin sum would only collapse
    # the first one and yield an array for non-1-D input.
    n_valid = np.sum(~np.isnan(values))
    sem_ = np.nanstd(values, ddof=1) / np.sqrt(n_valid)
    return str(round(mean_, delimit)) + "±" + str(round(sem_, delimit))
|
390
|
+
|
391
|
+
|
392
|
+
def FuncMultiCmpt(
|
393
|
+
pmc="pmc",
|
394
|
+
pair="unpair",
|
395
|
+
data=None,
|
396
|
+
dv=None,
|
397
|
+
factor=None,
|
398
|
+
ss_type=2,
|
399
|
+
detailed=True,
|
400
|
+
effsize="np2",
|
401
|
+
correction="auto",
|
402
|
+
between=None,
|
403
|
+
within=None,
|
404
|
+
subject=None,
|
405
|
+
group=None,
|
406
|
+
verbose=True,
|
407
|
+
):
|
450
408
|
if group is None:
|
451
409
|
group = factor
|
452
410
|
|
453
|
-
# print(f'\ngroup is :\n{data[group]},\ndv is :\n{dv}\n')
|
454
411
|
norm_array = []
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
412
|
+
if len(group) > 1:
|
413
|
+
pass
|
414
|
+
else:
|
415
|
+
for sub_group in data[group].unique():
|
416
|
+
norm_curr = check_normality(data.loc[data[group] == sub_group, dv])
|
417
|
+
norm_array.append(norm_curr)
|
459
418
|
norm_all = True if all(norm_array) else False
|
460
419
|
|
461
420
|
# Homogeneity of Variances:
|
@@ -477,55 +436,74 @@ def FuncMultiCmpt(pmc='pmc', pair='unpair', data=None, dv=None, factor=None,
|
|
477
436
|
# # method2: pingouin.homoscedasticity
|
478
437
|
# =============================================================================
|
479
438
|
res_levene = None
|
480
|
-
|
481
|
-
|
482
|
-
|
439
|
+
if len(group) > 1:
|
440
|
+
pass
|
441
|
+
else:
|
442
|
+
variance_all = pg.homoscedasticity(
|
443
|
+
data, dv=dv, group=group, method="levene", alpha=0.05
|
444
|
+
)
|
445
|
+
res_levene = True if variance_all.iloc[0, 1] > 0.05 else False
|
483
446
|
# =============================================================================
|
484
447
|
# # ANOVA Assumptions:
|
485
448
|
# # Ensure that the assumptions of independence, homogeneity of variances, and
|
486
449
|
# # normality are reasonably met before proceeding.
|
487
450
|
# =============================================================================
|
488
|
-
notes_norm =
|
489
|
-
notes_variance =
|
490
|
-
print(f
|
451
|
+
notes_norm = "normally" if norm_all else "NOT-normally"
|
452
|
+
notes_variance = "equal" if res_levene else "unequal"
|
453
|
+
print(f"Data is {notes_norm} distributed, shows {notes_variance} variance")
|
491
454
|
|
492
455
|
cfg_pmc = corr_pmc(pmc)
|
493
456
|
cfg_pair = corr_pair(pair)
|
494
457
|
output = {}
|
495
|
-
if (cfg_pmc ==
|
496
|
-
if
|
497
|
-
if cfg_pmc ==
|
458
|
+
if (cfg_pmc == "parametric") or (cfg_pmc == "auto"):
|
459
|
+
if "np" in cfg_pair: # 'unpaired'
|
460
|
+
if cfg_pmc == "auto":
|
498
461
|
if norm_all:
|
499
462
|
if res_levene:
|
500
|
-
res_tab = run_anova(
|
501
|
-
|
502
|
-
|
463
|
+
res_tab = run_anova(
|
464
|
+
data,
|
465
|
+
dv,
|
466
|
+
factor,
|
467
|
+
ss_type=ss_type,
|
468
|
+
detailed=True,
|
469
|
+
effsize="np2",
|
470
|
+
)
|
471
|
+
notes_stat = f"{data[factor].nunique()} Way ANOVA"
|
503
472
|
notes_APA = extract_apa(res_tab)
|
504
473
|
|
505
474
|
else:
|
506
475
|
res_tab = run_welchanova(data, dv, factor)
|
507
|
-
notes_stat = f
|
476
|
+
notes_stat = f"{data[factor].nunique()} Way Welch ANOVA"
|
508
477
|
notes_APA = extract_apa(res_tab)
|
509
|
-
|
510
478
|
else:
|
511
|
-
|
512
479
|
res_tab = run_kruskal(data, dv, factor)
|
513
|
-
notes_stat =
|
480
|
+
notes_stat = (
|
481
|
+
f"Non-parametric Kruskal: {data[factor].nunique()} Way ANOVA"
|
482
|
+
)
|
514
483
|
notes_APA = extract_apa(res_tab)
|
515
484
|
|
516
|
-
elif cfg_pmc ==
|
517
|
-
res_tab = run_anova(
|
518
|
-
|
519
|
-
|
485
|
+
elif cfg_pmc == "parametric":
|
486
|
+
res_tab = run_anova(
|
487
|
+
data, dv, factor, ss_type=ss_type, detailed=True, effsize="np2"
|
488
|
+
)
|
489
|
+
notes_stat = f"{data[factor].nunique()} Way ANOVA"
|
520
490
|
notes_APA = extract_apa(res_tab)
|
521
491
|
|
522
|
-
elif
|
523
|
-
res_tab = run_rmanova(
|
524
|
-
|
525
|
-
|
492
|
+
elif "pa" in cfg_pair and "np" not in cfg_pair: # 'paired'
|
493
|
+
res_tab = run_rmanova(
|
494
|
+
data,
|
495
|
+
dv,
|
496
|
+
factor,
|
497
|
+
subject,
|
498
|
+
correction="auto",
|
499
|
+
detailed=True,
|
500
|
+
effsize="ng2",
|
501
|
+
)
|
502
|
+
notes_stat = f"{data[factor].nunique()} Way Repeated measures ANOVA"
|
526
503
|
notes_APA = extract_apa(res_tab)
|
527
504
|
|
528
|
-
elif
|
505
|
+
elif "mix" in cfg_pair or "both" in cfg_pair:
|
506
|
+
print("mix")
|
529
507
|
res_tab = run_mixedanova(data, dv, between, within, subject)
|
530
508
|
# notes_stat = f'{len(sum(len(between)+sum(len(within))))} Way Mixed ANOVA'
|
531
509
|
notes_stat = ""
|
@@ -533,15 +511,15 @@ def FuncMultiCmpt(pmc='pmc', pair='unpair', data=None, dv=None, factor=None,
|
|
533
511
|
# print(n_inter)
|
534
512
|
notes_APA = extract_apa(res_tab)
|
535
513
|
|
536
|
-
elif cfg_pmc ==
|
537
|
-
if
|
514
|
+
elif cfg_pmc == "non-parametric":
|
515
|
+
if "np" in cfg_pair: # 'unpaired'
|
538
516
|
res_tab = run_kruskal(data, dv, factor)
|
539
|
-
notes_stat = f
|
517
|
+
notes_stat = f"Non-parametric Kruskal: {data[factor].nunique()} Way ANOVA"
|
540
518
|
notes_APA = f'H({res_tab.ddof1[0]},n={data.shape[0]})={round(res_tab.H[0], 5)},p={round(res_tab["p-unc"][0], 5)}'
|
541
519
|
|
542
|
-
elif
|
543
|
-
res_tab = run_friedman(data, dv, factor, subject, method=
|
544
|
-
notes_stat = f
|
520
|
+
elif "pa" in cfg_pair and "np" not in cfg_pair: # 'paired'
|
521
|
+
res_tab = run_friedman(data, dv, factor, subject, method="chisq")
|
522
|
+
notes_stat = f"Non-parametric {data[factor].nunique()} Way Friedman repeated measures ANOVA"
|
545
523
|
notes_APA = f'X^2({res_tab.ddof1[0]})={round(res_tab.Q[0], 5)},p={round(res_tab["p-unc"][0], 5)}'
|
546
524
|
|
547
525
|
# =============================================================================
|
@@ -551,18 +529,13 @@ def FuncMultiCmpt(pmc='pmc', pair='unpair', data=None, dv=None, factor=None,
|
|
551
529
|
# Tukey's HSD, Bonferroni, or Scheffé) to identify which groups differ from each other.
|
552
530
|
# # https://pingouin-stats.org/build/html/generated/pingouin.pairwise_tests.html
|
553
531
|
# =============================================================================
|
554
|
-
go_pmc = True if cfg_pmc ==
|
555
|
-
go_subject = subject if (
|
556
|
-
|
557
|
-
go_mix_between =
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
go_mix_within = within if ('mix' in cfg_pair) or (
|
562
|
-
'both' in cfg_pair) else None
|
563
|
-
go_mix_within = factor if ('pa' in cfg_pair) or (
|
564
|
-
'np' not in cfg_pair) else None
|
565
|
-
if res_tab['p-unc'][0] <= .05:
|
532
|
+
go_pmc = True if cfg_pmc == "parametric" else False
|
533
|
+
go_subject = subject if ("pa" in cfg_pair) and ("np" not in cfg_pair) else None
|
534
|
+
go_mix_between = between if ("mix" in cfg_pair) or ("both" in cfg_pair) else None
|
535
|
+
go_mix_between = None if ("pa" in cfg_pair) or ("np" not in cfg_pair) else factor
|
536
|
+
go_mix_within = within if ("mix" in cfg_pair) or ("both" in cfg_pair) else None
|
537
|
+
go_mix_within = factor if ("pa" in cfg_pair) or ("np" not in cfg_pair) else None
|
538
|
+
if res_tab["p-unc"][0] <= 0.05:
|
566
539
|
# Pairwise Comparisons
|
567
540
|
method_post_hoc = [
|
568
541
|
"bonf", # 'bonferroni', # : one-step correction
|
@@ -571,32 +544,319 @@ def FuncMultiCmpt(pmc='pmc', pair='unpair', data=None, dv=None, factor=None,
|
|
571
544
|
"fdr_bh", # Benjamini/Hochberg (non-negative)
|
572
545
|
"fdr_by", # Benjamini/Yekutieli (negative)
|
573
546
|
]
|
547
|
+
# *********? not work properly below*********
|
548
|
+
# res_posthoc = pd.DataFrame()
|
549
|
+
|
550
|
+
# for met in method_post_hoc:
|
551
|
+
# post_curr = pg.pairwise_tests(
|
552
|
+
# data=data,
|
553
|
+
# dv=dv,
|
554
|
+
# between=go_mix_between,
|
555
|
+
# within=go_mix_within,
|
556
|
+
# subject=go_subject,
|
557
|
+
# parametric=go_pmc,
|
558
|
+
# marginal=True,
|
559
|
+
# alpha=0.05,
|
560
|
+
# alternative="two-sided",
|
561
|
+
# padjust=met,
|
562
|
+
# nan_policy="listwise",#"pairwise"
|
563
|
+
# return_desc=True
|
564
|
+
# )
|
565
|
+
# res_posthoc = pd.concat([res_posthoc, post_curr], ignore_index=True)
|
566
|
+
# *********? not work properly above *********
|
567
|
+
|
568
|
+
# add ttest
|
569
|
+
data_within = df2array(data=data, x=factor, y=dv)
|
570
|
+
colname_within = data[factor].unique().tolist()
|
571
|
+
nrow, ncol = data_within.shape
|
574
572
|
res_posthoc = pd.DataFrame()
|
575
|
-
for
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
573
|
+
for icol in range(ncol):
|
574
|
+
for icol_ in range(1, ncol):
|
575
|
+
if icol_ > icol:
|
576
|
+
res_posthoc_ = pd.DataFrame()
|
577
|
+
_, res__ = FuncCmpt(
|
578
|
+
x1=data_within[:, icol],
|
579
|
+
x2=data_within[:, icol_],
|
580
|
+
pmc=pmc,
|
581
|
+
pair=pair,
|
582
|
+
verbose=False,
|
583
|
+
)
|
584
|
+
res_posthoc_["A"] = pd.Series(colname_within[icol])
|
585
|
+
res_posthoc_["B"] = pd.Series(colname_within[icol_])
|
586
|
+
res_posthoc_["mean(A)"] = pd.Series(
|
587
|
+
str_mean_sem(data_within[:, icol])
|
588
|
+
)
|
589
|
+
res_posthoc_["mean(B)"] = pd.Series(
|
590
|
+
str_mean_sem(data_within[:, icol_])
|
591
|
+
)
|
592
|
+
res_posthoc_["APA"] = pd.Series(res__["APA"])
|
593
|
+
res_posthoc_["p-unc"] = pd.Series(res__["pval"])
|
594
|
+
res_posthoc_["method"] = pd.Series(res__["method"])
|
595
|
+
res_posthoc = pd.concat(
|
596
|
+
[res_posthoc, res_posthoc_], ignore_index=True
|
597
|
+
)
|
580
598
|
else:
|
581
599
|
res_posthoc = None
|
582
|
-
output[
|
600
|
+
output["res_posthoc"] = res_posthoc
|
583
601
|
# =============================================================================
|
584
602
|
# # filling output
|
585
603
|
# =============================================================================
|
586
604
|
|
587
|
-
pd.set_option(
|
588
|
-
output[
|
605
|
+
pd.set_option("display.max_columns", None)
|
606
|
+
output["stat"] = notes_stat
|
589
607
|
# print(output['APA'])
|
590
|
-
output[
|
591
|
-
output[
|
592
|
-
output[
|
608
|
+
output["APA"] = notes_APA
|
609
|
+
output["pval"] = res_tab["p-unc"]
|
610
|
+
output["res_tab"] = res_tab
|
593
611
|
if res_tab.shape[0] == len(notes_APA):
|
594
|
-
output[
|
595
|
-
# print(output['stat'])
|
596
|
-
# print(output['res_tab'])
|
612
|
+
output["res_tab"]["APA"] = output["APA"] # note APA in the table
|
597
613
|
return output
|
598
614
|
|
599
615
|
|
616
|
+
def display_output(output: dict):
    """Print the APA note, the results table and the post-hoc table.

    Parameters:
        output : dict or pd.DataFrame
            Result container with the optional keys ``"APA"``, ``"res_tab"``
            and ``"res_posthoc"``. A DataFrame is first converted with
            ``to_dict(orient="list")``.

    Each available section is printed as a header followed by its value.
    Sections whose key is missing (or an ``output`` that cannot be indexed)
    are skipped, so the function stays best-effort and never raises for
    incomplete results.
    """
    import builtins

    if isinstance(output, pd.DataFrame):
        output = output.to_dict(orient="list")

    # ``display`` only exists when a module-level import or IPython provides
    # it; fall back to ``print`` so the values are still shown in plain Python.
    show = globals().get("display") or getattr(builtins, "display", print)

    sections = (
        ("APA:", "APA"),
        ("results table:", "res_tab"),
        ("posthoc:", "res_posthoc"),
    )
    for header, key in sections:
        try:
            value = output[key]
        except (KeyError, IndexError, TypeError):
            # Section not available in this result; nothing to show.
            continue
        print(header)
        show(value)
|
637
|
+
|
638
|
+
|
639
|
+
def corr_pair(pair):
    """Normalise a free-form pairing keyword.

    Parameters:
        pair : str
            User supplied keyword, matched case-insensitively by substring.

    Returns:
        str or None
            ``"unpaired"`` if the keyword contains ``"np"`` (e.g. "unpaired",
            "unpair"), ``"paired"`` if it contains ``"pa"`` without ``"np"``,
            ``"mix"`` if it contains ``"mix"`` or ``"both"``, otherwise
            ``None``.
    """
    key = pair.lower()
    # "np" must be tested first: "unpaired" also contains "pa".
    if "np" in key:
        return "unpaired"
    if "pa" in key:
        return "paired"
    # "both" is accepted as a synonym of "mix", as corr_pmc does.
    if "mix" in key or "both" in key:
        return "mix"
    return None
|
648
|
+
|
649
|
+
|
650
|
+
def check_normality(data, verbose=True, alpha=0.05):
    """Test a sample for normality with the Shapiro-Wilk test.

    Parameters:
        data : array-like
            Numeric sample. NaN entries are ignored, so NaN-padded columns
            are judged on their actual observations only.
        verbose : bool, optional
            Print the verdict (default True).
        alpha : float, optional
            Significance level; the sample is considered normal when the
            Shapiro-Wilk p-value is greater than ``alpha`` (default 0.05).

    Returns:
        bool
            True if the sample is compatible with a normal distribution.
    """
    values = np.asarray(data, dtype=float).ravel()
    # A single NaN would turn the p-value into NaN and always yield False.
    values = values[~np.isnan(values)]
    _, pval_shapiro = stats.shapiro(values)
    is_normal = bool(pval_shapiro > alpha)
    if verbose:
        if is_normal:
            print("\n normally distributed\n")
        else:
            print("\n NOT normally distributed\n")
    return is_normal
|
663
|
+
|
664
|
+
|
665
|
+
def corr_pmc(pmc):
    """Normalise a parametric / non-parametric keyword.

    The keyword is compared case-insensitively against a fixed alias table.

    Returns:
        str
            ``"parametric"``, ``"non-parametric"`` or ``"mix"`` for a known
            alias, and ``"auto"`` for anything else.
    """
    aliases = {
        "pmc": "parametric",
        "parametric": "parametric",
        "upmc": "non-parametric",
        "npmc": "non-parametric",
        "nonparametric": "non-parametric",
        "non-parametric": "non-parametric",
        "mix": "mix",
        "both": "mix",
    }
    return aliases.get(pmc.lower(), "auto")
|
689
|
+
|
690
|
+
|
691
|
+
def extract_apa(res_tab):
    """Build APA-style ``Source:F(df1, df2)=F,p=p`` notes from an ANOVA table.

    Parameters:
        res_tab : pd.DataFrame
            Table with the columns ``Source``, ``F`` and ``p-unc`` plus one of
            the degree-of-freedom layouts ``ddof1``/``ddof2``, ``DF`` (the
            last row supplies the denominator df) or ``DF1``/``DF2``.

    Returns:
        list of list of str
            One single-element list per table row. In the ``DF`` layout the
            last row gets the placeholder ``["NaN"]``. An empty list is
            returned when no known layout is found.

    Rows are read by position, so the table index does not have to be
    0..n-1, and degrees of freedom are converted to plain ints so the note
    reads ``F(2, 27)`` regardless of the NumPy scalar repr.
    """

    def _note(irow, df_num, df_den):
        # One APA note for the row at position ``irow``.
        source = res_tab["Source"].iloc[irow]
        f_val = round(res_tab["F"].iloc[irow], 5)
        p_val = round(res_tab["p-unc"].iloc[irow], 5)
        dfs = f"({int(round(df_num))}, {int(round(df_den))})"
        return [f"{source}:F{dfs}={f_val},p={p_val}"]

    n_rows = res_tab.shape[0]
    notes_APA = []
    if "ddof1" in res_tab:
        for irow in range(n_rows):
            notes_APA.append(
                _note(irow, res_tab["ddof1"].iloc[irow], res_tab["ddof2"].iloc[irow])
            )
    elif "DF" in res_tab:
        # The last row provides the denominator df and has no F of its own.
        for irow in range(n_rows - 1):
            notes_APA.append(
                _note(irow, res_tab["DF"].iloc[irow], res_tab["DF"].iloc[-1])
            )
        notes_APA.append(["NaN"])
    elif "DF1" in res_tab:  # in 'mix' case
        for irow in range(n_rows):
            notes_APA.append(
                _note(irow, res_tab["DF1"].iloc[irow], res_tab["DF2"].iloc[irow])
            )
    return notes_APA
|
707
|
+
|
708
|
+
|
709
|
+
def anovatable(res_tab):
    """Add effect-size and convenience columns to an ANOVA table in place.

    Parameters:
        res_tab : pd.DataFrame
            Either a statsmodels-style table (columns ``sum_sq``, ``df``) or
            a table with ``SS`` and ``DF`` columns. The last row is taken as
            the error/residual term.

    Returns:
        pd.DataFrame
            The same object, with these columns added where applicable:
            mean squares (``mean_sq`` or ``MS``), ``est_sq`` (SS_effect /
            SS_total), ``omega_sq`` ((SS_effect - df_effect * MS_error) /
            (SS_total + MS_error)); and, if ``p-unc`` is present, ``PR(>F)``
            (copy of ``p-unc``) and ``est_sq`` overwritten by ``np2`` when
            that column exists.

    Effect columns are computed for all rows but the last; the last row is
    NaN through index alignment.
    """
    if "df" in res_tab:  # statsmodels
        ss_col, df_col, ms_col = "sum_sq", "df", "mean_sq"
    elif "DF" in res_tab:
        ss_col, df_col, ms_col = "SS", "DF", "MS"
    else:
        ss_col = None

    if ss_col is not None:
        res_tab[ms_col] = res_tab[ss_col] / res_tab[df_col]
        ss_total = sum(res_tab[ss_col])
        # Positional access: the error term is the last row whatever the
        # index labels are (strings for statsmodels, integers otherwise).
        ms_error = res_tab[ms_col].iloc[-1]
        effects = res_tab.iloc[:-1]
        res_tab["est_sq"] = effects[ss_col] / ss_total
        res_tab["omega_sq"] = (effects[ss_col] - effects[df_col] * ms_error) / (
            ss_total + ms_error
        )

    if "p-unc" in res_tab:
        if "np2" in res_tab:
            res_tab["est_sq"] = res_tab["np2"]
        res_tab["PR(>F)"] = res_tab["p-unc"]
    return res_tab
|
728
|
+
|
729
|
+
|
730
|
+
def run_anova(data, dv, factor, ss_type=2, detailed=True, effsize="np2"):
    """Run a between-subject ANOVA with ``pingouin.anova``.

    Parameters:
        data : pd.DataFrame
            Input data, passed to pingouin unchanged.
        dv : str
            Name of the dependent-variable column.
        factor : str or list of str
            Passed to pingouin as the ``between`` factor(s).
        ss_type, detailed, effsize
            Forwarded unchanged to ``pingouin.anova``.

    Returns:
        pd.DataFrame
            The pingouin table after post-processing by ``anovatable``.

    Note: if the data is balanced (equal sample size for each group),
    Type 1, 2 and 3 sums of squares produce similar results.
    """
    anova_kwargs = {
        "data": data,
        "dv": dv,
        "between": factor,
        "ss_type": ss_type,
        "detailed": detailed,
        "effsize": effsize,
    }
    return anovatable(pg.anova(**anova_kwargs))
|
763
|
+
|
764
|
+
|
765
|
+
def run_rmanova(
    data, dv, factor, subject, correction="auto", detailed=True, effsize="ng2"
):
    """Run a one-way repeated-measures ANOVA on a long-format dataset.

    Parameters:
        data : pd.DataFrame
            Long-format data.
        dv : str
            Name of the dependent-variable column.
        factor : str
            Passed to ``pingouin.rm_anova`` as the ``within`` factor.
        subject : str
            Name of the subject identifier column.
        correction : str or bool, optional
            Sphericity-correction setting forwarded to ``pingouin.rm_anova``
            (default ``"auto"``).
        detailed, effsize
            Forwarded unchanged to ``pingouin.rm_anova``.

    Returns:
        pd.DataFrame
            The table returned by ``pingouin.rm_anova``.
    """
    res_tab = pg.rm_anova(
        data=data,
        dv=dv,
        within=factor,
        subject=subject,
        # Previously accepted but silently dropped; forward it so callers
        # can actually control the sphericity correction.
        correction=correction,
        detailed=detailed,
        effsize=effsize,
    )
    return res_tab
|
778
|
+
|
779
|
+
|
780
|
+
def run_welchanova(data, dv, factor):
    """Run Welch's ANOVA with ``pingouin.welch_anova``.

    ``factor`` is passed as the ``between`` factor and the resulting table is
    post-processed by ``anovatable``.

    Post-hoc guidance carried over from the pingouin notes: with balanced
    groups and equal variances the Tukey-HSD test
    (``pingouin.pairwise_tukey()``) is the optimal post-hoc test; with
    unequal variances the Games-Howell test
    (``pingouin.pairwise_gameshowell()``) is more adequate.
    """
    welch_table = pg.welch_anova(data=data, dv=dv, between=factor)
    return anovatable(welch_table)
|
788
|
+
|
789
|
+
|
790
|
+
def run_mixedanova(
    data, dv, between, within, subject, correction="auto", effsize="np2"
):
    """Run a mixed-design ANOVA with ``pingouin.mixed_anova``.

    All arguments are forwarded by keyword to pingouin and the resulting
    table is post-processed by ``anovatable``.

    Notes:
        Data are expected to be in long format (even the repeated measures).
        Wide-format data can be converted with ``pandas.melt()``.

    Warning:
        If the between-subject groups are unbalanced (unequal sample sizes),
        a type II ANOVA is computed. SPSS, JAMOVI and JASP return a type III
        ANOVA by default, which may lead to slightly different results.
    """
    mixed_kwargs = {
        "data": data,
        "dv": dv,
        "within": within,
        "subject": subject,
        "between": between,
        "correction": correction,
        "effsize": effsize,
    }
    return anovatable(pg.mixed_anova(**mixed_kwargs))
|
814
|
+
|
815
|
+
|
816
|
+
def run_friedman(data, dv, factor, subject, method="chisq"):
    """Run the Friedman test (non-parametric one-way repeated-measures ANOVA).

    Parameters:
        data : pd.DataFrame
            Data in long or wide format; the format is guessed by
            ``df_wide_long``.
        dv, factor, subject : str
            Column names used for long-format data (``factor`` is passed as
            the ``within`` factor). Ignored for wide-format data.
        method : str, optional
            Forwarded to ``pingouin.friedman`` (default ``"chisq"``).

    Returns:
        pd.DataFrame
            The pingouin table after post-processing by ``anovatable``.
    """
    # Compare with ``==``: df_wide_long returns None for a square frame, and
    # ``"Wide" in None`` would raise TypeError. Anything that is not clearly
    # wide is analysed with the explicitly supplied long-format columns.
    if df_wide_long(data) == "Wide":
        # melt() names the new columns "variable"/"value"; reset_index()
        # yields an "index" column -- assumes the index is unnamed.
        df_long = data.melt(ignore_index=False).reset_index()
        res_tab = pg.friedman(
            data=df_long,
            dv="value",
            within="variable",
            subject="index",
            method=method,
        )
    else:
        res_tab = pg.friedman(
            data, dv=dv, within=factor, subject=subject, method=method
        )
    res_tab = anovatable(res_tab)
    return res_tab
|
843
|
+
|
844
|
+
|
845
|
+
def run_kruskal(data, dv, factor):
    """Run the Kruskal-Wallis H-test for independent samples.

    Calls ``pingouin.kruskal`` with ``factor`` as the ``between`` factor and
    returns the table after post-processing by ``anovatable``.
    """
    kruskal_table = pg.kruskal(data=data, dv=dv, between=factor)
    return anovatable(kruskal_table)
|
850
|
+
|
851
|
+
|
852
|
+
def df_wide_long(df):
    """Guess the layout of a 2-D table from its shape.

    Returns:
        str or None
            ``"Wide"`` when there are more columns than rows, ``"Long"``
            when there are more rows than columns, and ``None`` when both
            counts are equal.
    """
    n_rows, n_cols = df.shape
    if n_rows == n_cols:
        return None
    return "Wide" if n_cols > n_rows else "Long"
|
858
|
+
|
859
|
+
|
600
860
|
# =============================================================================
|
601
861
|
# # One-way ANOVA
|
602
862
|
# =============================================================================
|
@@ -807,4 +1067,138 @@ def FuncMultiCmpt(pmc='pmc', pair='unpair', data=None, dv=None, factor=None,
|
|
807
1067
|
|
808
1068
|
# =============================================================================
|
809
1069
|
# # convert to list to string
|
810
|
-
# =============================================================================
|
1070
|
+
# =============================================================================
|
1071
|
+
|
1072
|
+
|
1073
|
+
def sort_rows_move_nan(arr, sort=False):
    """Push the NaNs of every row to the end and drop all-NaN rows/columns.

    Parameters:
        arr : 2-D array-like of numbers
            Input data.
        sort : bool, optional
            If True each row is additionally sorted in ascending order;
            if False (default) the original order of the non-NaN values is
            kept.

    Returns:
        np.ndarray
            Float array with NaNs at the end of each row, after removing
            rows and then columns that consist only of NaN. If the whole
            input is NaN (or empty) it is returned unchanged.
    """
    # Handle the edge case where all values are NaN
    if np.all(np.isnan(arr)):
        return arr

    values = np.asarray(arr, dtype=float)
    if sort:
        # np.sort already places NaN after every other value (including
        # +inf), so no temporary sentinel value is needed.
        packed = np.sort(values, axis=1)
    else:
        # A stable argsort of the NaN mask moves NaNs last while keeping the
        # relative order of the remaining values.
        order = np.argsort(np.isnan(values), axis=1, kind="stable")
        packed = np.take_along_axis(values, order, axis=1)

    # Remove rows, then columns, that contain only NaNs
    packed = packed[~np.isnan(packed).all(axis=1)]
    return packed[:, ~np.isnan(packed).all(axis=0)]
|
1107
|
+
|
1108
|
+
|
1109
|
+
def df2array(data: pd.DataFrame, x, y, hue=None, sort=False):
    """Collect the ``y`` values of each group into the columns of an array.

    Parameters:
        data : pd.DataFrame
            Long-format data.
        x : str
            Grouping column; one output column per category.
        y : str
            Column holding the numeric values.
        hue : str, optional
            Second grouping column. When given, one output column is built
            per (x, hue) combination, with hue varying fastest.
        sort : bool, optional
            If True the categories are ordered with ``np.sort``; otherwise
            they keep their order of first appearance (default).

    Returns:
        np.ndarray
            2-D array with one column per group, NaN-padded to the longest
            group and cleaned by ``sort_rows_move_nan`` (which also drops
            groups that contain no values).
    """

    def _levels(column):
        # Categories of ``column``, optionally sorted.
        levels = data[column].unique().tolist()
        return np.sort(levels).tolist() if sort else levels

    # Previously the sorted categories were discarded when hue was None,
    # leaving ``cat_x`` undefined for sort=True.
    cat_x = _levels(x)
    if hue is None:
        groups = [data.loc[data[x] == x_, y].to_list() for x_ in cat_x]
    else:
        cat_hue = _levels(hue)
        groups = [
            data.loc[(data[x] == x_) & (data[hue] == hue_), y].to_list()
            for x_ in cat_x
            for hue_ in cat_hue
        ]
    # Pad all groups in one call (one row per group), then transpose so
    # that every group becomes a column.
    return sort_rows_move_nan(padcat(*groups, axis=0)).T
|
1133
|
+
|
1134
|
+
|
1135
|
+
def padcat(*args, fill_value=np.nan, axis=1, order="row"):
    """
    Concatenate vectors with padding.

    Parameters:
    *args : variable number of list or 1D/2D arrays
        Input arrays to concatenate. A 2D array is split into its rows
        (axis=0) or its columns (axis=1) before concatenation.
    fill_value : scalar, optional
        The value to use for padding the shorter vectors (default is np.nan).
    axis : int, optional
        The axis along which to concatenate (0 for rows, 1 for columns, default is 1).
    order : str, optional
        Accepted for backward compatibility; it has no effect on the result.

    Returns:
    np.ndarray
        A 2D array with the input vectors stacked as rows (axis=0) or as
        columns (axis=1), padded with fill_value where necessary. The dtype
        is the common type of fill_value and all inputs, so e.g. an integer
        fill_value does not truncate floating point data.

    Raises:
    ValueError
        If axis is not 0 or 1, if an input is not 1D or 2D, or if there is
        nothing to concatenate.
    """
    if axis not in (0, 1):
        raise ValueError("axis must be 0 or 1")

    # Split every input into 1D vectors.
    vectors = []
    for arg in args:
        arr = np.asarray(arg)
        if arr.ndim == 1:
            vectors.append(arr)
        elif arr.ndim == 2:
            vectors.extend(arr if axis == 0 else arr.T)
        else:
            raise ValueError("Input arrays must be 1D or 2D")
    if not vectors:
        raise ValueError("padcat() requires at least one input array")

    # Choose a dtype that can hold both the padding and the data; fall back
    # to object when the types cannot be promoted to a common one.
    try:
        dtype = np.result_type(
            np.asarray(fill_value).dtype, *(vec.dtype for vec in vectors)
        )
    except TypeError:
        dtype = object

    max_len = max(vec.size for vec in vectors)
    if axis == 0:
        # One vector per row
        result = np.full((len(vectors), max_len), fill_value, dtype=dtype)
        for i, vec in enumerate(vectors):
            result[i, : vec.size] = vec
    else:
        # One vector per column
        result = np.full((max_len, len(vectors)), fill_value, dtype=dtype)
        for i, vec in enumerate(vectors):
            result[: vec.size, i] = vec
    return result
|
1194
|
+
|
1195
|
+
|
1196
|
+
# # Example usage:
|
1197
|
+
# a = [1, np.nan]
|
1198
|
+
# b = [1, 3, 4, np.nan, 2, np.nan]
|
1199
|
+
# c = [1, 2, 3, 4, 5, 6, 7, 8, 10]
|
1200
|
+
# d = padcat(a, b)
|
1201
|
+
# result1 = padcat(d, c)
|
1202
|
+
# result2 = padcat(a, b, c)
|
1203
|
+
# print("Result of padcat(d, c):\n", result1)
|
1204
|
+
# print("Result of padcat(a, b, c):\n", result2)
|