py2ls 0.1.4.9__py3-none-any.whl → 0.1.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/stats.py ADDED
@@ -0,0 +1,810 @@
1
+ from scipy.ndimage import convolve1d
2
+ from scipy.signal import savgol_filter
3
+ import pingouin as pg
4
+ from scipy import stats
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ import matplotlib.pyplot as plt
9
+
10
+
11
+ # ==============FuncStars(ax,x1=1,x2=2, yscale=0.9, pval=0.01)====================================================
12
+ # Usage:
13
+ # FuncStars(ax, x1=2, x2=3, yscale=0.99, pval=0.02)
14
+ # =============================================================================
15
+
16
+ # FuncStars --v 0.1.1
17
def FuncStars(ax,
              pval=None,
              Ylim=None,
              Xlim=None,
              symbol='*',
              yscale=0.95,
              x1=0,
              x2=1,
              alpha=0.05,
              fontsize=14,
              fontsize_note=6,
              rotation=0,
              fontname='Arial',
              values_below=None,
              linego=True,
              linestyle='-',
              linecolor='k',
              linewidth=.8,
              nsshow='off',
              symbolcolor='k',
              tailindicator=(0.06, 0.06),
              report=None,
              report_scale=-0.1,
              report_loc=None):
    """Draw a significance annotation (stars plus bracket) between x1 and x2.

    Parameters
    ----------
    ax : axes-like or None
        Object providing ``get_xlim``, ``get_ylim``, ``text`` and ``plot``.
        ``None`` falls back to ``plt.gca()``.
    pval : float or None
        P-value to annotate. ``None`` or NaN draws no significance label.
    Ylim, Xlim : sequence of two floats, optional
        Axis limits used for placement; read from ``ax`` when omitted.
    symbol : str
        Marker repeated 1, 2 or 3 times for p <= alpha, p <= 0.01 and
        p <= 0.001 respectively.
    yscale : float
        Vertical position of the label as a fraction of the y-range.
    x1, x2 : float
        X positions of the two groups being compared.
    alpha : float
        Significance threshold.
    fontsize, fontsize_note, rotation, fontname
        Text styling; ``rotation`` is applied to the non-significant label only.
    values_below : str, optional
        Extra text drawn at ``y = -0.1 * y_loc``.
    linego : bool
        Whether to draw the bracket (one horizontal bar and two tails).
    linestyle, linecolor, linewidth
        Bracket styling.
    nsshow : str
        ``'on'`` prints ``p=<value>`` (or ``ns`` for p >= 0.9) when the
        result is not significant.
    symbolcolor : str
        Colour of the star symbols.
    tailindicator : sequence of two floats
        Tail lengths (left, right) as fractions of the y-range.
    report : str, optional
        Free-text note drawn at ``report_loc``.
    report_scale : float
        Offset of the note from the bottom of the y-range, as a fraction of
        the y-range. Positive values are flipped to negative.
    report_loc : float, optional
        Explicit y position of the note; overrides ``report_scale``.
    """
    if ax is None:
        ax = plt.gca()
    # Read limits from the axes we draw on, not from whatever is "current".
    if Ylim is None:
        Ylim = ax.get_ylim()
    if Xlim is None:
        Xlim = ax.get_xlim()

    # Scalar spans keep every coordinate a plain float instead of the
    # 1-element arrays that np.diff would return.
    y_min = float(np.min(Ylim))
    y_span = float(np.max(Ylim)) - y_min
    x_span = float(np.max(Xlim)) - float(np.min(Xlim))

    # The sign must be normalised BEFORE it is used to place the note.
    if report_scale > 0:
        report_scale = -abs(report_scale)
    if report_loc is None and report is not None:
        report_loc = y_min + report_scale * y_span

    yscale = float(yscale)
    y_loc = y_min + yscale * y_span
    xcenter = np.mean([x1, x2])

    # ns / *
    if pval is not None and not np.isnan(pval):
        if pval > alpha:
            if nsshow == 'on':
                ns_str = f'p={round(pval, 3)}' if pval < 0.9 else 'ns'
                color = 'm' if pval < 0.1 else 'k'
                ax.text(xcenter, y_loc, ns_str,
                        ha='center', va='bottom',
                        fontsize=fontsize - 6 if fontsize > 6 else fontsize,
                        fontname=fontname, color=color, rotation=rotation)
        elif pval >= 0:
            # p == 0 (underflow) counts as the strongest level.
            if pval > 0.01:
                n_symbols = 1
            elif pval > 0.001:
                n_symbols = 2
            else:
                n_symbols = 3
            ax.text(xcenter, y_loc, symbol * n_symbols,
                    ha='center', va='center_baseline',
                    fontsize=fontsize, fontname=fontname, color=symbolcolor)

    # lines indicators
    if linego:
        x_left = x1 + x_span * 0.01
        x_right = x2 - x_span * 0.01
        if yscale < 0.99:
            y_ref = y_loc
            y_bar = y_ref - y_span * .03
        else:
            # Labels at the very top: pin the bracket at 95 % of the range.
            y_ref = y_min + 0.95 * y_span
            y_bar = y_ref - y_span * 0.002
        line_style = dict(linestyle=linestyle, color=linecolor,
                          linewidth=linewidth)
        # horizontal line
        ax.plot([x_left, x_right], [y_bar, y_bar], **line_style)
        # vertical lines (tails)
        ax.plot([x_left, x_left],
                [y_ref - y_span * tailindicator[0], y_bar], **line_style)
        ax.plot([x_right, x_right],
                [y_ref - y_span * tailindicator[1], y_bar], **line_style)

    if values_below is not None:
        # NOTE(review): y = -0.1 * y_loc is kept from the original; confirm
        # this placement is intended for axes that do not start at zero.
        ax.text(xcenter, y_loc * (-0.1), values_below,
                ha='center', va='bottom',
                fontsize=fontsize_note, fontname=fontname, color='k')

    # report / comments
    if report is not None:
        ax.text(xcenter, report_loc, report,
                ha='left', va='bottom',
                fontsize=fontsize_note, fontname=fontname, color='.7')
125
+
126
+
127
+
128
+
129
def FuncCmpt(X1, X2, pmc='auto', pair='unpaired'):
    """Compare two samples and return ``(p, output)``.

    Parameters
    ----------
    X1, X2 : array-like of numbers
        The two samples. NaN entries are dropped (pairwise for paired
        designs, per sample otherwise) before any test is run.
    pmc : str
        ``'pmc'``/``'parametric'`` forces a t-test,
        ``'npmc'``/``'nonparametric'``/``'non-parametric'`` forces a
        rank test, anything else means ``'auto'``: a t-test when both
        samples pass Shapiro-Wilk (p > 0.05), a rank test otherwise.
    pair : str
        Contains ``'pa'`` but not ``'np'`` -> paired; everything else
        (including unrecognised values) -> unpaired.

    Returns
    -------
    p : float
        Two-sided p-value of the selected test.
    output : dict
        Keys ``'stat'``, ``'pval'``, ``'method'`` and ``'APA'``.

    Raises
    ------
    ValueError
        If a paired comparison is requested for samples of unequal length.
    """

    def corr_pmc(pmc):
        # Normalise the user's spelling to one of three settings.
        key = str(pmc).lower()
        if key in {'pmc', 'parametric'}:
            return 'parametric'
        if key in {'npmc', 'nonparametric', 'non-parametric'}:
            return 'non-parametric'
        return 'auto'

    def corr_pair(pair):
        # 'unpaired' contains both 'pa' and 'np', hence the second check.
        key = str(pair).lower()
        if 'pa' in key and 'np' not in key:
            return 'paired'
        return 'unpaired'

    def check_normality(data):
        _, pval_shapiro = stats.shapiro(data)
        Normality = bool(pval_shapiro > 0.05)
        print(f'\n normally distributed\n') if Normality else print(
            f'\n NOT normally distributed\n')
        return Normality

    def welch_df(a, b):
        # Welch-Satterthwaite approximation of the degrees of freedom.
        va = np.var(a, ddof=1) / a.size
        vb = np.var(b, ddof=1) / b.size
        denom = va ** 2 / (a.size - 1) + vb ** 2 / (b.size - 1)
        return float((va + vb) ** 2 / denom) if denom > 0 else float('nan')

    cfg_pmc = corr_pmc(pmc)
    cfg_pair = corr_pair(pair)

    x1 = np.asarray(X1, dtype=float).ravel()
    x2 = np.asarray(X2, dtype=float).ravel()
    if cfg_pair == 'paired':
        if x1.shape != x2.shape:
            raise ValueError('paired comparison needs samples of equal length')
        # Drop a pair as soon as either member is missing.
        complete = ~(np.isnan(x1) | np.isnan(x2))
        x1, x2 = x1[complete], x2[complete]
    else:
        x1 = x1[~np.isnan(x1)]
        x2 = x2[~np.isnan(x2)]
    nX1, nX2 = int(x1.size), int(x2.size)

    # Both checks always run so that both messages are printed.
    Normality1 = check_normality(x1)
    Normality2 = check_normality(x2)
    Normality = Normality1 and Normality2

    use_parametric = cfg_pmc == 'parametric' or (
        cfg_pmc == 'auto' and Normality)

    if use_parametric and cfg_pair == 'unpaired':
        # Levene's test decides between the pooled and the Welch variant.
        _, pval_lev = stats.levene(x1, x2, center='median')
        equal_var = bool(pval_lev > 0.05) and nX1 == nX2 and Normality
        stat_value, pval = stats.ttest_ind(
            x1, x2, axis=0, equal_var=equal_var, alternative='two-sided')
        if equal_var:
            notes_stat = 'unpaired t test'
            dof = nX1 + nX2 - 2
        else:
            notes_stat = 'Welchs t-test'
            dof = round(welch_df(x1, x2), 2)
        notes_APA = f't({dof})={round(float(stat_value), 5)},p={round(float(pval), 5)}'
    elif use_parametric:
        stat_value, pval = stats.ttest_rel(
            x1, x2, axis=0, alternative='two-sided')
        notes_stat = 'paired t test'
        notes_APA = f't({nX1 - 1})={round(float(stat_value), 5)},p={round(float(pval), 5)}'
    elif cfg_pair == 'unpaired':
        stat_value, pval = stats.mannwhitneyu(x1, x2, method='exact')
        notes_stat = 'Mann-Whitney U'
        if nX1 == nX2:
            notes_APA = f'U(n={nX1})={round(float(stat_value), 5)},p={round(float(pval), 5)}'
        else:
            notes_APA = f'U(n1={nX1},n2={nX2})={round(float(stat_value), 5)},p={round(float(pval), 5)}'
    else:
        stat_value, pval = stats.wilcoxon(x1, x2, method='exact')
        notes_stat = 'Wilcoxon signed-rank'
        # NOTE(review): the label 'Z' is kept from the original; the value
        # is whatever statistic stats.wilcoxon returns - confirm the label.
        notes_APA = f'Z(n={nX1})={round(float(stat_value), 5)},p={round(float(pval), 5)}'

    stat_value = float(stat_value)
    pval = float(pval)
    output = {
        'stat': stat_value,
        'pval': pval,
        'method': notes_stat,
        'APA': notes_APA,
    }
    print(f"{output['method']}\n {notes_APA}\n\n")
    return pval, output
243
+
244
+ # ======compare 2 group test===================================================
245
+ # # Example
246
+ # X1 = [19, 22, 16, 29, 24]
247
+ # X2 = [20, 11, 17, 12, 22]
248
+
249
+ # p, res= FuncCmpt(X1, X2, pmc='pmc', pair='unparrr')
250
+
251
+ # =============================================================================
252
+
253
+ # =============================================================================
254
+ # # method = ['anova', # 'One-way and N-way ANOVA',
255
+ # # 'rm_anova', # 'One-way and two-way repeated measures ANOVA',
256
+ # # 'mixed_anova', # 'Two way mixed ANOVA',
257
+ # # 'welch_anova', # 'One-way Welch ANOVA',
258
+ # # 'kruskal', # 'Non-parametric one-way ANOVA'
259
+ # # 'friedman', # Non-parametric one-way repeated measures ANOVA
260
+ # # ]
261
+ # =============================================================================
262
+
263
+
264
+ # =============================================================================
265
+ # # method = ['anova', # 'One-way and N-way ANOVA',
266
+ # # 'rm_anova', # 'One-way and two-way repeated measures ANOVA',
267
+ # # 'mixed_anova', # 'Two way mixed ANOVA',
268
+ # # 'welch_anova', # 'One-way Welch ANOVA',
269
+ # # 'kruskal', # 'Non-parametric one-way ANOVA'
270
+ # # 'friedman', # Non-parametric one-way repeated measures ANOVA
271
+ # # ]
272
+ # =============================================================================
273
def df_wide_long(df):
    """Classify a table as ``"Wide"`` or ``"Long"`` from its shape.

    Returns ``"Wide"`` when there are more columns than rows and
    ``"Long"`` otherwise. A square table is reported as ``"Long"`` so
    that callers always receive a string they can test membership on.
    """
    rows, columns = df.shape
    return "Wide" if columns > rows else "Long"
279
+
280
def FuncMultiCmpt(pmc='pmc', pair='unpair', data=None, dv=None, factor=None,
                  ss_type=2, detailed=True, effsize='np2',
                  correction='auto', between=None, within=None,
                  subject=None, group=None
                  ):
    """Run an omnibus multi-group test and, when significant, post-hoc tests.

    Parameters
    ----------
    pmc : str
        ``'pmc'``/``'parametric'``, ``'npmc'``/``'upmc'``/``'nonparametric'``/
        ``'non-parametric'``, anything else means ``'auto'``. In ``'auto'``
        mode an unpaired design uses ANOVA (normal, equal variance), Welch
        ANOVA (normal, unequal variance) or Kruskal-Wallis (not normal);
        paired and mixed designs use the parametric test.
    pair : str
        Contains ``'pa'`` but not ``'np'`` -> paired; contains ``'np'`` ->
        unpaired; contains ``'mix'`` or ``'both'`` -> mixed; anything else is
        treated as unpaired.
    data : pandas.DataFrame
        Table holding the columns named by the other arguments.
    dv : str
        Dependent-variable column.
    factor : str or list of str
        Grouping column(s) for unpaired and paired designs.
    ss_type : int
        Sum-of-squares type forwarded to ``pg.anova``.
    detailed, effsize, correction
        Accepted for interface compatibility; the calls below use the fixed
        values the original implementation used.
    between, within : str
        Columns for the mixed design.
    subject : str
        Subject identifier column (paired and mixed designs).
    group : str, optional
        Column used for the per-group normality and variance checks.
        Defaults to ``factor`` (its first entry when ``factor`` is a list).

    Returns
    -------
    dict
        Keys ``'res_posthoc'`` (DataFrame or ``None``), ``'stat'``, ``'APA'``,
        ``'pval'`` and ``'res_tab'``.

    Raises
    ------
    ValueError
        If a non-parametric test is requested for a mixed design.
    """

    def corr_pair(pair):
        # 'unpaired' contains both 'pa' and 'np', hence the order of checks.
        key = str(pair).lower()
        if 'pa' in key and 'np' not in key:
            return 'paired'
        if 'np' in key:
            return 'unpaired'
        if 'mix' in key or 'both' in key:
            return 'mix'
        return 'unpaired'

    def check_normality(data):
        _, pval_shapiro = stats.shapiro(data)
        Normality = bool(pval_shapiro > 0.05)
        print(f'\n normally distributed\n') if Normality else print(
            f'\n NOT normally distributed\n')
        return Normality

    def corr_pmc(pmc):
        key = str(pmc).lower()
        if key in {'pmc', 'parametric'}:
            return 'parametric'
        if key in {'upmc', 'npmc', 'nonparametric', 'non-parametric'}:
            return 'non-parametric'
        return 'auto'

    def fmt_f(source, df_num, df_den, f_val, p_val):
        # int() keeps numpy scalar reprs out of the printed tuple.
        dofs = (int(round(df_num)), int(round(df_den)))
        return f'{source}:F{dofs}={round(f_val, 5)},p={round(p_val, 5)}'

    def extract_apa(res_tab):
        # One single-item list per table row, so the result can be stored
        # as an 'APA' column. Access is positional because some pingouin
        # tables are indexed by label.
        notes_APA = []
        n_rows = res_tab.shape[0]
        if "ddof1" in res_tab:
            for irow in range(n_rows):
                row = res_tab.iloc[irow]
                notes_APA.append([fmt_f(row['Source'], row['ddof1'],
                                        row['ddof2'], row['F'], row['p-unc'])])
        elif "DF" in res_tab:
            # The last row is the residual/error term: it supplies the
            # denominator df and has no F of its own.
            df_resid = res_tab['DF'].iloc[-1]
            for irow in range(n_rows - 1):
                row = res_tab.iloc[irow]
                notes_APA.append([fmt_f(row['Source'], row['DF'],
                                        df_resid, row['F'], row['p-unc'])])
            notes_APA.append(['NaN'])
        elif "DF1" in res_tab:  # in 'mix' case
            for irow in range(n_rows):
                row = res_tab.iloc[irow]
                notes_APA.append([fmt_f(row['Source'], row['DF1'],
                                        row['DF2'], row['F'], row['p-unc'])])
        return notes_APA

    def anovatable(res_tab):
        # Add effect-size style columns; the residual term is the last row.
        if 'df' in res_tab:  # statsmodels
            res_tab['mean_sq'] = res_tab['sum_sq'] / res_tab['df']
            ms_resid = res_tab['mean_sq'].iloc[-1]
            ss_total = sum(res_tab['sum_sq'])
            res_tab['est_sq'] = res_tab[:-1]['sum_sq'] / ss_total
            res_tab['omega_sq'] = (
                res_tab[:-1]['sum_sq'] - res_tab[:-1]['df'] * ms_resid
            ) / (ss_total + ms_resid)
        elif 'DF' in res_tab:
            res_tab['MS'] = res_tab['SS'] / res_tab['DF']
            ms_resid = res_tab['MS'].iloc[-1]
            ss_total = sum(res_tab['SS'])
            res_tab['est_sq'] = res_tab[:-1]['SS'] / ss_total
            res_tab['omega_sq'] = (
                res_tab[:-1]['SS'] - res_tab[:-1]['DF'] * ms_resid
            ) / (ss_total + ms_resid)
        if 'p-unc' in res_tab:
            if 'np2' in res_tab:
                res_tab['est_sq'] = res_tab['np2']
            res_tab['PR(>F)'] = res_tab['p-unc']
        return res_tab

    def run_anova(data, dv, factor, ss_type=2, detailed=True, effsize='np2'):
        # One-way / N-way ANOVA via pingouin, plus the extra columns above.
        res_tab = pg.anova(dv=dv, between=factor, data=data,
                           detailed=detailed, ss_type=ss_type, effsize=effsize)
        return anovatable(res_tab)

    def run_rmanova(data, dv, factor, subject, correction='auto',
                    detailed=True, effsize='ng2'):
        # Repeated-measures ANOVA using a long-format dataset.
        return pg.rm_anova(data=data, dv=dv, within=factor, subject=subject,
                           correction=correction, detailed=detailed,
                           effsize=effsize)

    def run_welchanova(data, dv, factor):
        # Welch ANOVA for groups with unequal variances.
        res_tab = pg.welch_anova(data=data, dv=dv, between=factor)
        return anovatable(res_tab)

    def run_mixedanova(data, dv, between, within, subject,
                       correction='auto', effsize='np2'):
        # Mixed ANOVA; data are expected in long format.
        res_tab = pg.mixed_anova(data=data, dv=dv, within=within,
                                 subject=subject, between=between,
                                 correction=correction, effsize=effsize)
        return anovatable(res_tab)

    def run_friedman(data, dv, factor, subject, method='chisq'):
        # Friedman test (rank-based one-way repeated measures).
        # A wide table is melted to long format first.
        if df_wide_long(data) == "Wide":
            df_long = data.melt(ignore_index=False).reset_index()
            res_tab = pg.friedman(data=df_long, dv='value',
                                  within="variable", subject="index",
                                  method=method)
        else:
            res_tab = pg.friedman(data, dv=dv, within=factor,
                                  subject=subject, method=method)
        return anovatable(res_tab)

    def run_kruskal(data, dv, factor):
        # Kruskal-Wallis H-test for independent samples.
        res_tab = pg.kruskal(data=data, dv=dv, between=factor)
        return anovatable(res_tab)

    def kruskal_apa(res_tab):
        return (f'H({res_tab["ddof1"].iloc[0]},n={data.shape[0]})'
                f'={round(res_tab["H"].iloc[0], 5)}'
                f',p={round(res_tab["p-unc"].iloc[0], 5)}')

    if group is None:
        group = factor
    if isinstance(group, (list, tuple)):
        # The assumption checks below need a single grouping column.
        group = group[0]
    n_way = len(factor) if isinstance(factor, (list, tuple)) else 1

    # Normality check (Shapiro-Wilk) for each group.
    norm_array = []
    for sub_group in data[group].unique():
        norm_array.append(check_normality(
            data.loc[data[group] == sub_group, dv]))
    norm_all = all(norm_array)

    # Homogeneity of variances (Levene); column 1 of the result is the
    # p-value.
    variance_all = pg.homoscedasticity(
        data, dv=dv, group=group, method='levene', alpha=0.05)
    res_levene = bool(variance_all.iloc[0, 1] > 0.05)

    notes_norm = 'normally' if norm_all else 'NOT-normally'
    notes_variance = 'equal' if res_levene else 'unequal'
    print(f'Data is {notes_norm} distributed, shows {notes_variance} variance')

    cfg_pmc = corr_pmc(pmc)
    cfg_pair = corr_pair(pair)
    output = {}

    # Omnibus test. used_parametric records which family actually ran so
    # that the post-hoc tests match it.
    used_parametric = True
    if cfg_pmc in ('parametric', 'auto'):
        if cfg_pair == 'unpaired':
            if cfg_pmc == 'auto' and not norm_all:
                res_tab = run_kruskal(data, dv, factor)
                notes_stat = f'Non-parametric Kruskal: {n_way} Way ANOVA'
                notes_APA = kruskal_apa(res_tab)
                used_parametric = False
            elif cfg_pmc == 'auto' and not res_levene:
                res_tab = run_welchanova(data, dv, factor)
                notes_stat = f'{n_way} Way Welch ANOVA'
                notes_APA = extract_apa(res_tab)
            else:
                res_tab = run_anova(data, dv, factor, ss_type=ss_type,
                                    detailed=True, effsize='np2')
                notes_stat = f'{n_way} Way ANOVA'
                notes_APA = extract_apa(res_tab)
        elif cfg_pair == 'paired':
            res_tab = run_rmanova(data, dv, factor, subject,
                                  correction='auto', detailed=True,
                                  effsize='ng2')
            notes_stat = f'{n_way} Way Repeated measures ANOVA'
            notes_APA = extract_apa(res_tab)
        else:  # 'mix'
            res_tab = run_mixedanova(data, dv, between, within, subject)
            notes_stat = 'Mixed ANOVA'
            notes_APA = extract_apa(res_tab)
    else:  # 'non-parametric'
        used_parametric = False
        if cfg_pair == 'unpaired':
            res_tab = run_kruskal(data, dv, factor)
            notes_stat = f'Non-parametric Kruskal: {n_way} Way ANOVA'
            notes_APA = kruskal_apa(res_tab)
        elif cfg_pair == 'paired':
            res_tab = run_friedman(data, dv, factor, subject, method='chisq')
            notes_stat = f'Non-parametric {n_way} Way Friedman repeated measures ANOVA'
            notes_APA = (f'X^2({res_tab["ddof1"].iloc[0]})'
                         f'={round(res_tab["Q"].iloc[0], 5)}'
                         f',p={round(res_tab["p-unc"].iloc[0], 5)}')
        else:
            raise ValueError(
                'no non-parametric test is available for a mixed design')

    # Post-hoc pairwise comparisons, only when the first effect is
    # significant. Route the design columns by the normalised pairing.
    if cfg_pair == 'unpaired':
        ph_between, ph_within, ph_subject = factor, None, None
    elif cfg_pair == 'paired':
        ph_between, ph_within, ph_subject = None, factor, subject
    else:
        ph_between, ph_within, ph_subject = between, within, subject

    if res_tab['p-unc'].iloc[0] <= .05:
        method_post_hoc = [
            "bonf",    # one-step correction
            "sidak",   # one-step correction
            "holm",    # step-down method using Bonferroni adjustments
            "fdr_bh",  # Benjamini/Hochberg (non-negative)
            "fdr_by",  # Benjamini/Yekutieli (negative)
        ]
        res_posthoc = pd.concat(
            [pg.pairwise_tests(data=data, dv=dv, between=ph_between,
                               within=ph_within, subject=ph_subject,
                               parametric=used_parametric, marginal=True,
                               alpha=0.05, alternative='two-sided',
                               padjust=met)
             for met in method_post_hoc],
            ignore_index=True)
    else:
        res_posthoc = None
    output['res_posthoc'] = res_posthoc

    # filling output
    # NOTE(review): this changes a process-wide pandas display option; kept
    # because callers may rely on it.
    pd.set_option('display.max_columns', None)
    output['stat'] = notes_stat
    output['APA'] = notes_APA
    output['pval'] = res_tab['p-unc']
    output['res_tab'] = res_tab
    if res_tab.shape[0] == len(notes_APA):
        output['res_tab']['APA'] = output['APA']  # note APA in the table
    return output
598
+
599
+
600
+ # =============================================================================
601
+ # # One-way ANOVA
602
+ # =============================================================================
603
+ # url = "http://stats191.stanford.edu/data/rehab.csv"
604
+ # rehab_table = pd.read_table(url, delimiter=",")
605
+ # rehab_table.to_csv("rehab.table")
606
+ # fig, ax = plt.subplots(figsize=(8, 6))
607
+ # fig = rehab_table.boxplot("Time", "Fitness", ax=ax, grid=False)
608
+ # # fig, ax = plt.subplots(figsize=(8, 6))
609
+ # # set_pub()
610
+ # # sns.boxenplot(x="Time",y="Fitness",data = rehab_table)
611
+
612
+ # out2 = FuncMultiCmpt(pmc='pmc', pair='unpair',
613
+ # data=rehab_table, dv='Time', factor='Fitness')
614
+ # # print(out2['res_tab'])
615
+ # # print(out2['APA'])
616
+ # out2['res_posthoc']
617
+ # out2['res_posthoc']['p-unc'][0]
618
+ # out2['res_posthoc']['p-adjust'][0]
619
+ # out2['res_posthoc']['p-corr'][0]
620
+
621
+
622
+ # =============================================================================
623
+ # # Interactions and ANOVA
624
+ # https://www.statsmodels.org/dev/examples/notebooks/generated/interactions_anova.html
625
+ # url = "http://stats191.stanford.edu/data/salary.table"
626
+ # fh = urlopen(url)
627
+ # df = pd.read_table(fh)
628
+ # out1 = FuncMultiCmpt(pmc='pmc', pair='unpaired', data=df,
629
+ # dv='S', factor=['X', 'E', 'M'], group='M')
630
+ # # # two-way anova
631
+ # # https://www.statology.org/two-way-anova-python/
632
+ # # =============================================================================
633
+ # # df = pd.DataFrame({'water': np.repeat(['daily', 'weekly'], 15),
634
+ # # 'sun': np.tile(np.repeat(['low', 'med', 'high'], 5), 2),
635
+ # # 'height': [6, 6, 6, 5, 6, 5, 5, 6, 4, 5,
636
+ # # 6, 6, 7, 8, 7, 3, 4, 4, 4, 5,
637
+ # # 4, 4, 4, 4, 4, 5, 6, 6, 7, 8]})
638
+ # # out1 = FuncMultiCmpt(pmc='pmc', pair='unpaired', data=df,
639
+ # # dv='height', factor=['water','sun'],group='water')
640
+
641
+
642
+ # =============================================================================
643
+ # # two way anova
644
+ # https://www.geeksforgeeks.org/how-to-perform-a-two-way-anova-in-python/
645
+ # =============================================================================
646
+ # df1=pd.DataFrame({'Fertilizer': np.repeat(['daily', 'weekly'], 15),
647
+ # 'Watering': np.repeat(['daily', 'weekly'], 15),
648
+ # 'height': [14, 16, 15, 15, 16, 13, 12, 11,
649
+ # 14, 15, 16, 16, 17, 18, 14, 13,
650
+ # 14, 14, 14, 15, 16, 16, 17, 18,
651
+ # 14, 13, 14, 14, 14, 15]})
652
+
653
+ # df1['subject'] = np.tile(range(0, 15), (1, 2)).T
654
+ # out1 = FuncMultiCmpt(pmc='pmc', pair='unpaired', data=df1,
655
+ # dv='height', factor=['Fertilizer','Watering'],group='Watering')
656
+ # # print(out1['stat'])
657
+ # # print(out1['res_tab'])
658
+
659
+ # =============================================================================
660
+ # # welch anova
661
+ # https://www.geeksforgeeks.org/how-to-perform-welchs-anova-in-python/
662
+ # =============================================================================
663
+ # df = pd.DataFrame({'score': [64, 66, 68, 75, 78, 94, 98, 79, 71, 80,
664
+ # 91, 92, 93, 90, 97, 94, 82, 88, 95, 96,
665
+ # 79, 78, 88, 94, 92, 85, 83, 85, 82, 81],
666
+ # 'group': np.repeat(['strat1', 'strat2', 'strat3'],repeats=10)})
667
+ # out1 = FuncMultiCmpt(pmc='auto',pair='unpaired',data=df, dv='score', factor='group', group='group')
668
+ # =============================================================================
669
+ # # two way anova
670
+ # https://www.statology.org/two-way-anova-python/
671
+ # =============================================================================
672
+ # df = pd.DataFrame({'water': np.repeat(['daily', 'weekly'], 15),
673
+ # 'sun': np.tile(np.repeat(['low', 'med', 'high'], 5), 2),
674
+ # 'height': [6, 6, 6, 5, 6, 5, 5, 6, 4, 5,
675
+ # 6, 6, 7, 8, 7, 3, 4, 4, 4, 5,
676
+ # 4, 4, 4, 4, 4, 5, 6, 6, 7, 8]})
677
+ # df['subject'] = np.tile(range(0, 15), (1, 2)).T
678
+ # out1 = FuncMultiCmpt(pmc='pmc', pair='unpaired', data=df,
679
+ # dv='height', factor=['water', 'sun'], subject='subject', group='water')
680
+ # # print(out1['stat'])
681
+ # # print(out1['res_tab'])
682
+
683
+ # =============================================================================
684
+ # # 3-way ANOVA
685
+ # =============================================================================
686
+ # df = pd.DataFrame({'program': np.repeat([1, 2], 20),
687
+ # 'gender': np.tile(np.repeat(['M', 'F'], 10), 2),
688
+ # 'division': np.tile(np.repeat([1, 2], 5), 4),
689
+ # 'height': [7, 7, 8, 8, 7, 6, 6, 5, 6, 5,
690
+ # 5, 5, 4, 5, 4, 3, 3, 4, 3, 3,
691
+ # 6, 6, 5, 4, 5, 4, 5, 4, 4, 3,
692
+ # 2, 2, 1, 4, 4, 2, 1, 1, 2, 1]})
693
+ # df['subject'] = np.tile(range(0, 20), (1, 2)).T
694
+ # out1 = FuncMultiCmpt(pmc='pmc', pair='unpaired', data=df,
695
+ # dv='height', factor=['gender', 'program', 'division'], subject='subject', group='program')
696
+ # # print(out1['stat'])
697
+ # # print(out1['res_tab'])
698
+
699
+ # =============================================================================
700
+ # # Repeated Measures ANOVA in Python
701
+ # =============================================================================
702
+ # df = pd.DataFrame({'patient': np.repeat([1, 2, 3, 4, 5], 4),
703
+ # 'drug': np.tile([1, 2, 3, 4], 5),
704
+ # 'response': [30, 28, 16, 34,
705
+ # 14, 18, 10, 22,
706
+ # 24, 20, 18, 30,
707
+ # 38, 34, 20, 44,
708
+ # 26, 28, 14, 30]})
709
+ # # df['subject'] = np.tile(range(0, 20), (1, 2)).T
710
+ # out1 = FuncMultiCmpt(pmc='pmc', pair='paired', data=df,
711
+ # dv='response', factor=['drug'], subject='patient', group='drug')
712
+ # print(out1['stat'])
713
+ # print(out1['res_tab'])
714
+ # print(out1['APA'])
715
+
716
+ # =============================================================================
717
+ # # Repeated measures ANOVA
718
+ # https://www.geeksforgeeks.org/how-to-perform-a-repeated-measures-anova-in-python/
719
+ # =============================================================================
720
+ # df = pd.DataFrame({'Cars': np.repeat([1, 2, 3, 4, 5], 4),
721
+ # 'Engine Oil': np.tile([1, 2, 3, 4], 5),
722
+ # 'Mileage': [36, 38, 30, 29,
723
+ # 34, 38, 30, 29,
724
+ # 34, 28, 38, 32,
725
+ # 38, 34, 20, 44,
726
+ # 26, 28, 34, 50]})
727
+ # out1 = FuncMultiCmpt(pmc='pmc', pair='paired', data=df,
728
+ # dv='Mileage', factor=['Engine Oil'], subject='Cars', group='Cars')
729
+ # =============================================================================
730
+ # # Two-way repeated measures ANOVA
731
+ # =============================================================================
732
+ # df = pd.read_csv(
733
+ # "https://reneshbedre.github.io/assets/posts/anova/plants_leaves_two_within.csv")
734
+ # df
735
+ # # df['subject'] = np.tile(range(0, 20), (1, 2)).T
736
+ # out1 = FuncMultiCmpt(pmc='pmc', pair='paired', data=df,
737
+ # dv='num_leaves', factor=['year', 'time'], subject='plants', group='year')
738
+ # print(out1['stat'])
739
+ # print(out1['res_tab'])
740
+ # print(out1['APA'])
741
+
742
+ # =============================================================================
743
+ # # Mixed ANOVA (between: experiment, within: region)
744
+ # =============================================================================
745
+ # df = pd.read_csv('/Users/macjianfeng/Desktop/test.csv')
746
+ # df.head()
747
+ # df.loc[df['animal'].str.contains('Sleep'), 'experiment'] = 'sleep'
748
+ # df.loc[df['animal'].str.contains('Wake'), 'experiment'] = 'wake'
749
+ # df.loc[df['variable'].str.contains('hypo'), 'region'] = 'hypo'
750
+ # df.loc[df['variable'].str.contains('cort'), 'region'] = 'cort'
751
+ # df
752
+ # for i in range(4):
753
+ # match i:
754
+ # case 0:
755
+ # prot_name = 'A1'
756
+ # case 1:
757
+ # prot_name = 'A2'
758
+ # case 2:
759
+ # prot_name = '845'
760
+ # case 3:
761
+ # prot_name = '831'
762
+ # df_tmp = df[df["variable"].str.contains(prot_name)]
763
+ # df_tmp['protein'] = prot_name
764
+ # df_tmp = df_tmp.reset_index()
765
+ # print(df_tmp)
766
+
767
+ # out1 = FuncMultiCmpt(pmc='pmc', pair='mix', data=df_tmp,
768
+ # dv='value', between='experiment', within='region', subject='animal', group='experiment')
769
+ # print(out1['stat'])
770
+ # print(out1['res_tab'])
771
+ # # =============================================================================
772
+ # One-way ANOVA
773
+ # df1 = pd.read_csv('/Users/macjianfeng/Desktop/Book2.csv')
774
+ # df2 = df1.melt()
775
+ # out1 = FuncMultiCmpt(pmc='npmc', pair='unpaired', data=df2,
776
+ # dv='libido', factor=['brand x', 'brand y', 'brand z'], subject='participant')
777
+ # print(out1['stat'])
778
+ # print(out1['res_tab'])
779
+ # =============================================================================
780
+
781
+
782
+ # =============================================================================
783
+ # # #One-way ANOVA new example: https://www.pythonfordatascience.org/anova-python/
784
+ # =============================================================================
785
+ # df1 = pd.read_csv(
786
+ # "https://raw.githubusercontent.com/researchpy/Data-sets/master/difficile.csv")
787
+ # df1.drop('person', axis=1, inplace=True)
788
+ # # Recoding value from numeric to string
789
+ # df1['dose'].replace({1: 'placebo', 2: 'low', 3: 'high'}, inplace=True)
790
+ # df1.head(10)
791
+
792
+ # out3= FuncMultiCmpt(pmc='pmc', data=df1, dv='libido', factor='dose')
793
+ # # print(out3['res_tab'])
794
+ # # # print(out3['res_posthoc'])
795
+ # # print(out3['APA'])
796
+
797
+ # =============================================================================
798
+ # https://lifewithdata.com/2023/06/08/how-to-perform-a-two-way-anova-in-python/
799
+ # =============================================================================
800
+ # data = {
801
+ # 'Diet': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C'],
802
+ # 'Workout': ['Low', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium', 'High'],
803
+ # 'WeightLoss': [3, 4, 5, 3.2, 5, 6, 5.2, 6, 5.5, 4, 5.5, 6.2]
804
+ # }
805
+ # df = pd.DataFrame(data)
806
+ # out4= FuncMultiCmpt(pmc='pmc', pair='unpaired',data=df, dv='WeightLoss', factor=['Diet','Workout'],group='Diet')
807
+
808
+ # =============================================================================
809
+ # # convert list to string
810
+ # =============================================================================