AutoStatLib 0.2.1__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of AutoStatLib might be problematic. Click here for more details.

@@ -1,397 +1,10 @@
1
- import numpy as np
2
- import pandas as pd
3
- from statsmodels.stats.diagnostic import lilliefors
4
- from statsmodels.stats.anova import AnovaRM
5
- from scipy.stats import ttest_rel, ttest_ind, ttest_1samp, wilcoxon, mannwhitneyu, f_oneway, kruskal, friedmanchisquare, shapiro, anderson, normaltest
1
+ from AutoStatLib.statistical_tests import StatisticalTests
2
+ from AutoStatLib.normality_tests import NormalityTests
3
+ from AutoStatLib.helpers import Helpers
4
+ from AutoStatLib.text_formatting import TextFormatting
6
5
 
7
6
 
8
- class __StatisticalTests():
9
- '''
10
- Statistical tests mixin
11
- '''
12
-
13
- def anova_1w_ordinary(self):
14
- stat, p_value = f_oneway(*self.data)
15
- self.tails = 2
16
- # if self.tails == 1 and p_value > 0.5:
17
- # p_value /= 2
18
- # if self.tails == 1:
19
- # p_value /= 2
20
- self.test_name = 'Ordinary One-Way ANOVA'
21
- self.test_id = 'anova_1w_ordinary'
22
- self.paired = False
23
- self.test_stat = stat
24
- self.p_value = p_value
25
-
26
- def anova_1w_rm(self):
27
- """
28
- Perform repeated measures one-way ANOVA test.
29
-
30
- Parameters:
31
- data: list of lists, where each sublist represents repeated measures for a subject
32
- """
33
-
34
- df = self.matrix_to_dataframe(self.data)
35
- res = AnovaRM(df, 'Value', 'Row', within=['Col']).fit()
36
- f_stat = res.anova_table['F Value'][0]
37
- p_value = res.anova_table['Pr > F'][0]
38
-
39
- self.tails = 2
40
- self.test_name = 'Repeated Measures One-Way ANOVA'
41
- self.test_id = 'anova_1w_rm'
42
- self.paired = True
43
- self.test_stat = f_stat
44
- self.p_value = p_value
45
-
46
- def friedman_test(self):
47
- stat, p_value = friedmanchisquare(*self.data)
48
- self.tails = 2
49
- self.test_name = 'Friedman test'
50
- self.test_id = 'friedman'
51
- self.paired = True
52
- self.test_stat = stat
53
- self.p_value = p_value
54
-
55
- def kruskal_wallis_test(self):
56
- stat, p_value = kruskal(*self.data)
57
- self.test_name = 'Kruskal-Wallis test'
58
- self.test_id = 'kruskal_wallis'
59
- self.paired = False
60
- self.test_stat = stat
61
- self.p_value = p_value
62
-
63
- def mann_whitney_u_test(self):
64
- stat, p_value = mannwhitneyu(
65
- self.data[0], self.data[1], alternative='two-sided')
66
- if self.tails == 1:
67
- p_value /= 2
68
- # alternative method of one-tailed calculation
69
- # gives the same result:
70
- # stat, p_value = mannwhitneyu(
71
- # self.data[0], self.data[1], alternative='two-sided' if self.tails == 2 else 'less')
72
- # if self.tails == 1 and p_value > 0.5:
73
- # p_value = 1-p_value
74
-
75
- self.test_name = 'Mann-Whitney U test'
76
- self.test_id = 'mann_whitney'
77
- self.paired = False
78
- self.test_stat = stat
79
- self.p_value = p_value
80
-
81
- def t_test_independent(self):
82
- t_stat, t_p_value = ttest_ind(
83
- self.data[0], self.data[1])
84
- if self.tails == 1:
85
- t_p_value /= 2
86
- self.test_name = 't-test for independent samples'
87
- self.test_id = 't_test_independent'
88
- self.paired = False
89
- self.test_stat = t_stat
90
- self.p_value = t_p_value
91
-
92
- def t_test_paired(self):
93
- t_stat, t_p_value = ttest_rel(
94
- self.data[0], self.data[1])
95
- if self.tails == 1:
96
- t_p_value /= 2
97
- self.test_name = 't-test for paired samples'
98
- self.test_id = 't_test_paired'
99
- self.paired = True
100
- self.test_stat = t_stat
101
- self.p_value = t_p_value
102
-
103
- def t_test_single_sample(self):
104
- if self.popmean == None:
105
- self.popmean = 0
106
- self.AddWarning('no_pop_mean_set')
107
- t_stat, t_p_value = ttest_1samp(self.data[0], self.popmean)
108
- if self.tails == 1:
109
- t_p_value /= 2
110
- self.test_name = 'Single-sample t-test'
111
- self.test_id = 't_test_single_sample'
112
- self.paired = False
113
- self.test_stat = t_stat
114
- self.p_value = t_p_value
115
-
116
- def wilcoxon_single_sample(self):
117
- if self.popmean == None:
118
- self.popmean = 0
119
- self.AddWarning('no_pop_mean_set')
120
- data = [i - self.popmean for i in self.data[0]]
121
- w_stat, p_value = wilcoxon(data)
122
- if self.tails == 1:
123
- p_value /= 2
124
- self.test_name = 'Wilcoxon signed-rank test for single sample'
125
- self.test_id = 'wilcoxon_single_sample'
126
- self.paired = False
127
- self.test_stat = w_stat
128
- self.p_value = p_value
129
-
130
- def wilcoxon(self):
131
- stat, p_value = wilcoxon(self.data[0], self.data[1])
132
- if self.tails == 1:
133
- p_value /= 2
134
- self.test_name = 'Wilcoxon signed-rank test'
135
- self.test_id = 'wilcoxon'
136
- self.paired = True
137
- self.test_stat = stat
138
- self.p_value = p_value
139
-
140
-
141
- class __NormalityTests():
142
- '''
143
- Normality tests mixin
144
-
145
- see the article about minimal sample size for tests:
146
- Power comparisons of Shapiro-Wilk, Kolmogorov-Smirnov,
147
- Lilliefors and Anderson-Darling tests, Nornadiah Mohd Razali1, Yap Bee Wah1
148
- '''
149
-
150
- def check_normality(self, data):
151
- sw = None
152
- lf = None
153
- ad = None
154
- ap = None
155
- n = len(data)
156
-
157
- # Shapiro-Wilk test
158
- sw_stat, sw_p_value = shapiro(data)
159
- if sw_p_value > 0.05:
160
- sw = True
161
- else:
162
- sw = False
163
-
164
- # Lilliefors test
165
- lf_stat, lf_p_value = lilliefors(
166
- data, dist='norm')
167
- if lf_p_value > 0.05:
168
- lf = True
169
- else:
170
- lf = False
171
-
172
- # Anderson-Darling test
173
- if n >= 20:
174
- ad_stat, ad_p_value = self.anderson_get_p(
175
- data, dist='norm')
176
- if ad_p_value > 0.05:
177
- ad = True
178
- else:
179
- ad = False
180
-
181
- # D'Agostino-Pearson test
182
- # test result is skewed if n<20
183
- if n >= 20:
184
- ap_stat, ap_p_value = normaltest(data)
185
- if ap_p_value > 0.05:
186
- ap = True
187
- else:
188
- ap = False
189
-
190
- # print(ap_p_value, ad_p_value, sw_p_value, lf_p_value)
191
-
192
- return (sw, lf, ad, ap)
193
-
194
- def anderson_get_p(self, data, dist='norm'):
195
- '''
196
- calculating p-value for Anderson-Darling test using the method described here:
197
- Computation of Probability Associated with Anderson-Darling Statistic
198
- Lorentz Jantschi and Sorana D. Bolboaca, 2018 - Mathematics
199
-
200
- '''
201
- e = 2.718281828459045
202
- n = len(data)
203
-
204
- ad, critical_values, significance_levels = anderson(
205
- data, dist=dist)
206
-
207
- # adjust ad_stat for small sample sizes:
208
- s = ad*(1 + 0.75/n + 2.25/(n**2))
209
-
210
- if s >= 0.6:
211
- p = e**(1.2937 - 5.709*s + 0.0186*s**2)
212
- elif s > 0.34:
213
- p = e**(0.9177 - 4.279*s - 1.38*s**2)
214
- elif s > 0.2:
215
- p = 1 - e**(-8.318 + 42.796*s - 59.938*s**2)
216
- elif s <= 0.2:
217
- p = 1 - e**(-13.436 + 101.14*s - 223.73*s**2)
218
- else:
219
- p = None
220
-
221
- return ad, p
222
-
223
-
224
- class __Helpers():
225
-
226
- def matrix_to_dataframe(self, matrix):
227
- data = []
228
- cols = []
229
- rows = []
230
-
231
- order_number = 1
232
- for i, row in enumerate(matrix):
233
- for j, value in enumerate(row):
234
- data.append(value)
235
- cols.append(i)
236
- rows.append(j)
237
- order_number += 1
238
-
239
- df = pd.DataFrame(
240
- {'Row': rows, 'Col': cols, 'Value': data})
241
- return df
242
-
243
- def create_results_dict(self) -> dict:
244
-
245
- self.stars_int = self.make_stars()
246
- self.stars_str = '*' * self.stars_int if self.stars_int else 'ns'
247
-
248
- return {
249
- 'p-value': self.make_p_value_printed(),
250
- 'Significance(p<0.05)': True if self.p_value.item() < 0.05 else False,
251
- 'Stars_Printed': self.stars_str,
252
- 'Test_Name': self.test_name,
253
- 'Groups_Compared': self.n_groups,
254
- 'Population_Mean': self.popmean if self.n_groups == 1 else 'N/A',
255
- 'Data_Normaly_Distributed': self.parametric,
256
- 'Parametric_Test_Applied': True if self.test_id in self.test_ids_parametric else False,
257
- 'Paired_Test_Applied': self.paired,
258
- 'Tails': self.tails,
259
- 'p-value_exact': self.p_value.item(),
260
- 'Stars': self.stars_int,
261
- # 'Stat_Value': self.test_stat.item(),
262
- 'Warnings': self.warnings,
263
- 'Groups_N': [len(self.data[i]) for i in range(len(self.data))],
264
- 'Groups_Median': [np.median(self.data[i]).item() for i in range(len(self.data))],
265
- 'Groups_Mean': [np.mean(self.data[i]).item() for i in range(len(self.data))],
266
- 'Groups_SD': [np.std(self.data[i]).item() for i in range(len(self.data))],
267
- 'Groups_SE': [np.std(self.data[i]).item() / np.sqrt(len(self.data)).item() for i in range(len(self.data))],
268
- # actually returns list of lists of numpy dtypes of float64, next make it return regular floats:
269
- 'Samples': self.data,
270
- }
271
-
272
- def log(self, *args, **kwargs):
273
- message = ' '.join(map(str, args))
274
- # print(message, **kwargs)
275
- self.summary += '\n' + message
276
-
277
- def AddWarning(self, warning_id):
278
- message = self.warning_ids_all[warning_id]
279
- self.log(message)
280
- self.warnings.append(message)
281
-
282
-
283
- class __TextFormatting():
284
- '''
285
- Text formatting mixin
286
- '''
287
-
288
- def autospace(self, elements_list, space, delimiter=' ') -> str:
289
- output = ''
290
- for i, element in enumerate(elements_list):
291
- if i == len(elements_list):
292
- output += element
293
- else:
294
- output += element + (space-len(element))*delimiter
295
- return output
296
-
297
- def print_groups(self, space=24, max_length=15):
298
- self.log('')
299
- # Get the number of groups (rows) and the maximum length of rows
300
- data = self.data
301
- num_groups = len(data)
302
- group_longest = max(len(row) for row in data)
303
-
304
- # Print the header
305
- header = [f'Group {i+1}' for i in range(num_groups)]
306
- line = [''*7]
307
- self.log(self.autospace(header, space))
308
- self.log(self.autospace(line, space))
309
-
310
- # Print each column with a placeholder if longer than max_length
311
- for i in range(group_longest):
312
- row_values = []
313
- all_values_empty = True
314
- for row in data:
315
- if len(row) > max_length:
316
- if i < max_length:
317
- row_values.append(str(row[i]))
318
- all_values_empty = False
319
- elif i == max_length:
320
- row_values.append(f'[{len(row) - max_length} more]')
321
- all_values_empty = False
322
- else:
323
- continue
324
- else:
325
- if i < len(row):
326
- row_values.append(str(row[i]))
327
- all_values_empty = False
328
- else:
329
- row_values.append('')
330
- if all_values_empty:
331
- break
332
- self.log(self.autospace(row_values, space))
333
-
334
- def make_stars(self) -> int:
335
- p = self.p_value.item()
336
- if p is not None:
337
- if p < 0.0001:
338
- return 4
339
- if p < 0.001:
340
- return 3
341
- elif p < 0.01:
342
- return 2
343
- elif p < 0.05:
344
- return 1
345
- else:
346
- return 0
347
- return 0
348
-
349
- def make_p_value_printed(self) -> str:
350
- p = self.p_value.item()
351
- if p is not None:
352
- if p > 0.99:
353
- return 'p>0.99'
354
- elif p >= 0.01:
355
- return f'p={p:.2g}'
356
- elif p >= 0.001:
357
- return f'p={p:.2g}'
358
- elif p >= 0.0001:
359
- return f'p={p:.1g}'
360
- elif p < 0.0001:
361
- return 'p<0.0001'
362
- else:
363
- return 'N/A'
364
- return 'N/A'
365
-
366
- def print_results(self):
367
- self.log('\n\nResults: \n')
368
- for i in self.results:
369
- shift = 27 - len(i)
370
- if i == 'Warnings':
371
- self.log(i, ':', ' ' * shift, len(self.results[i]))
372
- elif i == 'Samples':
373
- pass
374
- else:
375
- self.log(i, ':', ' ' * shift, self.results[i])
376
-
377
-
378
- class __InputFormatting():
379
- def floatify_recursive(self, data):
380
- if isinstance(data, list):
381
- # Recursively process sublists and filter out None values
382
- processed_list = [self.floatify_recursive(item) for item in data]
383
- return [item for item in processed_list if item is not None]
384
- else:
385
- try:
386
- # Try to convert the item to float
387
- return np.float64(data)
388
- except (ValueError, TypeError):
389
- # If conversion fails, replace with None
390
- self.warning_flag_non_numeric_data = True
391
- return None
392
-
393
-
394
- class StatisticalAnalysis(__StatisticalTests, __NormalityTests, __TextFormatting, __InputFormatting, __Helpers):
7
+ class StatisticalAnalysis(StatisticalTests, NormalityTests, TextFormatting, Helpers):
395
8
  '''
396
9
  The main class
397
10
  *documentation placeholder*
@@ -403,6 +16,7 @@ class StatisticalAnalysis(__StatisticalTests, __NormalityTests, __TextFormatting
403
16
  paired=False,
404
17
  tails=2,
405
18
  popmean=None,
19
+ posthoc=False,
406
20
  verbose=True):
407
21
  self.results = None
408
22
  self.error = False
@@ -410,6 +24,7 @@ class StatisticalAnalysis(__StatisticalTests, __NormalityTests, __TextFormatting
410
24
  self.paired = paired
411
25
  self.tails = tails
412
26
  self.popmean = popmean
27
+ self.posthoc = posthoc
413
28
  self.verbose = verbose
414
29
  self.n_groups = len(self.groups_list)
415
30
  self.warning_flag_non_numeric_data = False
@@ -464,7 +79,7 @@ class StatisticalAnalysis(__StatisticalTests, __NormalityTests, __TextFormatting
464
79
  'no_pop_mean_set': '\nWarning: No Population Mean was set up for single-sample test, used default 0 value.\n The results might be skewed. \n Please, set the Population Mean and run the test again.\n',
465
80
  }
466
81
 
467
- def __run_test(self, test='auto'):
82
+ def run_test(self, test='auto'):
468
83
 
469
84
  # reset values from previous tests
470
85
  self.results = None
@@ -475,9 +90,11 @@ class StatisticalAnalysis(__StatisticalTests, __NormalityTests, __TextFormatting
475
90
  self.test_id = None
476
91
  self.test_stat = None
477
92
  self.p_value = None
93
+ self.posthoc_matrix_df = None
94
+ self.posthoc_matrix = []
478
95
 
479
96
  self.log('\n' + '-'*67)
480
- self.log('Statistical analysis initiated for data in {} groups\n'.format(
97
+ self.log('Statistical analysis __init__iated for data in {} groups\n'.format(
481
98
  len(self.groups_list)))
482
99
 
483
100
  # adjusting input data type
@@ -550,29 +167,13 @@ class StatisticalAnalysis(__StatisticalTests, __NormalityTests, __TextFormatting
550
167
  if not test == 'auto' and self.parametric and not test in self.test_ids_parametric:
551
168
  self.AddWarning('non-param_test_with_normal_data')
552
169
 
553
- if test == 'anova_1w_ordinary':
554
- self.anova_1w_ordinary()
555
- elif test == 'anova_1w_rm':
556
- self.anova_1w_rm()
557
- elif test == 'friedman':
558
- self.friedman_test()
559
- elif test == 'kruskal_wallis':
560
- self.kruskal_wallis_test()
561
- elif test == 'mann_whitney':
562
- self.mann_whitney_u_test()
563
- elif test == 't_test_independent':
564
- self.t_test_independent()
565
- elif test == 't_test_paired':
566
- self.t_test_paired()
567
- elif test == 't_test_single_sample':
568
- self.t_test_single_sample()
569
- elif test == 'wilcoxon':
570
- self.wilcoxon()
571
- elif test == 'wilcoxon_single_sample':
572
- self.wilcoxon_single_sample()
170
+ # run the test
171
+
172
+ if test in self.test_ids_all:
173
+ self.run_test_by_id(test)
573
174
  else:
574
- self.log('Automatic test selection preformed.')
575
- self.__auto()
175
+ self.run_test_auto()
176
+
576
177
 
577
178
  # print the results
578
179
  self.results = self.create_results_dict()
@@ -585,82 +186,49 @@ class StatisticalAnalysis(__StatisticalTests, __NormalityTests, __TextFormatting
585
186
  if self.verbose == True:
586
187
  print(self.summary)
587
188
 
588
- def __auto(self):
589
-
590
- if self.n_groups == 1:
591
- if self.parametric:
592
- return self.t_test_single_sample()
593
- else:
594
- return self.wilcoxon_single_sample()
595
-
596
- elif self.n_groups == 2:
597
- if self.paired:
598
- if self.parametric:
599
- return self.t_test_paired()
600
- else:
601
- return self.wilcoxon()
602
- else:
603
- if self.parametric:
604
- return self.t_test_independent()
605
- else:
606
- return self.mann_whitney_u_test()
607
-
608
- elif self.n_groups >= 3:
609
- if self.paired:
610
- if self.parametric:
611
- return self.anova_1w_rm()
612
- else:
613
- return self.friedman_test()
614
- else:
615
- if self.parametric:
616
- return self.anova_1w_ordinary()
617
- else:
618
- return self.kruskal_wallis_test()
619
189
 
620
- else:
621
- pass
622
190
 
623
191
  # public methods:
624
192
  def RunAuto(self):
625
- self.__run_test(test='auto')
193
+ self.run_test(test='auto')
626
194
 
627
195
  def RunManual(self, test):
628
- self.__run_test(test)
196
+ self.run_test(test)
629
197
 
630
198
  def RunOnewayAnova(self):
631
- self.__run_test(test='anova_1w_ordinary')
199
+ self.run_test(test='anova_1w_ordinary')
632
200
 
633
201
  def RunOnewayAnovaRM(self):
634
- self.__run_test(test='anova_1w_rm')
202
+ self.run_test(test='anova_1w_rm')
635
203
 
636
204
  def RunFriedman(self):
637
- self.__run_test(test='friedman')
205
+ self.run_test(test='friedman')
638
206
 
639
207
  def RunKruskalWallis(self):
640
- self.__run_test(test='kruskal_wallis')
208
+ self.run_test(test='kruskal_wallis')
641
209
 
642
210
  def RunMannWhitney(self):
643
- self.__run_test(test='mann_whitney')
211
+ self.run_test(test='mann_whitney')
644
212
 
645
213
  def RunTtest(self):
646
- self.__run_test(test='t_test_independent')
214
+ self.run_test(test='t_test_independent')
647
215
 
648
216
  def RunTtestPaired(self):
649
- self.__run_test(test='t_test_paired')
217
+ self.run_test(test='t_test_paired')
650
218
 
651
219
  def RunTtestSingleSample(self):
652
- self.__run_test(test='t_test_single_sample')
220
+ self.run_test(test='t_test_single_sample')
653
221
 
654
222
  def RunWilcoxonSingleSample(self):
655
- self.__run_test(test='wilcoxon_single_sample')
223
+ self.run_test(test='wilcoxon_single_sample')
656
224
 
657
225
  def RunWilcoxon(self):
658
- self.__run_test(test='wilcoxon')
226
+ self.run_test(test='wilcoxon')
659
227
 
660
228
  def GetResult(self):
661
229
  if not self.results and not self.error:
662
230
  print('No test chosen, no results to output')
663
- # self.__run_test(test='auto')
231
+ # self.run_test(test='auto')
664
232
  return self.results
665
233
  if not self.results and self.error:
666
234
  print('Error occured, no results to output')
@@ -671,7 +239,7 @@ class StatisticalAnalysis(__StatisticalTests, __NormalityTests, __TextFormatting
671
239
  def GetSummary(self):
672
240
  if not self.results and not self.error:
673
241
  print('No test chosen, no summary to output')
674
- # self.__run_test(test='auto')
242
+ # self.run_test(test='auto')
675
243
  return self.summary
676
244
  else:
677
245
  return self.summary
AutoStatLib/_version.py CHANGED
@@ -1,2 +1,2 @@
1
1
  # AutoStatLib package version:
2
- __version__ = "0.2.1"
2
+ __version__ = "0.2.5"
AutoStatLib/helpers.py ADDED
@@ -0,0 +1,77 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
+ class Helpers():
5
+
6
+ def matrix_to_dataframe(self, matrix):
7
+ data = []
8
+ cols = []
9
+ rows = []
10
+
11
+ order_number = 1
12
+ for i, row in enumerate(matrix):
13
+ for j, value in enumerate(row):
14
+ data.append(value)
15
+ cols.append(i)
16
+ rows.append(j)
17
+ order_number += 1
18
+
19
+ df = pd.DataFrame(
20
+ {'Row': rows, 'Col': cols, 'Value': data})
21
+ return df
22
+
23
+ def floatify_recursive(self, data):
24
+ if isinstance(data, list):
25
+ # Recursively process sublists and filter out None values
26
+ processed_list = [self.floatify_recursive(item) for item in data]
27
+ return [item for item in processed_list if item is not None]
28
+ else:
29
+ try:
30
+ # Try to convert the item to float
31
+ return np.float64(data)
32
+ except (ValueError, TypeError):
33
+ # If conversion fails, replace with None
34
+ self.warning_flag_non_numeric_data = True
35
+ return None
36
+
37
+ def create_results_dict(self) -> dict:
38
+
39
+ self.stars_int = self.make_stars(self.p_value.item())
40
+ self.stars_str = self.make_stars_printed(self.stars_int)
41
+
42
+ return {
43
+ 'p-value': self.make_p_value_printed(self.p_value.item()),
44
+ 'Significance(p<0.05)': True if self.p_value.item() < 0.05 else False,
45
+ 'Stars_Printed': self.stars_str,
46
+ 'Test_Name': self.test_name,
47
+ 'Groups_Compared': self.n_groups,
48
+ 'Population_Mean': self.popmean if self.n_groups == 1 else 'N/A',
49
+ 'Data_Normaly_Distributed': self.parametric,
50
+ 'Parametric_Test_Applied': True if self.test_id in self.test_ids_parametric else False,
51
+ 'Paired_Test_Applied': self.paired,
52
+ 'Tails': self.tails,
53
+ 'p-value_exact': self.p_value.item(),
54
+ 'Stars': self.stars_int,
55
+ # 'Stat_Value': self.test_stat.item(),
56
+ 'Warnings': self.warnings,
57
+ 'Groups_N': [len(self.data[i]) for i in range(len(self.data))],
58
+ 'Groups_Median': [np.median(self.data[i]).item() for i in range(len(self.data))],
59
+ 'Groups_Mean': [np.mean(self.data[i]).item() for i in range(len(self.data))],
60
+ 'Groups_SD': [np.std(self.data[i]).item() for i in range(len(self.data))],
61
+ 'Groups_SE': [np.std(self.data[i]).item() / np.sqrt(len(self.data)).item() for i in range(len(self.data))],
62
+ # actually returns list of lists of numpy dtypes of float64, next make it return regular floats:
63
+ 'Samples': self.data,
64
+ 'Posthoc_Matrix': self.posthoc_matrix if self.posthoc_matrix else 'N/A',
65
+ 'Posthoc_Matrix_printed': [[self.make_p_value_printed(element) for element in row] for row in self.posthoc_matrix] if self.posthoc_matrix else 'N/A',
66
+ 'Posthoc_Matrix_stars': [[self.make_stars_printed(self.make_stars(element)) for element in row] for row in self.posthoc_matrix] if self.posthoc_matrix else 'N/A',
67
+ }
68
+
69
+ def log(self, *args, **kwargs):
70
+ message = ' '.join(map(str, args))
71
+ # print(message, **kwargs)
72
+ self.summary += '\n' + message
73
+
74
+ def AddWarning(self, warning_id):
75
+ message = self.warning_ids_all[warning_id]
76
+ self.log(message)
77
+ self.warnings.append(message)
@@ -0,0 +1,85 @@
1
+ from statsmodels.stats.diagnostic import lilliefors
2
+ from scipy.stats import shapiro, normaltest, anderson
3
+
4
+
5
+ class NormalityTests():
6
+ '''
7
+ Normality tests mixin
8
+
9
+ see the article about minimal sample size for tests:
10
+ Power comparisons of Shapiro-Wilk, Kolmogorov-Smirnov,
11
+ Lilliefors and Anderson-Darling tests, Nornadiah Mohd Razali1, Yap Bee Wah1
12
+ '''
13
+
14
+ def check_normality(self, data):
15
+ sw = None
16
+ lf = None
17
+ ad = None
18
+ ap = None
19
+ n = len(data)
20
+
21
+ # Shapiro-Wilk test
22
+ sw_stat, sw_p_value = shapiro(data)
23
+ if sw_p_value > 0.05:
24
+ sw = True
25
+ else:
26
+ sw = False
27
+
28
+ # Lilliefors test
29
+ lf_stat, lf_p_value = lilliefors(
30
+ data, dist='norm')
31
+ if lf_p_value > 0.05:
32
+ lf = True
33
+ else:
34
+ lf = False
35
+
36
+ # Anderson-Darling test
37
+ if n >= 20:
38
+ ad_stat, ad_p_value = self.anderson_get_p(
39
+ data, dist='norm')
40
+ if ad_p_value > 0.05:
41
+ ad = True
42
+ else:
43
+ ad = False
44
+
45
+ # D'Agostino-Pearson test
46
+ # test result is skewed if n<20
47
+ if n >= 20:
48
+ ap_stat, ap_p_value = normaltest(data)
49
+ if ap_p_value > 0.05:
50
+ ap = True
51
+ else:
52
+ ap = False
53
+
54
+ # print(ap_p_value, ad_p_value, sw_p_value, lf_p_value)
55
+
56
+ return (sw, lf, ad, ap)
57
+
58
+ def anderson_get_p(self, data, dist='norm'):
59
+ '''
60
+ calculating p-value for Anderson-Darling test using the method described here:
61
+ Computation of Probability Associated with Anderson-Darling Statistic
62
+ Lorentz Jantschi and Sorana D. Bolboaca, 2018 - Mathematics
63
+
64
+ '''
65
+ e = 2.718281828459045
66
+ n = len(data)
67
+
68
+ ad, critical_values, significance_levels = anderson(
69
+ data, dist=dist)
70
+
71
+ # adjust ad_stat for small sample sizes:
72
+ s = ad*(1 + 0.75/n + 2.25/(n**2))
73
+
74
+ if s >= 0.6:
75
+ p = e**(1.2937 - 5.709*s + 0.0186*s**2)
76
+ elif s > 0.34:
77
+ p = e**(0.9177 - 4.279*s - 1.38*s**2)
78
+ elif s > 0.2:
79
+ p = 1 - e**(-8.318 + 42.796*s - 59.938*s**2)
80
+ elif s <= 0.2:
81
+ p = 1 - e**(-13.436 + 101.14*s - 223.73*s**2)
82
+ else:
83
+ p = None
84
+
85
+ return ad, p
@@ -0,0 +1,173 @@
1
+ import numpy as np
2
+ import scikit_posthocs as sp
3
+ from statsmodels.stats.anova import AnovaRM
4
+ from scipy.stats import ttest_rel, ttest_ind, ttest_1samp, wilcoxon, mannwhitneyu, f_oneway, kruskal, friedmanchisquare
5
+
6
+
7
+
8
+ class StatisticalTests():
9
+ '''
10
+ Statistical tests mixin
11
+ '''
12
+
13
+ def run_test_auto(self):
14
+
15
+ if self.n_groups == 1:
16
+ if self.parametric:
17
+ self.run_test_by_id('t_test_single_sample')
18
+ else:
19
+ self.run_test_by_id('wilcoxon_single_sample')
20
+
21
+ elif self.n_groups == 2:
22
+ if self.paired:
23
+ if self.parametric:
24
+ self.run_test_by_id('t_test_paired')
25
+ else:
26
+ self.run_test_by_id('wilcoxon')
27
+ else:
28
+ if self.parametric:
29
+ self.run_test_by_id('t_test_independent')
30
+ else:
31
+ self.run_test_by_id('mann_whitney')
32
+
33
+ elif self.n_groups >= 3:
34
+ if self.paired:
35
+ if self.parametric:
36
+ self.run_test_by_id('anova_1w_rm')
37
+ else:
38
+ self.run_test_by_id('friedman')
39
+ else:
40
+ if self.parametric:
41
+ self.run_test_by_id('anova_1w_ordinary')
42
+ else:
43
+ self.run_test_by_id('kruskal_wallis')
44
+
45
+ else:
46
+ pass
47
+
48
+ def run_test_by_id(self, test_id):
49
+
50
+ test_names_dict = {
51
+ 'anova_1w_ordinary': 'Ordinary One-Way ANOVA',
52
+ 'anova_1w_rm': 'Repeated Measures One-Way ANOVA',
53
+ 'friedman': 'Friedman test',
54
+ 'kruskal_wallis': 'Kruskal-Wallis test',
55
+ 'mann_whitney': 'Mann-Whitney U test',
56
+ 't_test_independent': 't-test for independent samples',
57
+ 't_test_paired': 't-test for paired samples',
58
+ 't_test_single_sample': 'Single-sample t-test',
59
+ 'wilcoxon': 'Wilcoxon signed-rank test',
60
+ 'wilcoxon_single_sample': 'Wilcoxon signed-rank test for single sample',
61
+ }
62
+
63
+ match test_id:
64
+ case 'anova_1w_ordinary': stat, p_value = self.anova_1w_ordinary()
65
+ case 'anova_1w_rm': stat, p_value = self.anova_1w_rm()
66
+ case 'friedman': stat, p_value = self.friedman()
67
+ case 'kruskal_wallis': stat, p_value = self.kruskal_wallis()
68
+ case 'mann_whitney': stat, p_value = self.mann_whitney()
69
+ case 't_test_independent': stat, p_value = self.t_test_independent()
70
+ case 't_test_paired': stat, p_value = self.t_test_paired()
71
+ case 't_test_single_sample': stat, p_value = self.t_test_single_sample()
72
+ case 'wilcoxon': stat, p_value = self.wilcoxon()
73
+ case 'wilcoxon_single_sample': stat, p_value = self.wilcoxon_single_sample()
74
+
75
+ if test_id in self.test_ids_dependent:
76
+ self.paired = True
77
+ else:
78
+ self.paired = False
79
+
80
+ self.test_name = test_names_dict[test_id]
81
+ self.test_id = test_id
82
+ self.test_stat = stat
83
+ self.p_value = p_value
84
+
85
+ def anova_1w_ordinary(self):
86
+ stat, p_value = f_oneway(*self.data)
87
+ self.tails = 2
88
+ # if self.tails == 1 and p_value > 0.5:
89
+ # p_value /= 2
90
+ # if self.tails == 1:
91
+ # p_value /= 2
92
+ return stat, p_value
93
+
94
+ def anova_1w_rm(self):
95
+ """
96
+ Perform repeated measures one-way ANOVA test.
97
+
98
+ Parameters:
99
+ data: list of lists, where each sublist represents repeated measures for a subject
100
+ """
101
+
102
+ df = self.matrix_to_dataframe(self.data)
103
+ res = AnovaRM(df, 'Value', 'Row', within=['Col']).fit()
104
+ stat = res.anova_table['F Value'][0]
105
+ p_value = res.anova_table['Pr > F'][0]
106
+
107
+ self.tails = 2
108
+ return stat, p_value
109
+
110
+ def friedman(self):
111
+ stat, p_value = friedmanchisquare(*self.data)
112
+ self.tails = 2
113
+ return stat, p_value
114
+
115
+ def kruskal_wallis(self):
116
+ stat, p_value = kruskal(*self.data)
117
+
118
+ # Perform Dunn's multiple comparisons if Kruskal-Wallis is significant
119
+ if p_value < 0.05 and self.posthoc:
120
+ self.posthoc_matrix = sp.posthoc_dunn(self.data, p_adjust='bonferroni').values.tolist()
121
+ return stat, p_value
122
+
123
+ def mann_whitney(self):
124
+ stat, p_value = mannwhitneyu(
125
+ self.data[0], self.data[1], alternative='two-sided')
126
+ if self.tails == 1:
127
+ p_value /= 2
128
+ # alternative method of one-tailed calculation
129
+ # gives the same result:
130
+ # stat, p_value = mannwhitneyu(
131
+ # self.data[0], self.data[1], alternative='two-sided' if self.tails == 2 else 'less')
132
+ # if self.tails == 1 and p_value > 0.5:
133
+ # p_value = 1-p_value
134
+ return stat, p_value
135
+
136
+ def t_test_independent(self):
137
+ stat, p_value = ttest_ind(
138
+ self.data[0], self.data[1])
139
+ if self.tails == 1:
140
+ p_value /= 2
141
+ return stat, p_value
142
+
143
+ def t_test_paired(self):
144
+ stat, p_value = ttest_rel(
145
+ self.data[0], self.data[1])
146
+ if self.tails == 1:
147
+ p_value /= 2
148
+ return stat, p_value
149
+
150
+ def t_test_single_sample(self):
151
+ if self.popmean == None:
152
+ self.popmean = 0
153
+ self.AddWarning('no_pop_mean_set')
154
+ stat, p_value = ttest_1samp(self.data[0], self.popmean)
155
+ if self.tails == 1:
156
+ p_value /= 2
157
+ return stat, p_value
158
+
159
+ def wilcoxon(self):
160
+ stat, p_value = wilcoxon(self.data[0], self.data[1])
161
+ if self.tails == 1:
162
+ p_value /= 2
163
+ return stat, p_value
164
+
165
+ def wilcoxon_single_sample(self):
166
+ if self.popmean == None:
167
+ self.popmean = 0
168
+ self.AddWarning('no_pop_mean_set')
169
+ data = [i - self.popmean for i in self.data[0]]
170
+ stat, p_value = wilcoxon(data)
171
+ if self.tails == 1:
172
+ p_value /= 2
173
+ return stat, p_value
@@ -0,0 +1,98 @@
1
+
2
+
3
+ class TextFormatting():
4
+ '''
5
+ Text formatting mixin
6
+ '''
7
+
8
+ def autospace(self, elements_list, space, delimiter=' ') -> str:
9
+ output = ''
10
+ for i, element in enumerate(elements_list):
11
+ if i == len(elements_list):
12
+ output += element
13
+ else:
14
+ output += element + (space-len(element))*delimiter
15
+ return output
16
+
17
+ def print_groups(self, space=24, max_length=15):
18
+ self.log('')
19
+ # Get the number of groups (rows) and the maximum length of rows
20
+ data = self.data
21
+ num_groups = len(data)
22
+ group_longest = max(len(row) for row in data)
23
+
24
+ # Print the header
25
+ header = [f'Group {i+1}' for i in range(num_groups)]
26
+ line = [''*7]
27
+ self.log(self.autospace(header, space))
28
+ self.log(self.autospace(line, space))
29
+
30
+ # Print each column with a placeholder if longer than max_length
31
+ for i in range(group_longest):
32
+ row_values = []
33
+ all_values_empty = True
34
+ for row in data:
35
+ if len(row) > max_length:
36
+ if i < max_length:
37
+ row_values.append(str(row[i]))
38
+ all_values_empty = False
39
+ elif i == max_length:
40
+ row_values.append(f'[{len(row) - max_length} more]')
41
+ all_values_empty = False
42
+ else:
43
+ continue
44
+ else:
45
+ if i < len(row):
46
+ row_values.append(str(row[i]))
47
+ all_values_empty = False
48
+ else:
49
+ row_values.append('')
50
+ if all_values_empty:
51
+ break
52
+ self.log(self.autospace(row_values, space))
53
+
54
+ def make_stars(self, p) -> int:
55
+ if p is not None:
56
+ if p < 0.0001:
57
+ return 4
58
+ if p < 0.001:
59
+ return 3
60
+ elif p < 0.01:
61
+ return 2
62
+ elif p < 0.05:
63
+ return 1
64
+ else:
65
+ return 0
66
+ return 0
67
+
68
+ def make_stars_printed(self, n) -> str:
69
+ return '*' * n if n else 'ns'
70
+
71
+ def make_p_value_printed(self, p) -> str:
72
+ if p is not None:
73
+ if p > 0.99:
74
+ return 'p>0.99'
75
+ elif p >= 0.01:
76
+ return f'p={p:.2g}'
77
+ elif p >= 0.001:
78
+ return f'p={p:.2g}'
79
+ elif p >= 0.0001:
80
+ return f'p={p:.1g}'
81
+ elif p < 0.0001:
82
+ return 'p<0.0001'
83
+ else:
84
+ return 'N/A'
85
+ return 'N/A'
86
+
87
+ def print_results(self):
88
+ self.log('\n\nResults: \n')
89
+ for i in self.results:
90
+ shift = 27 - len(i)
91
+ if i == 'Warnings':
92
+ self.log(i, ':', ' ' * shift, len(self.results[i]))
93
+ if i == 'Posthoc_Matrix':
94
+ self.log(i, ':', ' ' * shift, '{0}x{0} matrix'.format(len(self.results[i])))
95
+ elif i == 'Samples' or i == 'Posthoc_Matrix_printed' or i == 'Posthoc_Matrix_stars':
96
+ pass
97
+ else:
98
+ self.log(i, ':', ' ' * shift, self.results[i])
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: AutoStatLib
3
- Version: 0.2.1
3
+ Version: 0.2.5
4
4
  Summary: AutoStatLib - a simple statistical analysis tool
5
5
  Author: Stemonitis, SciWare LLC
6
6
  Author-email: konung-yaropolk <yaropolk1995@gmail.com>
@@ -531,6 +531,7 @@ License-File: LICENSE
531
531
  Requires-Dist: numpy
532
532
  Requires-Dist: scipy
533
533
  Requires-Dist: statsmodels
534
+ Requires-Dist: scikit-posthocs
534
535
  Requires-Dist: pandas
535
536
 
536
537
  # AutoStatLib - python library for automated statistical analysis
@@ -595,10 +596,11 @@ analysis.RunTtestPaired()
595
596
  analysis.RunWilcoxon()
596
597
 
597
598
  # 3 and more independent groups comparison:
598
- analysis.RunAnova()
599
+ analysis.RunOnewayAnova()
599
600
  analysis.RunKruskalWallis()
600
601
 
601
602
  # 3 and more dependent groups comparison:
603
+ analysis.RunOnewayAnovaRM()
602
604
  analysis.RunFriedman()
603
605
 
604
606
  # single group tests:
@@ -648,20 +650,42 @@ If errors occured, *GetResult()* returns an empty dictionary
648
650
 
649
651
 
650
652
  ---
651
- ## Pre-Alpha dev status.
652
-
653
- ### TODO:
654
-
655
- --Kruskal-Wallis test - add Dunn's multiple comparisons
656
- --Anova: add 2-way anova and 3-way(?)
657
-
658
- check:
659
- --Wilcoxon signed-rank test and Mann-whitney - check mechanism of one-tailed calc, looks like it works wrong
660
-
661
-
662
- checked tests:
663
- --Wilcoxon 2 tail - ok
664
- --Mann-whitney 2 tail - ok
653
+ ## Pre-Alpha dev status.
654
+
655
+ ### TODO:
656
+
657
+ -- Anova: posthocs
658
+ -- Anova: add 2-way anova and 3-way anova
659
+ -- one-way Anova: add repeated measures (for normal dependent values) with and without Geisser-Greenhouse correction
660
+ -- one-way Anova: add Brown-Forsythe and Welch (for normal independent values with unequal SDs between groups)
661
+ -- paired T-test: add ratio-paired t-test (ratios of paired values are consistent)
662
+ -- add Welch test (for norm data unequal variances)
663
+ -- add Kolmogorov-Smirnov test (unpaired nonparametric 2 sample, compare cumulative distributions)
664
+ -- add independent t-test with Welch correction (do not assume equal SDs in groups)
665
+ -- add correlation test, correlation diagram
666
+ -- add linear regression, regression diagram
667
+ -- add QQ plot
668
+ -- n-sample tests: add onetail option
669
+
670
+ ✅ done -- detailed normality test results
671
+ ✅ done -- added posthoc: Kruskal-Wallis Dunn's multiple comparisons
672
+
673
+
674
+ tests check:
675
+ 1-sample:
676
+ --Wilcoxon 2,1 tails - ok
677
+ --t-tests 2,1 tails -ok
678
+
679
+ 2-sample:
680
+ --Wilcoxon 2,1 tails - ok
681
+ --Mann-Whitney 2,1 tails - ok
682
+ --t-tests 2,1 tails -ok
683
+
684
+ n-sample:
685
+ --Kruskal-Wallis 2 tail - ok
686
+ --Dunn's multiple comparisons - ??
687
+ --Friedman 2 tail - ok
688
+ --one-way ANOVA 2 tail - ok
665
689
 
666
690
 
667
691
 
@@ -0,0 +1,13 @@
1
+ AutoStatLib/AutoStatLib.py,sha256=yPNnwCvHSSlEKQvtnoaLFDq6znPlXCz-CrzGInG-1Ys,9647
2
+ AutoStatLib/__init__.py,sha256=0wHYnglzKRPqSHtZlfbMEA2Bj5rDR4LLaXbOrJi-sqM,101
3
+ AutoStatLib/__main__.py,sha256=ROKWensrxDh3Gl-yhexJ-BYFohDSh9y-CuMkaLpmnnQ,247
4
+ AutoStatLib/_version.py,sha256=-QrGYOb9bx4vC_twSInOBJoijtj78lvUzV19y4-tH38,53
5
+ AutoStatLib/helpers.py,sha256=9Fj9pHlXSM3tGHF5L0-i6DilA9VZk6Re93ob_IRxsYg,3424
6
+ AutoStatLib/normality_tests.py,sha256=wvOmo6F7drnhhikoGltyQJC4OBk3PLCszY6ItJk1e0M,2385
7
+ AutoStatLib/statistical_tests.py,sha256=LDcBRkq56hepR23RZtbBnZOs9k9frVjmiB2EKiEkCYs,5990
8
+ AutoStatLib/text_formatting.py,sha256=ShE4BRO69lsC1VT3SsYrmPkuvW7QnyfHVPZEbjNQ_hI,3250
9
+ AutoStatLib-0.2.5.dist-info/LICENSE,sha256=IMF9i4xIpgCADf0U-V1cuf9HBmqWQd3qtI3FSuyW4zE,26526
10
+ AutoStatLib-0.2.5.dist-info/METADATA,sha256=qJxSrqHlL0wsqaH-ah6MAJa15ikH4NCco1dyVxuNlWs,36572
11
+ AutoStatLib-0.2.5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
12
+ AutoStatLib-0.2.5.dist-info/top_level.txt,sha256=BuHzVyE2andc7RwD_UPmDjLl9CUAyBH6WHZGjaIReUI,12
13
+ AutoStatLib-0.2.5.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- AutoStatLib/AutoStatLib.py,sha256=_Id6bJb1OmGpUyfB0ho6-2F9S_8YO8euMB-prLjfpPI,23976
2
- AutoStatLib/__init__.py,sha256=0wHYnglzKRPqSHtZlfbMEA2Bj5rDR4LLaXbOrJi-sqM,101
3
- AutoStatLib/__main__.py,sha256=ROKWensrxDh3Gl-yhexJ-BYFohDSh9y-CuMkaLpmnnQ,247
4
- AutoStatLib/_version.py,sha256=jkitUHmog4Z-O5_8BUMHBBb92A758Kea22juu9b2a2Q,53
5
- AutoStatLib-0.2.1.dist-info/LICENSE,sha256=IMF9i4xIpgCADf0U-V1cuf9HBmqWQd3qtI3FSuyW4zE,26526
6
- AutoStatLib-0.2.1.dist-info/METADATA,sha256=PFpKRRElBXYciMgOuMRI8wsoCKkX9iiwMVNJthvC_3A,35569
7
- AutoStatLib-0.2.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
8
- AutoStatLib-0.2.1.dist-info/top_level.txt,sha256=BuHzVyE2andc7RwD_UPmDjLl9CUAyBH6WHZGjaIReUI,12
9
- AutoStatLib-0.2.1.dist-info/RECORD,,