AutoStatLib 0.2.2__py3-none-any.whl → 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of AutoStatLib might be problematic. Click here for more details.

@@ -1,428 +1,10 @@
1
- import numpy as np
2
- import pandas as pd
3
- from statsmodels.stats.diagnostic import lilliefors
4
- from statsmodels.stats.anova import AnovaRM
5
- from scipy.stats import ttest_rel, ttest_ind, ttest_1samp, wilcoxon, mannwhitneyu, f_oneway, kruskal, friedmanchisquare, shapiro, anderson, normaltest
1
+ from AutoStatLib.statistical_tests import StatisticalTests
2
+ from AutoStatLib.normality_tests import NormalityTests
3
+ from AutoStatLib.helpers import Helpers
4
+ from AutoStatLib.text_formatting import TextFormatting
6
5
 
7
6
 
8
- class __StatisticalTests():
9
- '''
10
- Statistical tests mixin
11
- '''
12
-
13
- def run_test_auto(self):
14
-
15
- if self.n_groups == 1:
16
- if self.parametric:
17
- self.run_test_by_id('t_test_single_sample')
18
- else:
19
- self.run_test_by_id('wilcoxon_single_sample')
20
-
21
- elif self.n_groups == 2:
22
- if self.paired:
23
- if self.parametric:
24
- self.run_test_by_id('t_test_paired')
25
- else:
26
- self.run_test_by_id('wilcoxon')
27
- else:
28
- if self.parametric:
29
- self.run_test_by_id('t_test_independent')
30
- else:
31
- self.run_test_by_id('mann_whitney')
32
-
33
- elif self.n_groups >= 3:
34
- if self.paired:
35
- if self.parametric:
36
- self.run_test_by_id('anova_1w_rm')
37
- else:
38
- self.run_test_by_id('friedman')
39
- else:
40
- if self.parametric:
41
- self.run_test_by_id('anova_1w_ordinary')
42
- else:
43
- self.run_test_by_id('kruskal_wallis')
44
-
45
- else:
46
- pass
47
-
48
- def run_test_by_id(self, test_id):
49
-
50
- test_names_dict = {
51
- 'anova_1w_ordinary': 'Ordinary One-Way ANOVA',
52
- 'anova_1w_rm': 'Repeated Measures One-Way ANOVA',
53
- 'friedman': 'Friedman test',
54
- 'kruskal_wallis': 'Kruskal-Wallis test',
55
- 'mann_whitney': 'Mann-Whitney U test',
56
- 't_test_independent': 't-test for independent samples',
57
- 't_test_paired': 't-test for paired samples',
58
- 't_test_single_sample': 'Single-sample t-test',
59
- 'wilcoxon': 'Wilcoxon signed-rank test',
60
- 'wilcoxon_single_sample': 'Wilcoxon signed-rank test for single sample',
61
- }
62
-
63
- match test_id:
64
- case 'anova_1w_ordinary': stat, p_value = self.anova_1w_ordinary()
65
- case 'anova_1w_rm': stat, p_value = self.anova_1w_rm()
66
- case 'friedman': stat, p_value = self.friedman()
67
- case 'kruskal_wallis': stat, p_value = self.kruskal_wallis()
68
- case 'mann_whitney': stat, p_value = self.mann_whitney()
69
- case 't_test_independent': stat, p_value = self.t_test_independent()
70
- case 't_test_paired': stat, p_value = self.t_test_paired()
71
- case 't_test_single_sample': stat, p_value = self.t_test_single_sample()
72
- case 'wilcoxon': stat, p_value = self.wilcoxon()
73
- case 'wilcoxon_single_sample': stat, p_value = self.wilcoxon_single_sample()
74
-
75
- if test_id in self.test_ids_dependent:
76
- self.paired = True
77
- else:
78
- self.paired = False
79
-
80
- self.test_name = test_names_dict[test_id]
81
- self.test_id = test_id
82
- self.test_stat = stat
83
- self.p_value = p_value
84
-
85
- def anova_1w_ordinary(self):
86
- stat, p_value = f_oneway(*self.data)
87
- self.tails = 2
88
- # if self.tails == 1 and p_value > 0.5:
89
- # p_value /= 2
90
- # if self.tails == 1:
91
- # p_value /= 2
92
- return stat, p_value
93
-
94
- def anova_1w_rm(self):
95
- """
96
- Perform repeated measures one-way ANOVA test.
97
-
98
- Parameters:
99
- data: list of lists, where each sublist represents repeated measures for a subject
100
- """
101
-
102
- df = self.matrix_to_dataframe(self.data)
103
- res = AnovaRM(df, 'Value', 'Row', within=['Col']).fit()
104
- stat = res.anova_table['F Value'][0]
105
- p_value = res.anova_table['Pr > F'][0]
106
-
107
- self.tails = 2
108
- return stat, p_value
109
-
110
- def friedman(self):
111
- stat, p_value = friedmanchisquare(*self.data)
112
- self.tails = 2
113
- return stat, p_value
114
-
115
- def kruskal_wallis(self):
116
- stat, p_value = kruskal(*self.data)
117
- return stat, p_value
118
-
119
- def mann_whitney(self):
120
- stat, p_value = mannwhitneyu(
121
- self.data[0], self.data[1], alternative='two-sided')
122
- if self.tails == 1:
123
- p_value /= 2
124
- # alternative method of one-tailed calculation
125
- # gives the same result:
126
- # stat, p_value = mannwhitneyu(
127
- # self.data[0], self.data[1], alternative='two-sided' if self.tails == 2 else 'less')
128
- # if self.tails == 1 and p_value > 0.5:
129
- # p_value = 1-p_value
130
- return stat, p_value
131
-
132
- def t_test_independent(self):
133
- stat, p_value = ttest_ind(
134
- self.data[0], self.data[1])
135
- if self.tails == 1:
136
- p_value /= 2
137
- return stat, p_value
138
-
139
- def t_test_paired(self):
140
- stat, p_value = ttest_rel(
141
- self.data[0], self.data[1])
142
- if self.tails == 1:
143
- p_value /= 2
144
- return stat, p_value
145
-
146
- def t_test_single_sample(self):
147
- if self.popmean == None:
148
- self.popmean = 0
149
- self.AddWarning('no_pop_mean_set')
150
- stat, p_value = ttest_1samp(self.data[0], self.popmean)
151
- if self.tails == 1:
152
- p_value /= 2
153
- return stat, p_value
154
-
155
- def wilcoxon(self):
156
- stat, p_value = wilcoxon(self.data[0], self.data[1])
157
- if self.tails == 1:
158
- p_value /= 2
159
- return stat, p_value
160
-
161
- def wilcoxon_single_sample(self):
162
- if self.popmean == None:
163
- self.popmean = 0
164
- self.AddWarning('no_pop_mean_set')
165
- data = [i - self.popmean for i in self.data[0]]
166
- stat, p_value = wilcoxon(data)
167
- if self.tails == 1:
168
- p_value /= 2
169
- return stat, p_value
170
-
171
-
172
- class __NormalityTests():
173
- '''
174
- Normality tests mixin
175
-
176
- see the article about minimal sample size for tests:
177
- Power comparisons of Shapiro-Wilk, Kolmogorov-Smirnov,
178
- Lilliefors and Anderson-Darling tests, Nornadiah Mohd Razali1, Yap Bee Wah1
179
- '''
180
-
181
- def check_normality(self, data):
182
- sw = None
183
- lf = None
184
- ad = None
185
- ap = None
186
- n = len(data)
187
-
188
- # Shapiro-Wilk test
189
- sw_stat, sw_p_value = shapiro(data)
190
- if sw_p_value > 0.05:
191
- sw = True
192
- else:
193
- sw = False
194
-
195
- # Lilliefors test
196
- lf_stat, lf_p_value = lilliefors(
197
- data, dist='norm')
198
- if lf_p_value > 0.05:
199
- lf = True
200
- else:
201
- lf = False
202
-
203
- # Anderson-Darling test
204
- if n >= 20:
205
- ad_stat, ad_p_value = self.anderson_get_p(
206
- data, dist='norm')
207
- if ad_p_value > 0.05:
208
- ad = True
209
- else:
210
- ad = False
211
-
212
- # D'Agostino-Pearson test
213
- # test result is skewed if n<20
214
- if n >= 20:
215
- ap_stat, ap_p_value = normaltest(data)
216
- if ap_p_value > 0.05:
217
- ap = True
218
- else:
219
- ap = False
220
-
221
- # print(ap_p_value, ad_p_value, sw_p_value, lf_p_value)
222
-
223
- return (sw, lf, ad, ap)
224
-
225
- def anderson_get_p(self, data, dist='norm'):
226
- '''
227
- calculating p-value for Anderson-Darling test using the method described here:
228
- Computation of Probability Associated with Anderson-Darling Statistic
229
- Lorentz Jantschi and Sorana D. Bolboaca, 2018 - Mathematics
230
-
231
- '''
232
- e = 2.718281828459045
233
- n = len(data)
234
-
235
- ad, critical_values, significance_levels = anderson(
236
- data, dist=dist)
237
-
238
- # adjust ad_stat for small sample sizes:
239
- s = ad*(1 + 0.75/n + 2.25/(n**2))
240
-
241
- if s >= 0.6:
242
- p = e**(1.2937 - 5.709*s + 0.0186*s**2)
243
- elif s > 0.34:
244
- p = e**(0.9177 - 4.279*s - 1.38*s**2)
245
- elif s > 0.2:
246
- p = 1 - e**(-8.318 + 42.796*s - 59.938*s**2)
247
- elif s <= 0.2:
248
- p = 1 - e**(-13.436 + 101.14*s - 223.73*s**2)
249
- else:
250
- p = None
251
-
252
- return ad, p
253
-
254
-
255
- class __Helpers():
256
-
257
- def matrix_to_dataframe(self, matrix):
258
- data = []
259
- cols = []
260
- rows = []
261
-
262
- order_number = 1
263
- for i, row in enumerate(matrix):
264
- for j, value in enumerate(row):
265
- data.append(value)
266
- cols.append(i)
267
- rows.append(j)
268
- order_number += 1
269
-
270
- df = pd.DataFrame(
271
- {'Row': rows, 'Col': cols, 'Value': data})
272
- return df
273
-
274
- def create_results_dict(self) -> dict:
275
-
276
- self.stars_int = self.make_stars()
277
- self.stars_str = '*' * self.stars_int if self.stars_int else 'ns'
278
-
279
- return {
280
- 'p-value': self.make_p_value_printed(),
281
- 'Significance(p<0.05)': True if self.p_value.item() < 0.05 else False,
282
- 'Stars_Printed': self.stars_str,
283
- 'Test_Name': self.test_name,
284
- 'Groups_Compared': self.n_groups,
285
- 'Population_Mean': self.popmean if self.n_groups == 1 else 'N/A',
286
- 'Data_Normaly_Distributed': self.parametric,
287
- 'Parametric_Test_Applied': True if self.test_id in self.test_ids_parametric else False,
288
- 'Paired_Test_Applied': self.paired,
289
- 'Tails': self.tails,
290
- 'p-value_exact': self.p_value.item(),
291
- 'Stars': self.stars_int,
292
- # 'Stat_Value': self.test_stat.item(),
293
- 'Warnings': self.warnings,
294
- 'Groups_N': [len(self.data[i]) for i in range(len(self.data))],
295
- 'Groups_Median': [np.median(self.data[i]).item() for i in range(len(self.data))],
296
- 'Groups_Mean': [np.mean(self.data[i]).item() for i in range(len(self.data))],
297
- 'Groups_SD': [np.std(self.data[i]).item() for i in range(len(self.data))],
298
- 'Groups_SE': [np.std(self.data[i]).item() / np.sqrt(len(self.data)).item() for i in range(len(self.data))],
299
- # actually returns list of lists of numpy dtypes of float64, next make it return regular floats:
300
- 'Samples': self.data,
301
- }
302
-
303
- def log(self, *args, **kwargs):
304
- message = ' '.join(map(str, args))
305
- # print(message, **kwargs)
306
- self.summary += '\n' + message
307
-
308
- def AddWarning(self, warning_id):
309
- message = self.warning_ids_all[warning_id]
310
- self.log(message)
311
- self.warnings.append(message)
312
-
313
-
314
- class __TextFormatting():
315
- '''
316
- Text formatting mixin
317
- '''
318
-
319
- def autospace(self, elements_list, space, delimiter=' ') -> str:
320
- output = ''
321
- for i, element in enumerate(elements_list):
322
- if i == len(elements_list):
323
- output += element
324
- else:
325
- output += element + (space-len(element))*delimiter
326
- return output
327
-
328
- def print_groups(self, space=24, max_length=15):
329
- self.log('')
330
- # Get the number of groups (rows) and the maximum length of rows
331
- data = self.data
332
- num_groups = len(data)
333
- group_longest = max(len(row) for row in data)
334
-
335
- # Print the header
336
- header = [f'Group {i+1}' for i in range(num_groups)]
337
- line = [''*7]
338
- self.log(self.autospace(header, space))
339
- self.log(self.autospace(line, space))
340
-
341
- # Print each column with a placeholder if longer than max_length
342
- for i in range(group_longest):
343
- row_values = []
344
- all_values_empty = True
345
- for row in data:
346
- if len(row) > max_length:
347
- if i < max_length:
348
- row_values.append(str(row[i]))
349
- all_values_empty = False
350
- elif i == max_length:
351
- row_values.append(f'[{len(row) - max_length} more]')
352
- all_values_empty = False
353
- else:
354
- continue
355
- else:
356
- if i < len(row):
357
- row_values.append(str(row[i]))
358
- all_values_empty = False
359
- else:
360
- row_values.append('')
361
- if all_values_empty:
362
- break
363
- self.log(self.autospace(row_values, space))
364
-
365
- def make_stars(self) -> int:
366
- p = self.p_value.item()
367
- if p is not None:
368
- if p < 0.0001:
369
- return 4
370
- if p < 0.001:
371
- return 3
372
- elif p < 0.01:
373
- return 2
374
- elif p < 0.05:
375
- return 1
376
- else:
377
- return 0
378
- return 0
379
-
380
- def make_p_value_printed(self) -> str:
381
- p = self.p_value.item()
382
- if p is not None:
383
- if p > 0.99:
384
- return 'p>0.99'
385
- elif p >= 0.01:
386
- return f'p={p:.2g}'
387
- elif p >= 0.001:
388
- return f'p={p:.2g}'
389
- elif p >= 0.0001:
390
- return f'p={p:.1g}'
391
- elif p < 0.0001:
392
- return 'p<0.0001'
393
- else:
394
- return 'N/A'
395
- return 'N/A'
396
-
397
- def print_results(self):
398
- self.log('\n\nResults: \n')
399
- for i in self.results:
400
- shift = 27 - len(i)
401
- if i == 'Warnings':
402
- self.log(i, ':', ' ' * shift, len(self.results[i]))
403
- elif i == 'Samples':
404
- pass
405
- else:
406
- self.log(i, ':', ' ' * shift, self.results[i])
407
-
408
-
409
- class __InputFormatting():
410
- def floatify_recursive(self, data):
411
- if isinstance(data, list):
412
- # Recursively process sublists and filter out None values
413
- processed_list = [self.floatify_recursive(item) for item in data]
414
- return [item for item in processed_list if item is not None]
415
- else:
416
- try:
417
- # Try to convert the item to float
418
- return np.float64(data)
419
- except (ValueError, TypeError):
420
- # If conversion fails, replace with None
421
- self.warning_flag_non_numeric_data = True
422
- return None
423
-
424
-
425
- class StatisticalAnalysis(__StatisticalTests, __NormalityTests, __TextFormatting, __InputFormatting, __Helpers):
7
+ class StatisticalAnalysis(StatisticalTests, NormalityTests, TextFormatting, Helpers):
426
8
  '''
427
9
  The main class
428
10
  *documentation placeholder*
@@ -434,6 +16,7 @@ class StatisticalAnalysis(__StatisticalTests, __NormalityTests, __TextFormatting
434
16
  paired=False,
435
17
  tails=2,
436
18
  popmean=None,
19
+ posthoc=True,
437
20
  verbose=True):
438
21
  self.results = None
439
22
  self.error = False
@@ -441,6 +24,7 @@ class StatisticalAnalysis(__StatisticalTests, __NormalityTests, __TextFormatting
441
24
  self.paired = paired
442
25
  self.tails = tails
443
26
  self.popmean = popmean
27
+ self.posthoc = posthoc
444
28
  self.verbose = verbose
445
29
  self.n_groups = len(self.groups_list)
446
30
  self.warning_flag_non_numeric_data = False
@@ -495,7 +79,7 @@ class StatisticalAnalysis(__StatisticalTests, __NormalityTests, __TextFormatting
495
79
  'no_pop_mean_set': '\nWarning: No Population Mean was set up for single-sample test, used default 0 value.\n The results might be skewed. \n Please, set the Population Mean and run the test again.\n',
496
80
  }
497
81
 
498
- def __run_test(self, test='auto'):
82
+ def run_test(self, test='auto'):
499
83
 
500
84
  # reset values from previous tests
501
85
  self.results = None
@@ -506,9 +90,12 @@ class StatisticalAnalysis(__StatisticalTests, __NormalityTests, __TextFormatting
506
90
  self.test_id = None
507
91
  self.test_stat = None
508
92
  self.p_value = None
93
+ self.posthoc_matrix_df = None
94
+ self.posthoc_matrix = []
95
+ self.posthoc_name = None
509
96
 
510
97
  self.log('\n' + '-'*67)
511
- self.log('Statistical analysis initiated for data in {} groups\n'.format(
98
+ self.log('Statistical analysis __init__iated for data in {} groups\n'.format(
512
99
  len(self.groups_list)))
513
100
 
514
101
  # adjusting input data type
@@ -588,7 +175,6 @@ class StatisticalAnalysis(__StatisticalTests, __NormalityTests, __TextFormatting
588
175
  else:
589
176
  self.run_test_auto()
590
177
 
591
-
592
178
  # print the results
593
179
  self.results = self.create_results_dict()
594
180
  self.print_results()
@@ -600,49 +186,48 @@ class StatisticalAnalysis(__StatisticalTests, __NormalityTests, __TextFormatting
600
186
  if self.verbose == True:
601
187
  print(self.summary)
602
188
 
603
-
604
-
605
189
  # public methods:
190
+
606
191
  def RunAuto(self):
607
- self.__run_test(test='auto')
192
+ self.run_test(test='auto')
608
193
 
609
194
  def RunManual(self, test):
610
- self.__run_test(test)
195
+ self.run_test(test)
611
196
 
612
197
  def RunOnewayAnova(self):
613
- self.__run_test(test='anova_1w_ordinary')
198
+ self.run_test(test='anova_1w_ordinary')
614
199
 
615
200
  def RunOnewayAnovaRM(self):
616
- self.__run_test(test='anova_1w_rm')
201
+ self.run_test(test='anova_1w_rm')
617
202
 
618
203
  def RunFriedman(self):
619
- self.__run_test(test='friedman')
204
+ self.run_test(test='friedman')
620
205
 
621
206
  def RunKruskalWallis(self):
622
- self.__run_test(test='kruskal_wallis')
207
+ self.run_test(test='kruskal_wallis')
623
208
 
624
209
  def RunMannWhitney(self):
625
- self.__run_test(test='mann_whitney')
210
+ self.run_test(test='mann_whitney')
626
211
 
627
212
  def RunTtest(self):
628
- self.__run_test(test='t_test_independent')
213
+ self.run_test(test='t_test_independent')
629
214
 
630
215
  def RunTtestPaired(self):
631
- self.__run_test(test='t_test_paired')
216
+ self.run_test(test='t_test_paired')
632
217
 
633
218
  def RunTtestSingleSample(self):
634
- self.__run_test(test='t_test_single_sample')
219
+ self.run_test(test='t_test_single_sample')
635
220
 
636
221
  def RunWilcoxonSingleSample(self):
637
- self.__run_test(test='wilcoxon_single_sample')
222
+ self.run_test(test='wilcoxon_single_sample')
638
223
 
639
224
  def RunWilcoxon(self):
640
- self.__run_test(test='wilcoxon')
225
+ self.run_test(test='wilcoxon')
641
226
 
642
227
  def GetResult(self):
643
228
  if not self.results and not self.error:
644
229
  print('No test chosen, no results to output')
645
- # self.__run_test(test='auto')
230
+ # self.run_test(test='auto')
646
231
  return self.results
647
232
  if not self.results and self.error:
648
233
  print('Error occured, no results to output')
@@ -653,7 +238,7 @@ class StatisticalAnalysis(__StatisticalTests, __NormalityTests, __TextFormatting
653
238
  def GetSummary(self):
654
239
  if not self.results and not self.error:
655
240
  print('No test chosen, no summary to output')
656
- # self.__run_test(test='auto')
241
+ # self.run_test(test='auto')
657
242
  return self.summary
658
243
  else:
659
244
  return self.summary
AutoStatLib/_version.py CHANGED
@@ -1,2 +1,2 @@
1
1
  # AutoStatLib package version:
2
- __version__ = "0.2.2"
2
+ __version__ = "0.2.6"
AutoStatLib/helpers.py ADDED
@@ -0,0 +1,80 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
+
5
+ class Helpers():
6
+
7
+ def matrix_to_dataframe(self, matrix):
8
+ data = []
9
+ cols = []
10
+ rows = []
11
+
12
+ order_number = 1
13
+ for i, row in enumerate(matrix):
14
+ for j, value in enumerate(row):
15
+ data.append(value)
16
+ cols.append(i)
17
+ rows.append(j)
18
+ order_number += 1
19
+
20
+ df = pd.DataFrame(
21
+ {'Row': rows, 'Col': cols, 'Value': data})
22
+ return df
23
+
24
+ def floatify_recursive(self, data):
25
+ if isinstance(data, list):
26
+ # Recursively process sublists and filter out None values
27
+ processed_list = [self.floatify_recursive(item) for item in data]
28
+ return [item for item in processed_list if item is not None]
29
+ else:
30
+ try:
31
+ # Try to convert the item to float
32
+ return np.float64(data)
33
+ except (ValueError, TypeError):
34
+ # If conversion fails, replace with None
35
+ self.warning_flag_non_numeric_data = True
36
+ return None
37
+
38
+ def create_results_dict(self) -> dict:
39
+
40
+ self.stars_int = self.make_stars(self.p_value.item())
41
+ self.stars_str = self.make_stars_printed(self.stars_int)
42
+
43
+ return {
44
+ 'p-value': self.make_p_value_printed(self.p_value.item()),
45
+ 'Significance(p<0.05)': True if self.p_value.item() < 0.05 else False,
46
+ 'Stars_Printed': self.stars_str,
47
+ 'Test_Name': self.test_name,
48
+ 'Groups_Compared': self.n_groups,
49
+ 'Population_Mean': self.popmean if self.n_groups == 1 else 'N/A',
50
+ 'Data_Normaly_Distributed': self.parametric,
51
+ 'Parametric_Test_Applied': True if self.test_id in self.test_ids_parametric else False,
52
+ 'Paired_Test_Applied': self.paired,
53
+ 'Tails': self.tails,
54
+ 'p-value_exact': self.p_value.item(),
55
+ 'Stars': self.stars_int,
56
+ # 'Stat_Value': self.test_stat.item(),
57
+ 'Warnings': self.warnings,
58
+ 'Groups_N': [len(self.data[i]) for i in range(len(self.data))],
59
+ 'Groups_Median': [np.median(self.data[i]).item() for i in range(len(self.data))],
60
+ 'Groups_Mean': [np.mean(self.data[i]).item() for i in range(len(self.data))],
61
+ 'Groups_SD': [np.std(self.data[i]).item() for i in range(len(self.data))],
62
+ 'Groups_SE': [np.std(self.data[i]).item() / np.sqrt(len(self.data)).item() for i in range(len(self.data))],
63
+ # actually returns list of lists of numpy dtypes of float64, next make it return regular floats:
64
+ 'Samples': self.data,
65
+ 'Posthoc_Tests_Name': self.posthoc_name if self.posthoc_name is not None else '',
66
+ 'Posthoc_Matrix': self.posthoc_matrix if self.posthoc_matrix else [],
67
+ 'Posthoc_Matrix_bool': [[bool(element) for element in row] for row in self.posthoc_matrix] if self.posthoc_matrix else [],
68
+ 'Posthoc_Matrix_printed': [[self.make_p_value_printed(element) for element in row] for row in self.posthoc_matrix] if self.posthoc_matrix else [],
69
+ 'Posthoc_Matrix_stars': [[self.make_stars_printed(self.make_stars(element)) for element in row] for row in self.posthoc_matrix] if self.posthoc_matrix else [],
70
+ }
71
+
72
+ def log(self, *args, **kwargs):
73
+ message = ' '.join(map(str, args))
74
+ # print(message, **kwargs)
75
+ self.summary += '\n' + message
76
+
77
+ def AddWarning(self, warning_id):
78
+ message = self.warning_ids_all[warning_id]
79
+ self.log(message)
80
+ self.warnings.append(message)
@@ -0,0 +1,83 @@
1
+ from statsmodels.stats.diagnostic import lilliefors
2
+ from scipy.stats import shapiro, normaltest, anderson
3
+
4
+
5
+ class NormalityTests():
6
+ '''
7
+ Normality tests mixin
8
+
9
+ see the article about minimal sample size for tests:
10
+ Power comparisons of Shapiro-Wilk, Kolmogorov-Smirnov,
11
+ Lilliefors and Anderson-Darling tests, Nornadiah Mohd Razali1, Yap Bee Wah1
12
+ '''
13
+
14
+ def check_normality(self, data):
15
+ sw = None
16
+ lf = None
17
+ ad = None
18
+ ap = None
19
+ n = len(data)
20
+
21
+ # Shapiro-Wilk test
22
+ sw_stat, sw_p_value = shapiro(data)
23
+ if sw_p_value and sw_p_value > 0.05:
24
+ sw = True
25
+ else:
26
+ sw = False
27
+
28
+ # Lilliefors test
29
+ lf_stat, lf_p_value = lilliefors(
30
+ data, dist='norm')
31
+ if lf_p_value and lf_p_value > 0.05:
32
+ lf = True
33
+ else:
34
+ lf = False
35
+
36
+ # Anderson-Darling test
37
+ if n >= 20:
38
+ ad_stat, ad_p_value = self.anderson_get_p(
39
+ data, dist='norm')
40
+ if ad_p_value and ad_p_value > 0.05:
41
+ ad = True
42
+ else:
43
+ ad = False
44
+
45
+ # D'Agostino-Pearson test
46
+ # test result is skewed if n<20
47
+ if n >= 20:
48
+ ap_stat, ap_p_value = normaltest(data)
49
+ if ap_p_value and ap_p_value > 0.05:
50
+ ap = True
51
+ else:
52
+ ap = False
53
+
54
+ return (sw, lf, ad, ap)
55
+
56
+ def anderson_get_p(self, data, dist='norm'):
57
+ '''
58
+ calculating p-value for Anderson-Darling test using the method described here:
59
+ Computation of Probability Associated with Anderson-Darling Statistic
60
+ Lorentz Jantschi and Sorana D. Bolboaca, 2018 - Mathematics
61
+
62
+ '''
63
+ e = 2.718281828459045
64
+ n = len(data)
65
+
66
+ ad, critical_values, significance_levels = anderson(
67
+ data, dist=dist)
68
+
69
+ # adjust ad_stat for small sample sizes:
70
+ s = ad*(1 + 0.75/n + 2.25/(n**2))
71
+
72
+ if s >= 0.6:
73
+ p = e**(1.2937 - 5.709*s + 0.0186*s**2)
74
+ elif s > 0.34:
75
+ p = e**(0.9177 - 4.279*s - 1.38*s**2)
76
+ elif s > 0.2:
77
+ p = 1 - e**(-8.318 + 42.796*s - 59.938*s**2)
78
+ elif s <= 0.2:
79
+ p = 1 - e**(-13.436 + 101.14*s - 223.73*s**2)
80
+ else:
81
+ p = None
82
+
83
+ return ad, p
@@ -0,0 +1,184 @@
1
+ import numpy as np
2
+ import scikit_posthocs as sp
3
+ from statsmodels.stats.anova import AnovaRM
4
+ from statsmodels.stats.multicomp import pairwise_tukeyhsd
5
+ from scipy.stats import ttest_rel, ttest_ind, ttest_1samp, wilcoxon, mannwhitneyu, f_oneway, kruskal, friedmanchisquare
6
+
7
+
8
+ class StatisticalTests():
9
+ '''
10
+ Statistical tests mixin
11
+ '''
12
+
13
+ def run_test_auto(self):
14
+
15
+ if self.n_groups == 1:
16
+ if self.parametric:
17
+ self.run_test_by_id('t_test_single_sample')
18
+ else:
19
+ self.run_test_by_id('wilcoxon_single_sample')
20
+
21
+ elif self.n_groups == 2:
22
+ if self.paired:
23
+ if self.parametric:
24
+ self.run_test_by_id('t_test_paired')
25
+ else:
26
+ self.run_test_by_id('wilcoxon')
27
+ else:
28
+ if self.parametric:
29
+ self.run_test_by_id('t_test_independent')
30
+ else:
31
+ self.run_test_by_id('mann_whitney')
32
+
33
+ elif self.n_groups >= 3:
34
+ if self.paired:
35
+ if self.parametric:
36
+ self.run_test_by_id('anova_1w_rm')
37
+ else:
38
+ self.run_test_by_id('friedman')
39
+ else:
40
+ if self.parametric:
41
+ self.run_test_by_id('anova_1w_ordinary')
42
+ else:
43
+ self.run_test_by_id('kruskal_wallis')
44
+
45
+ else:
46
+ pass
47
+
48
+ def run_test_by_id(self, test_id):
49
+
50
+ test_names_dict = {
51
+ 'anova_1w_ordinary': 'Ordinary One-Way ANOVA',
52
+ 'anova_1w_rm': 'Repeated Measures One-Way ANOVA',
53
+ 'friedman': 'Friedman test',
54
+ 'kruskal_wallis': 'Kruskal-Wallis test',
55
+ 'mann_whitney': 'Mann-Whitney U test',
56
+ 't_test_independent': 't-test for independent samples',
57
+ 't_test_paired': 't-test for paired samples',
58
+ 't_test_single_sample': 'Single-sample t-test',
59
+ 'wilcoxon': 'Wilcoxon signed-rank test',
60
+ 'wilcoxon_single_sample': 'Wilcoxon signed-rank test for single sample',
61
+ }
62
+
63
+ match test_id:
64
+ case 'anova_1w_ordinary': stat, p_value = self.anova_1w_ordinary()
65
+ case 'anova_1w_rm': stat, p_value = self.anova_1w_rm()
66
+ case 'friedman': stat, p_value = self.friedman()
67
+ case 'kruskal_wallis': stat, p_value = self.kruskal_wallis()
68
+ case 'mann_whitney': stat, p_value = self.mann_whitney()
69
+ case 't_test_independent': stat, p_value = self.t_test_independent()
70
+ case 't_test_paired': stat, p_value = self.t_test_paired()
71
+ case 't_test_single_sample': stat, p_value = self.t_test_single_sample()
72
+ case 'wilcoxon': stat, p_value = self.wilcoxon()
73
+ case 'wilcoxon_single_sample': stat, p_value = self.wilcoxon_single_sample()
74
+
75
+ if test_id in self.test_ids_dependent:
76
+ self.paired = True
77
+ else:
78
+ self.paired = False
79
+
80
+ self.test_name = test_names_dict[test_id]
81
+ self.test_id = test_id
82
+ self.test_stat = stat
83
+ self.p_value = p_value
84
+
85
+ def anova_1w_ordinary(self):
86
+ stat, p_value = f_oneway(*self.data)
87
+ self.tails = 2
88
+ # if self.tails == 1 and p_value > 0.5:
89
+ # p_value /= 2
90
+ # if self.tails == 1:
91
+ # p_value /= 2
92
+
93
+ # if p_value < 0.05 and self.posthoc:
94
+ # data_flat = np.concatenate(self.data)
95
+ # self.posthoc_name = 'Tukey`s multiple comparisons'
96
+ # group_labels = np.concatenate(
97
+ # [[f"Group_{i+1}"] * len(group) for i, group in enumerate(self.data)])
98
+ # # Tukey's multiple comparisons
99
+ # tukey_result = pairwise_tukeyhsd(data_flat, group_labels)
100
+ # print(tukey_result)
101
+ return stat, p_value
102
+
103
+ def anova_1w_rm(self):
104
+ """
105
+ Perform repeated measures one-way ANOVA test.
106
+
107
+ Parameters:
108
+ data: list of lists, where each sublist represents repeated measures for a subject
109
+ """
110
+
111
+ df = self.matrix_to_dataframe(self.data)
112
+ res = AnovaRM(df, 'Value', 'Row', within=['Col']).fit()
113
+ stat = res.anova_table['F Value'][0]
114
+ p_value = res.anova_table['Pr > F'][0]
115
+
116
+ self.tails = 2
117
+ return stat, p_value
118
+
119
+ def friedman(self):
120
+ stat, p_value = friedmanchisquare(*self.data)
121
+ self.tails = 2
122
+ return stat, p_value
123
+
124
+ def kruskal_wallis(self):
125
+ stat, p_value = kruskal(*self.data)
126
+
127
+ # Perform Dunn's multiple comparisons if Kruskal-Wallis is significant
128
+ if p_value < 0.05 and self.posthoc:
129
+ self.posthoc_matrix = sp.posthoc_dunn(
130
+ self.data, p_adjust='bonferroni').values.tolist()
131
+ self.posthoc_name = 'Dunn`s multiple comparisons'
132
+ return stat, p_value
133
+
134
+ def mann_whitney(self):
135
+ stat, p_value = mannwhitneyu(
136
+ self.data[0], self.data[1], alternative='two-sided')
137
+ if self.tails == 1:
138
+ p_value /= 2
139
+ # alternative method of one-tailed calculation
140
+ # gives the same result:
141
+ # stat, p_value = mannwhitneyu(
142
+ # self.data[0], self.data[1], alternative='two-sided' if self.tails == 2 else 'less')
143
+ # if self.tails == 1 and p_value > 0.5:
144
+ # p_value = 1-p_value
145
+ return stat, p_value
146
+
147
+ def t_test_independent(self):
148
+ stat, p_value = ttest_ind(
149
+ self.data[0], self.data[1])
150
+ if self.tails == 1:
151
+ p_value /= 2
152
+ return stat, p_value
153
+
154
+ def t_test_paired(self):
155
+ stat, p_value = ttest_rel(
156
+ self.data[0], self.data[1])
157
+ if self.tails == 1:
158
+ p_value /= 2
159
+ return stat, p_value
160
+
161
+ def t_test_single_sample(self):
162
+ if self.popmean == None:
163
+ self.popmean = 0
164
+ self.AddWarning('no_pop_mean_set')
165
+ stat, p_value = ttest_1samp(self.data[0], self.popmean)
166
+ if self.tails == 1:
167
+ p_value /= 2
168
+ return stat, p_value
169
+
170
+ def wilcoxon(self):
171
+ stat, p_value = wilcoxon(self.data[0], self.data[1])
172
+ if self.tails == 1:
173
+ p_value /= 2
174
+ return stat, p_value
175
+
176
+ def wilcoxon_single_sample(self):
177
+ if self.popmean == None:
178
+ self.popmean = 0
179
+ self.AddWarning('no_pop_mean_set')
180
+ data = [i - self.popmean for i in self.data[0]]
181
+ stat, p_value = wilcoxon(data)
182
+ if self.tails == 1:
183
+ p_value /= 2
184
+ return stat, p_value
@@ -0,0 +1,106 @@
1
+
2
+
3
+ class TextFormatting():
4
+ '''
5
+ Text formatting mixin
6
+ '''
7
+
8
+ def autospace(self, elements_list, space, delimiter=' ') -> str:
9
+ output = ''
10
+ for i, element in enumerate(elements_list):
11
+ if i == len(elements_list):
12
+ output += element
13
+ else:
14
+ output += element + (space-len(element))*delimiter
15
+ return output
16
+
17
+ def print_groups(self, space=24, max_length=15):
18
+ self.log('')
19
+ # Get the number of groups (rows) and the maximum length of rows
20
+ data = self.data
21
+ num_groups = len(data)
22
+ group_longest = max(len(row) for row in data)
23
+
24
+ # Print the header
25
+ header = [f'Group {i+1}' for i in range(num_groups)]
26
+ line = [''*7]
27
+ self.log(self.autospace(header, space))
28
+ self.log(self.autospace(line, space))
29
+
30
+ # Print each column with a placeholder if longer than max_length
31
+ for i in range(group_longest):
32
+ row_values = []
33
+ all_values_empty = True
34
+ for row in data:
35
+ if len(row) > max_length:
36
+ if i < max_length:
37
+ row_values.append(str(row[i]))
38
+ all_values_empty = False
39
+ elif i == max_length:
40
+ row_values.append(f'[{len(row) - max_length} more]')
41
+ all_values_empty = False
42
+ else:
43
+ continue
44
+ else:
45
+ if i < len(row):
46
+ row_values.append(str(row[i]))
47
+ all_values_empty = False
48
+ else:
49
+ row_values.append('')
50
+ if all_values_empty:
51
+ break
52
+ self.log(self.autospace(row_values, space))
53
+
54
+ def print_results(self):
55
+ self.log('\n\nResults: \n')
56
+ for i in self.results:
57
+ shift = 27 - len(i)
58
+ if i == 'Warnings':
59
+ self.log(i, ':', ' ' * shift, len(self.results[i]))
60
+ elif i == 'Posthoc_Tests_Name':
61
+ self.log(i, ':', ' ' * shift,
62
+ self.results[i]) if self.results[i] != '' else 'N/A'
63
+ elif i == 'Posthoc_Matrix':
64
+ self.log(i, ':', ' ' * shift, '{0}x{0} matrix'.format(
65
+ len(self.results[i])) if self.results[i] else 'N/A')
66
+ elif (i == 'Samples'
67
+ or i == 'Posthoc_Matrix_bool'
68
+ or i == 'Posthoc_Matrix_printed'
69
+ or i == 'Posthoc_Matrix_stars'
70
+ ):
71
+ pass
72
+ else:
73
+ self.log(i, ':', ' ' * shift, self.results[i])
74
+
75
+ def make_p_value_printed(self, p) -> str:
76
+ if p is not None:
77
+ if p > 0.99:
78
+ return 'p>0.99'
79
+ elif p >= 0.01:
80
+ return f'p={p:.2g}'
81
+ elif p >= 0.001:
82
+ return f'p={p:.2g}'
83
+ elif p >= 0.0001:
84
+ return f'p={p:.1g}'
85
+ elif p < 0.0001:
86
+ return 'p<0.0001'
87
+ else:
88
+ return 'N/A'
89
+ return 'N/A'
90
+
91
+ def make_stars(self, p) -> int:
92
+ if p is not None:
93
+ if p < 0.0001:
94
+ return 4
95
+ if p < 0.001:
96
+ return 3
97
+ elif p < 0.01:
98
+ return 2
99
+ elif p < 0.05:
100
+ return 1
101
+ else:
102
+ return 0
103
+ return 0
104
+
105
+ def make_stars_printed(self, n) -> str:
106
+ return '*' * n if n else 'ns'
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: AutoStatLib
3
- Version: 0.2.2
3
+ Version: 0.2.6
4
4
  Summary: AutoStatLib - a simple statistical analysis tool
5
5
  Author: Stemonitis, SciWare LLC
6
6
  Author-email: konung-yaropolk <yaropolk1995@gmail.com>
@@ -509,15 +509,16 @@ License: GNU LESSER GENERAL PUBLIC LICENSE
509
509
 
510
510
  That's all there is to it!
511
511
 
512
- Project-URL: Homepage, https://github.com/konung-yaropolk/NPL
513
- Project-URL: Issues, https://github.com/konung-yaropolk/NPL/issues
512
+ Project-URL: Homepage, https://github.com/konung-yaropolk/AutoStatLib
513
+ Project-URL: Repository, https://github.com/konung-yaropolk/AutoStatLib.git
514
+ Project-URL: Issues, https://github.com/konung-yaropolk/AutoStatLib/issues
514
515
  Keywords: Science,Statistics
515
516
  Classifier: Programming Language :: Python
516
517
  Classifier: Programming Language :: Python :: 3
517
518
  Classifier: Programming Language :: Python :: 3.12
518
519
  Classifier: License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)
519
520
  Classifier: Operating System :: OS Independent
520
- Classifier: Development Status :: 2 - Pre-Alpha
521
+ Classifier: Development Status :: 4 - Beta
521
522
  Classifier: Intended Audience :: Developers
522
523
  Classifier: Intended Audience :: Science/Research
523
524
  Classifier: Natural Language :: English
@@ -531,6 +532,7 @@ License-File: LICENSE
531
532
  Requires-Dist: numpy
532
533
  Requires-Dist: scipy
533
534
  Requires-Dist: statsmodels
535
+ Requires-Dist: scikit-posthocs
534
536
  Requires-Dist: pandas
535
537
 
536
538
  # AutoStatLib - python library for automated statistical analysis
@@ -619,26 +621,30 @@ results = analysis.GetResult()
619
621
  The results dictionary keys with representing value types:
620
622
  ```
621
623
  {
622
- 'p-value': String
623
- 'Significance(p<0.05)': Boolean
624
- 'Stars_Printed': String
625
- 'Test_Name': String
626
- 'Groups_Compared': Integer
627
- 'Population_Mean': Float (taken from the input)
628
- 'Data_Normaly_Distributed': Boolean
629
- 'Parametric_Test_Applied': Boolean
630
- 'Paired_Test_Applied': Boolean
631
- 'Tails': Integer (taken from the input)
632
- 'p-value_exact': Float
633
- 'Stars': Integer
634
- 'Warnings': String
635
- 'Groups_N': List of integers
636
- 'Groups_Median': List of floats
637
- 'Groups_Mean': List of floats
638
- 'Groups_SD': List of floats
639
- 'Groups_SE': List of floats
640
- 'Samples': List of input values by groups
624
+ 'p-value' : String
625
+ 'Significance(p<0.05)' : Boolean
626
+ 'Stars_Printed' : String
627
+ 'Test_Name' : String
628
+ 'Groups_Compared' : Integer
629
+ 'Population_Mean' : Float (taken from the input)
630
+ 'Data_Normaly_Distributed' : Boolean
631
+ 'Parametric_Test_Applied' : Boolean
632
+ 'Paired_Test_Applied' : Boolean
633
+ 'Tails' : Integer (taken from the input)
634
+ 'p-value_exact' : Float
635
+ 'Stars' : Integer
636
+ 'Warnings' : String
637
+ 'Groups_N' : List of integers
638
+ 'Groups_Median' : List of floats
639
+ 'Groups_Mean' : List of floats
640
+ 'Groups_SD' : List of floats
641
+ 'Groups_SE' : List of floats
642
+ 'Samples' : List of input values by groups
641
643
  (taken from the input)
644
+ 'Posthoc_Matrix' : 2D List of floats
645
+ 'Posthoc_Matrix_bool' : 2D List of Boolean
646
+ 'Posthoc_Matrix_printed': 2D List of String
647
+ 'Posthoc_Matrix_stars': 2D List of String
642
648
  }
643
649
  ```
644
650
  If errors occurred, *GetResult()* returns an empty dictionary
@@ -653,7 +659,7 @@ If errors occured, *GetResult()* returns an empty dictionary
653
659
 
654
660
  ### TODO:
655
661
 
656
- -- Kruskal-Wallis test - add Dunn's multiple comparisons
662
+ -- Anova: posthocs
657
663
  -- Anova: add 2-way anova and 3-way anova
658
664
  -- one-way Anova: add repeated measures (for normal dependent values) with and without Geisser-Greenhouse correction
659
665
  -- one-way Anova: add Brown-Forsythe and Welch (for normal independent values with unequal SDs between groups)
@@ -666,10 +672,11 @@ If errors occured, *GetResult()* returns an empty dictionary
666
672
  -- add QQ plot
667
673
  -- n-sample tests: add onetail option
668
674
 
669
- ✅ done -- detailed normality test results
675
+ ✅ done -- detailed normality test results
676
+ ✅ done -- added posthoc: Kruskal-Wallis Dunn's multiple comparisons
670
677
 
671
678
 
672
- checked tests:
679
+ tests check:
673
680
  1-sample:
674
681
  --Wilcoxon 2,1 tails - ok
675
682
  --t-tests 2,1 tails -ok
@@ -681,6 +688,7 @@ checked tests:
681
688
 
682
689
  n-sample:
683
690
  --Kruskal-Wallis 2 tail - ok
691
+ --Dunn's multiple comparisons - ??
684
692
  --Friedman 2 tail - ok
685
693
  --one-way ANOVA 2 tail - ok
686
694
 
@@ -0,0 +1,13 @@
1
+ AutoStatLib/AutoStatLib.py,sha256=KJM2x-fChnxVinnCFsAKpoacKeoIJcJw_r8FYqPCljk,9677
2
+ AutoStatLib/__init__.py,sha256=0wHYnglzKRPqSHtZlfbMEA2Bj5rDR4LLaXbOrJi-sqM,101
3
+ AutoStatLib/__main__.py,sha256=ROKWensrxDh3Gl-yhexJ-BYFohDSh9y-CuMkaLpmnnQ,247
4
+ AutoStatLib/_version.py,sha256=dXJmKxrIARBAzN_ILPim1iDgRdZ6HKMR3FqtamGwNUk,53
5
+ AutoStatLib/helpers.py,sha256=d8P6_q706rjuc6N4WBbdOqNQFuAIjCHfmrhgJABFxqE,3646
6
+ AutoStatLib/normality_tests.py,sha256=TYeKpfpJRzOHvDZucObuZhPktjiZpSZwh381eJ8ENC4,2381
7
+ AutoStatLib/statistical_tests.py,sha256=xfHdTtN5Es_qoVMUwX8VFsl-FLpF3zd56S9ya7dPXVo,6566
8
+ AutoStatLib/text_formatting.py,sha256=rWDsrlZdquook7lUg8t2mb3az8nR12BDprxfy_NwE2o,3576
9
+ autostatlib-0.2.6.dist-info/LICENSE,sha256=IMF9i4xIpgCADf0U-V1cuf9HBmqWQd3qtI3FSuyW4zE,26526
10
+ autostatlib-0.2.6.dist-info/METADATA,sha256=BDNKvdfcHaPlSF2q3YuDpOPFK_K7_hKcDH2NgjdCYtQ,36872
11
+ autostatlib-0.2.6.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
12
+ autostatlib-0.2.6.dist-info/top_level.txt,sha256=BuHzVyE2andc7RwD_UPmDjLl9CUAyBH6WHZGjaIReUI,12
13
+ autostatlib-0.2.6.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.0)
2
+ Generator: setuptools (76.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,9 +0,0 @@
1
- AutoStatLib/AutoStatLib.py,sha256=lUDNdzH2NdsyGm1jgLvQ1b-PXIyo8SfMApEK4uOQxSg,23479
2
- AutoStatLib/__init__.py,sha256=0wHYnglzKRPqSHtZlfbMEA2Bj5rDR4LLaXbOrJi-sqM,101
3
- AutoStatLib/__main__.py,sha256=ROKWensrxDh3Gl-yhexJ-BYFohDSh9y-CuMkaLpmnnQ,247
4
- AutoStatLib/_version.py,sha256=WbLB15iApm4FvkoTxz3n4t20nHfs58LNdIBr1m1YbxU,53
5
- AutoStatLib-0.2.2.dist-info/LICENSE,sha256=IMF9i4xIpgCADf0U-V1cuf9HBmqWQd3qtI3FSuyW4zE,26526
6
- AutoStatLib-0.2.2.dist-info/METADATA,sha256=4Ro1Bo6FsklfwMo-G5N9C--n-7HJA4nMNns6qivu90k,36473
7
- AutoStatLib-0.2.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
8
- AutoStatLib-0.2.2.dist-info/top_level.txt,sha256=BuHzVyE2andc7RwD_UPmDjLl9CUAyBH6WHZGjaIReUI,12
9
- AutoStatLib-0.2.2.dist-info/RECORD,,