AutoStatLib 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of AutoStatLib might be problematic. Click here for more details.

@@ -0,0 +1,605 @@
1
+ import numpy as np
2
+ from statsmodels.stats.diagnostic import lilliefors
3
+ from scipy.stats import ttest_rel, ttest_ind, ttest_1samp, wilcoxon, mannwhitneyu, f_oneway, kruskal, friedmanchisquare, shapiro, kstest, anderson, normaltest
4
+
5
+
6
class __StatisticalTests():
    '''
    Statistical tests mixin

    Every public method runs one SciPy test on self.data (a list of groups,
    each group being a list of numbers) and records the outcome on the
    instance as test_name, test_id, paired, test_stat and p_value.

    The host class has to provide self.data, self.tails, self.popmean and
    self.AddWarning().
    '''

    def _record(self, name, test_id, paired, stat, p_value):
        '''Store the outcome of the test that has just been run.'''
        self.test_name = name
        self.test_id = test_id
        self.paired = paired
        self.test_stat = stat
        self.p_value = p_value

    def _apply_tails(self, p_value):
        '''Return the two-sided p_value, halved when self.tails == 1.'''
        return p_value / 2 if self.tails == 1 else p_value

    def _ensure_popmean(self):
        '''Use 0 as population mean (and emit a warning) when none was set.'''
        if self.popmean is None:
            self.popmean = 0
            self.AddWarning('no_pop_mean_set')

    def anova(self):
        '''One-way ANOVA over all groups; always reported as two-tailed.'''
        stat, p_value = f_oneway(*self.data)
        self.tails = 2
        self._record('ANOVA', 'anova', False, stat, p_value)

    def friedman_test(self):
        '''Friedman test over all groups; always reported as two-tailed.'''
        stat, p_value = friedmanchisquare(*self.data)
        self.tails = 2
        self._record('Friedman test', 'friedman', True, stat, p_value)

    def kruskal_wallis_test(self):
        '''Kruskal-Wallis test over all groups; always reported as two-tailed.'''
        stat, p_value = kruskal(*self.data)
        # The p-value is never halved here, so report two tails exactly as
        # anova() and friedman_test() do.
        self.tails = 2
        self._record('Kruskal-Wallis test', 'kruskal_wallis',
                     False, stat, p_value)

    def mann_whitney_u_test(self):
        '''Mann-Whitney U test on the first two groups.'''
        stat, p_value = mannwhitneyu(
            self.data[0], self.data[1], alternative='two-sided')
        self._record('Mann-Whitney U test', 'mann_whitney', False,
                     stat, self._apply_tails(p_value))

    def t_test_independend(self):
        '''t-test for two independent samples (first two groups).'''
        t_stat, t_p_value = ttest_ind(self.data[0], self.data[1])
        self._record('t-test for independend samples', 't_test_independend',
                     False, t_stat, self._apply_tails(t_p_value))

    def t_test_paired(self):
        '''t-test for two paired samples (first two groups).'''
        t_stat, t_p_value = ttest_rel(self.data[0], self.data[1])
        self._record('t-test for paired samples', 't_test_paired',
                     True, t_stat, self._apply_tails(t_p_value))

    def t_test_single_sample(self):
        '''Single-sample t-test of the first group against self.popmean.'''
        self._ensure_popmean()
        t_stat, t_p_value = ttest_1samp(self.data[0], self.popmean)
        self._record('Single-sample t-test', 't_test_single_sample',
                     False, t_stat, self._apply_tails(t_p_value))

    def wilcoxon_single_sample(self):
        '''
        Wilcoxon signed-rank test of the first group against self.popmean.

        The group is shifted by the population mean and tested against zero.
        '''
        self._ensure_popmean()
        shifted = [value - self.popmean for value in self.data[0]]
        w_stat, w_p_value = wilcoxon(shifted)
        # The one-tailed correction must act on w_p_value; the former code
        # halved an undefined name and raised whenever tails == 1.
        self._record('Wilcoxon signed-rank test for single sample',
                     'wilcoxon_single_sample', False,
                     w_stat, self._apply_tails(w_p_value))

    def wilcoxon(self):
        '''Wilcoxon signed-rank test on the first two (paired) groups.'''
        # Inside a method the bare name resolves to scipy.stats.wilcoxon,
        # not to this method.
        stat, p_value = wilcoxon(self.data[0], self.data[1])
        self._record('Wilcoxon signed-rank test', 'wilcoxon', True,
                     stat, self._apply_tails(p_value))
117
+
118
+
119
class __NormalityTests():
    '''
    Normality tests mixin

    see the article about minimum sample size for tests:
    Power comparisons of Shapiro-Wilk, Kolmogorov-Smirnov,
    Lilliefors and Anderson-Darling tests, Nornadiah Mohd Razali1, Yap Bee Wah1
    '''

    @staticmethod
    def _verdict(p_value, alpha=0.05):
        '''Map a p-value to True/False; None when it is missing or NaN.'''
        if p_value is None or p_value != p_value:
            return None
        return bool(p_value > alpha)

    def check_normality(self, data):
        '''
        Run four normality tests on one group of values.

        Returns a tuple (sw, lf, ad, ap) for the Shapiro-Wilk, Lilliefors,
        Anderson-Darling and D'Agostino-Pearson tests. An entry is True when
        the p-value is above 0.05, False when it is not, and None when the
        test was not evaluated (Anderson-Darling and D'Agostino-Pearson are
        only evaluated for groups of at least 20 values).

        The entries are plain Python bools because callers compare them
        with `is True` / `is False`.
        '''
        alpha = 0.05
        min_n_for_large_sample_tests = 20
        n = len(data)

        # Shapiro-Wilk test
        _, sw_p_value = shapiro(data)
        sw = bool(sw_p_value > alpha)

        # Lilliefors test
        _, lf_p_value = lilliefors(data, dist='norm')
        lf = bool(lf_p_value > alpha)

        ad = None
        ap = None
        # The two tests below were previously computed for every group and
        # their result thrown away when n < 20; skipping them avoids the
        # small-sample errors/warnings scipy raises for tiny groups.
        if n >= min_n_for_large_sample_tests:
            # Anderson-Darling test
            _, ad_p_value = self.anderson_get_p(data, dist='norm')
            ad = self._verdict(ad_p_value, alpha)

            # D'Agostino-Pearson test
            _, ap_p_value = normaltest(data)
            ap = self._verdict(ap_p_value, alpha)

        return (sw, lf, ad, ap)

    def anderson_get_p(self, data, dist='norm'):
        '''
        calculating p-value for Anderson-Darling test using the method described here:
        Computation of Probability Associated with Anderson-Darling Statistic
        Lorentz Jantschi and Sorana D. Bolboaca, 2018 - Mathematics

        Returns a tuple (statistic, p) where statistic is the value reported
        by scipy.stats.anderson and p is the approximated p-value (None when
        the adjusted statistic is NaN).
        '''
        n = len(data)

        # Attribute access works no matter how many fields the scipy result
        # object unpacks to.
        ad = anderson(data, dist=dist).statistic

        # adjust ad_stat for small sample sizes:
        s = ad * (1 + 0.75 / n + 2.25 / (n ** 2))

        if s >= 0.6:
            # The quadratic term makes this branch grow again for very large
            # s (and exceed 1 beyond s ~ 307), which would label grossly
            # non-normal large samples as normal. Hold s at the parabola's
            # vertex so the p-value stays at its minimum instead.
            s_capped = min(s, 5.709 / (2 * 0.0186))
            p = np.exp(1.2937 - 5.709 * s_capped + 0.0186 * s_capped ** 2)
        elif s > 0.34:
            p = np.exp(0.9177 - 4.279 * s - 1.38 * s ** 2)
        elif s > 0.2:
            p = 1 - np.exp(-8.318 + 42.796 * s - 59.938 * s ** 2)
        elif s <= 0.2:
            p = 1 - np.exp(-13.436 + 101.14 * s - 223.73 * s ** 2)
        else:
            # only reachable when s is NaN
            p = None

        return ad, p
198
+
199
+
200
class __TextFormatting():
    '''
    Text formatting mixin

    Builds the textual summary (self.summary) and the results dictionary.
    The host class has to provide the attributes read below: data, p_value,
    results, summary, warnings, warning_ids_all, test_name, test_id,
    test_ids_parametric, n_groups, popmean, parametric, paired and tails.
    '''

    def autospace(self, elements_list, space, delimiter=' ') -> str:
        '''
        Join the elements into one line of fixed-width columns.

        Every element except the last one is right-padded with `delimiter`
        up to `space` characters; elements longer than `space` are kept
        unpadded.
        '''
        last_index = len(elements_list) - 1
        parts = []
        for i, element in enumerate(elements_list):
            if i == last_index:
                # The former check (i == len(...)) could never be true, so
                # the last column was always padded as well.
                parts.append(element)
            else:
                parts.append(element + (space - len(element)) * delimiter)
        return ''.join(parts)

    def print_groups(self, space=24, max_length=15):
        '''
        Log the groups of self.data side by side, one group per column.

        Groups longer than `max_length` show their first `max_length` values
        followed by a "[N more]" marker.
        '''
        self.log('')
        # Get the number of groups (rows) and the maximum length of rows
        data = self.data
        num_groups = len(data)
        group_longest = max((len(row) for row in data), default=0)

        # Print the header followed by a blank separator line
        header = [f'Group {i+1}' for i in range(num_groups)]
        line = ['']
        self.log(self.autospace(header, space))
        self.log(self.autospace(line, space))

        # Print each column with a placeholder if longer than max_length
        for i in range(group_longest):
            row_values = []
            all_values_empty = True
            for row in data:
                if i < min(len(row), max_length):
                    row_values.append(str(row[i]))
                    all_values_empty = False
                elif i == max_length and len(row) > max_length:
                    row_values.append(f'[{len(row) - max_length} more]')
                    all_values_empty = False
                else:
                    # group exhausted (or already truncated): empty cell
                    row_values.append('')
            if all_values_empty:
                break
            self.log(self.autospace(row_values, space))

    def _p_value_as_float(self):
        '''Return self.p_value as a Python float, or None when it is unset.'''
        if self.p_value is None:
            return None
        return float(self.p_value)

    def make_stars(self) -> int:
        '''Return the number of significance stars (0-4) for self.p_value.'''
        p = self._p_value_as_float()
        if p is None:
            return 0
        for threshold, stars in ((0.0001, 4), (0.001, 3), (0.01, 2), (0.05, 1)):
            if p < threshold:
                return stars
        return 0

    def make_p_value_printed(self) -> str:
        '''Return self.p_value formatted for display, 'N/A' if unavailable.'''
        p = self._p_value_as_float()
        if p is None:
            return 'N/A'
        if p > 0.99:
            return 'p>0.99'
        if p >= 0.001:
            return f'p={p:.2g}'
        if p >= 0.0001:
            return f'p={p:.1g}'
        if p < 0.0001:
            return 'p<0.0001'
        # NaN fails every comparison above
        return 'N/A'

    def print_results(self):
        '''Log self.results line by line (warning count only, no samples).'''
        self.log('\n\nResults: \n')
        for key in self.results:
            shift = 27 - len(key)
            if key == 'Warnings':
                self.log(key, ':', ' ' * shift, len(self.results[key]))
            elif key == 'Samples':
                continue
            else:
                self.log(key, ':', ' ' * shift, self.results[key])

    def create_results_dict(self) -> dict:
        '''
        Build the results dictionary for the test that has just been run.

        Also sets self.stars_int and self.stars_str.
        '''
        self.stars_int = self.make_stars()
        self.stars_str = '*' * self.stars_int if self.stars_int else 'ns'

        p = self._p_value_as_float()
        groups = self.data
        # np.std is used with its default ddof=0 (population SD)
        groups_sd = [np.std(group).item() for group in groups]

        return {
            'p-value': self.make_p_value_printed(),
            'Significance(p<0.05)': p is not None and p < 0.05,
            'Stars_Printed': self.stars_str,
            'Test_Name': self.test_name,
            'Groups_Compared': self.n_groups,
            'Population_Mean': self.popmean if self.n_groups == 1 else 'N/A',
            'Data_Normaly_Distributed': self.parametric,
            'Parametric_Test_Applied': self.test_id in self.test_ids_parametric,
            'Paired_Test_Applied': self.paired,
            'Tails': self.tails,
            'p-value_exact': p,
            'Stars': self.stars_int,
            # 'Stat_Value': self.test_stat.item(),
            'Warnings': self.warnings,
            'Groups_N': [len(group) for group in groups],
            'Groups_Median': [np.median(group).item() for group in groups],
            'Groups_Mean': [np.mean(group).item() for group in groups],
            'Groups_SD': groups_sd,
            # standard error uses the size of each group, not the number
            # of groups
            'Groups_SE': [sd / np.sqrt(len(group)).item()
                          for sd, group in zip(groups_sd, groups)],
            # actually returns list of lists of numpy dtypes of float64, next make it return regular floats:
            'Samples': self.data,
        }

    def log(self, *args, **kwargs):
        '''Append the space-joined arguments to self.summary as a new line.'''
        message = ' '.join(map(str, args))
        # print(message, **kwargs)
        self.summary += '\n' + message

    def AddWarning(self, warning_id):
        '''Log the warning text for warning_id and remember it in self.warnings.'''
        message = self.warning_ids_all[warning_id]
        self.log(message)
        self.warnings.append(message)
332
+
333
+
334
class __InputFormatting():
    '''
    Input formatting mixin
    '''

    def floatify_recursive(self, data):
        '''
        Convert nested input into nested lists of numpy float64 values.

        Lists, tuples and numpy arrays are processed recursively and always
        come back as lists. Values that cannot be used are dropped:
          - None (an empty cell) is dropped silently;
          - values that cannot be converted to a number, and NaN, are
            dropped and self.warning_flag_non_numeric_data is set.

        A scalar input returns the converted value, or None when dropped.
        '''
        if isinstance(data, np.ndarray):
            data = data.tolist()

        if isinstance(data, (list, tuple)):
            # Recursively process sublists and filter out dropped values
            converted = (self.floatify_recursive(item) for item in data)
            return [item for item in converted if item is not None]

        # np.float64(None) silently yields NaN, so handle None explicitly
        if data is None:
            return None

        try:
            # Try to convert the item to float
            value = np.float64(data)
        except (ValueError, TypeError):
            # If conversion fails, replace with None
            self.warning_flag_non_numeric_data = True
            return None

        # NaN would turn every downstream statistic into NaN
        if np.isnan(value):
            self.warning_flag_non_numeric_data = True
            return None
        return value
348
+
349
+
350
class StatisticalAnalysis(__StatisticalTests, __NormalityTests, __TextFormatting, __InputFormatting):
    '''
    The main class

    Takes a list of groups (each group a list of values), checks every group
    for normality, runs either an automatically selected or a manually
    chosen statistical test and collects a text summary (GetSummary) and a
    results dictionary (GetResult).

    Parameters:
        groups_list: list of groups of values
        paired:      True when the groups are paired measurements
        tails:       1 or 2
        popmean:     population mean for single-sample tests
        verbose:     print the summary after every successful run
    '''

    def __init__(self,
                 groups_list,
                 paired=False,
                 tails=2,
                 popmean=None,
                 verbose=True):
        self.results = None
        self.error = False
        self.groups_list = groups_list
        self.paired = paired
        self.tails = tails
        self.popmean = popmean
        self.verbose = verbose
        self.n_groups = len(self.groups_list)
        self.warning_flag_non_numeric_data = False
        self.summary = ''
        self.test_ids_parametric = ['anova',
                                    't_test_independend',
                                    't_test_paired',
                                    't_test_single_sample',]
        self.test_ids_all = [  # in aplhabetical order
            'anova',
            'friedman',
            'kruskal_wallis',
            'mann_whitney',
            't_test_independend',
            't_test_paired',
            't_test_single_sample',
            'wilcoxon',
            'wilcoxon_single_sample',
        ]
        self.warning_ids_all = {
            'not-numeric': '\nWarning: Non-numeric data was found in input and ignored.\n Make sure the input data is correct to get the correct results\n',
            'param_test_with_non-normal_data': '\nWarning: Parametric test was manualy chosen for Not-Normaly distributed data.\n The results might be skewed. \n Please, run non-parametric test or preform automatic test selection.\n',
            'non-param_test_with_normal_data': '\nWarning: Non-Parametric test was manualy chosen for Normaly distributed data.\n The results might be skewed. \n Please, run parametric test or preform automatic test selection.\n',
            'no_pop_mean_set': '\nWarning: No Population Mean was set up for single-sample test, used default 0 value.\n The results might be skewed. \n Please, set the Population Mean and run the test again.\n',
        }

    def _input_error(self, test):
        '''
        Validate the user input for the requested test.

        Returns the message of the first failed check, or None when the
        input is acceptable. Explicit checks are used instead of assert
        statements because asserts are removed when Python runs with -O.
        '''
        data = self.data
        # data[0] is only evaluated when data is non-empty
        same_length = all(len(group) == len(data[0]) for group in data)
        single_group_tests = ('t_test_single_sample', 'wilcoxon_single_sample')
        two_group_tests = ('wilcoxon', 't_test_independend',
                           't_test_paired', 'mann_whitney')

        checks = (
            (self.tails not in [1, 2],
             'Tails parameter can be 1 or 2 only'),
            (test != 'auto' and test not in self.test_ids_all,
             'Wrong test id choosen, ensure you called correct function'),
            (self.n_groups < 1,
             'At least one group of data must be given'),
            (self.n_groups > 1 and test in single_group_tests,
             'Only one group of data must be given for single-group tests'),
            (any(len(group) <= 2 for group in data),
             'Each group must contain at least three values'),
            (bool(self.paired) and not same_length,
             'Paired groups must be the same length'),
            (test == 'friedman' and not same_length,
             'Paired groups must be the same length for Friedman Chi Square test'),
            (test == 't_test_paired' and not same_length,
             'Paired groups must be the same length for Paired t-test'),
            (test == 'wilcoxon' and not same_length,
             'Paired groups must be the same length for Wilcoxon signed-rank test'),
            (test == 'friedman' and self.n_groups < 3,
             'At least three groups of data must be given for 3-groups tests'),
            (test in ('anova', 'kruskal_wallis') and self.n_groups < 2,
             'At least two groups of data must be given for ANOVA or Kruskal Wallis tests'),
            (test in two_group_tests and self.n_groups != 2,
             'Only two groups of data must be given for 2-groups tests'),
        )
        for failed, message in checks:
            if failed:
                return message
        return None

    def __run_test(self, test='auto'):
        '''
        Clean the input, check normality, run the requested test and build
        the results.

        test is one of self.test_ids_all or 'auto'. On invalid input the
        error is logged, self.error is set, the summary is printed and
        self.results stays None.
        '''
        # reset values from previous tests
        self.results = None
        self.error = False
        self.warnings = []
        self.normals = []
        self.test_name = None
        self.test_id = None
        self.test_stat = None
        self.p_value = None
        self.warning_flag_non_numeric_data = False

        self.log('\n' + '-'*67)
        self.log('Statistical analysis initiated for data in {} groups\n'.format(
            len(self.groups_list)))

        # adjusting input data type
        self.data = self.floatify_recursive(self.groups_list)
        if self.warning_flag_non_numeric_data:
            self.AddWarning('not-numeric')

        # delete the empty cols from input
        self.data = [col for col in self.data if any(
            x is not None for x in col)]
        # Groups may have been dropped above; everything below (validation,
        # automatic selection, results) must see the real group count.
        self.n_groups = len(self.data)

        # User input validation block
        error = self._input_error(test)
        if error is not None:
            self.log('\nTest :', test)
            self.log('Error :', error)
            self.log('-'*67 + '\n')
            self.error = True
            print(self.summary)
            return

        # Print the data
        self.print_groups()

        # Normality tests
        self.log(
            '\n\nThe group is assumed to be normally distributed if at least one')
        self.log(
            'normality test result is positive. Normality checked by tests:')
        self.log('Shapiro-Wilk, Lilliefors, Anderson-Darling, D\'Agostino-Pearson')
        self.log(
            '[+] -positive, [-] -negative, [ ] -too small group for the test\n')
        self.log(' Test : SW LF AD AP ')
        for i, data in enumerate(self.data):
            poll = self.check_normality(data)
            isnormal = any(poll)
            poll_print = tuple(
                '+' if x is True else '-' if x is False else ' ' if x is None else 'e' for x in poll)
            self.normals.append(isnormal)
            self.log(
                f' Group {i+1}: {poll_print[0]} {poll_print[1]} {poll_print[2]} {poll_print[3]} so disrtibution seems {"normal" if isnormal else "not normal"}')
        self.parametric = all(self.normals)

        # print test choosen
        self.log('\n\nInput:\n')
        self.log('Data Normaly Distributed: ', self.parametric)
        self.log('Paired Groups: ', self.paired)
        self.log('Groups: ', self.n_groups)
        self.log('Test chosen by user: ', test)

        # Wrong test Warnings
        if test != 'auto':
            parametric_test = test in self.test_ids_parametric
            if parametric_test and not self.parametric:
                self.AddWarning('param_test_with_non-normal_data')
            if not parametric_test and self.parametric:
                self.AddWarning('non-param_test_with_normal_data')

        runners = {
            'anova': self.anova,
            'friedman': self.friedman_test,
            'kruskal_wallis': self.kruskal_wallis_test,
            'mann_whitney': self.mann_whitney_u_test,
            't_test_independend': self.t_test_independend,
            't_test_paired': self.t_test_paired,
            't_test_single_sample': self.t_test_single_sample,
            'wilcoxon': self.wilcoxon,
            'wilcoxon_single_sample': self.wilcoxon_single_sample,
        }
        if test == 'auto':
            self.log('Automatic test selection preformed.')
            self.__auto()
        else:
            runners[test]()

        # print the results
        self.results = self.create_results_dict()
        self.print_results()
        self.log(
            '\n\nResults above are accessible as a dictionary via GetResult() method')
        self.log('-'*67 + '\n')

        # print the results to console:
        if self.verbose:
            print(self.summary)

    def __auto(self):
        '''
        Pick and run a test from the number of groups, the paired flag and
        the outcome of the normality checks.
        '''
        if self.n_groups == 1:
            if self.parametric:
                return self.t_test_single_sample()
            return self.wilcoxon_single_sample()

        if self.n_groups == 2:
            if self.paired:
                if self.parametric:
                    return self.t_test_paired()
                return self.wilcoxon()
            if self.parametric:
                return self.t_test_independend()
            return self.mann_whitney_u_test()

        # three or more groups
        if self.paired:
            return self.friedman_test()
        if self.parametric:
            return self.anova()
        return self.kruskal_wallis_test()

    # public methods:
    def RunAuto(self):
        '''Run the automatically selected test.'''
        self.__run_test(test='auto')

    def RunManual(self, test):
        '''Run the test given by its id (one of self.test_ids_all).'''
        self.__run_test(test)

    def RunAnova(self):
        self.__run_test(test='anova')

    def RunFriedman(self):
        self.__run_test(test='friedman')

    def RunKruskalWallis(self):
        self.__run_test(test='kruskal_wallis')

    def RunMannWhitney(self):
        self.__run_test(test='mann_whitney')

    def RunTtest(self):
        self.__run_test(test='t_test_independend')

    def RunTtestPaired(self):
        self.__run_test(test='t_test_paired')

    def RunTtestSingleSample(self):
        self.__run_test(test='t_test_single_sample')

    def RunWilcoxonSingleSample(self):
        self.__run_test(test='wilcoxon_single_sample')

    def RunWilcoxon(self):
        self.__run_test(test='wilcoxon')

    def GetResult(self):
        '''
        Return the results dictionary, running the automatic test first
        when nothing has been run yet. Returns an empty dict (and prints a
        note) when the last run ended with an error.
        '''
        if not self.results and not self.error:
            self.__run_test(test='auto')
        if not self.results:
            print('Error occured, no results to output')
            return {}
        return self.results

    def GetSummary(self):
        '''
        Return the accumulated text summary, running the automatic test
        first when nothing has been run yet.
        '''
        if not self.results and not self.error:
            self.__run_test(test='auto')
        return self.summary

    def PrintSummary(self):
        '''Print the accumulated text summary.'''
        print(self.summary)
602
+
603
+
604
# The module only provides importable classes; when executed directly it
# just tells the user so.
if __name__ == '__main__':
    notice = '\nThis package works as an imported module only\n'
    print(notice)
@@ -0,0 +1,2 @@
1
+ from .AutoStatLib import StatisticalAnalysis
2
+ from ._version import __version__
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env python
2
+ from .AutoStatLib import StatisticalAnalysis
3
+ from ._version import __version__
4
+
5
# Running the package with `python -m` has nothing to execute; print a
# short notice instead.
if __name__ == '__main__':
    notice = '\nThis package works as an imported module only\n'
    print(notice)
@@ -0,0 +1,2 @@
1
+ # AutoStatLib package version:
2
+ __version__ = "0.1.6"