statslibx-0.1.5-py3-none-any.whl → statslibx-0.1.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- statslibx/__init__.py +2 -2
- statslibx/descriptive.py +235 -148
- statslibx/inferential.py +100 -72
- statslibx/utils.py +427 -60
- {statslibx-0.1.5.dist-info → statslibx-0.1.6.dist-info}/METADATA +1 -1
- {statslibx-0.1.5.dist-info → statslibx-0.1.6.dist-info}/RECORD +8 -8
- {statslibx-0.1.5.dist-info → statslibx-0.1.6.dist-info}/WHEEL +0 -0
- {statslibx-0.1.5.dist-info → statslibx-0.1.6.dist-info}/top_level.txt +0 -0
statslibx/inferential.py
CHANGED
|
@@ -12,9 +12,16 @@ class InferentialStats:
|
|
|
12
12
|
"""
|
|
13
13
|
|
|
14
14
|
def __init__(self, data: Union[pd.DataFrame, np.ndarray],
|
|
15
|
-
|
|
15
|
+
backend: Literal['pandas', 'polars'] = 'pandas'):
|
|
16
16
|
"""
|
|
17
17
|
Inicializar con DataFrame o array numpy
|
|
18
|
+
|
|
19
|
+
Parameters:
|
|
20
|
+
-----------
|
|
21
|
+
data : DataFrame o ndarray
|
|
22
|
+
Datos a analizar
|
|
23
|
+
backend : str
|
|
24
|
+
'pandas' o 'polars' para procesamiento
|
|
18
25
|
"""
|
|
19
26
|
|
|
20
27
|
if isinstance(data, str) and os.path.exists(data):
|
|
@@ -67,7 +74,7 @@ class InferentialStats:
|
|
|
67
74
|
# ============= INTERVALOS DE CONFIANZA =============
|
|
68
75
|
|
|
69
76
|
def confidence_interval(self, column: str, confidence: float = 0.95,
|
|
70
|
-
|
|
77
|
+
statistic: Literal['mean', 'median', 'proportion'] = 'mean') -> tuple:
|
|
71
78
|
"""
|
|
72
79
|
Intervalo de confianza para diferentes estadísticos
|
|
73
80
|
|
|
@@ -120,8 +127,8 @@ class InferentialStats:
|
|
|
120
127
|
# ============= PRUEBAS DE HIPÓTESIS =============
|
|
121
128
|
|
|
122
129
|
def t_test_1sample(self, column: str, popmean: float = None,
|
|
123
|
-
|
|
124
|
-
|
|
130
|
+
popmedian: float = None,
|
|
131
|
+
alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
|
|
125
132
|
"""
|
|
126
133
|
Prueba t de una muestra (para media o mediana)
|
|
127
134
|
|
|
@@ -176,8 +183,8 @@ class InferentialStats:
|
|
|
176
183
|
raise ValueError("Debe especificar popmean o popmedian")
|
|
177
184
|
|
|
178
185
|
def t_test_2sample(self, column1: str, column2: str,
|
|
179
|
-
|
|
180
|
-
|
|
186
|
+
equal_var: bool = True,
|
|
187
|
+
alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
|
|
181
188
|
"""
|
|
182
189
|
Prueba t de dos muestras independientes
|
|
183
190
|
|
|
@@ -211,9 +218,16 @@ class InferentialStats:
|
|
|
211
218
|
)
|
|
212
219
|
|
|
213
220
|
def t_test_paired(self, column1: str, column2: str,
|
|
214
|
-
|
|
221
|
+
alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
|
|
215
222
|
"""
|
|
216
223
|
Prueba t pareada
|
|
224
|
+
|
|
225
|
+
Parameters:
|
|
226
|
+
-----------
|
|
227
|
+
column1, column2:
|
|
228
|
+
Datos a analizar
|
|
229
|
+
alternative:
|
|
230
|
+
"two-sided", "less" o "greater"
|
|
217
231
|
"""
|
|
218
232
|
from scipy import stats
|
|
219
233
|
|
|
@@ -231,7 +245,7 @@ class InferentialStats:
|
|
|
231
245
|
)
|
|
232
246
|
|
|
233
247
|
def mann_whitney_test(self, column1: str, column2: str,
|
|
234
|
-
|
|
248
|
+
alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
|
|
235
249
|
"""
|
|
236
250
|
Prueba de Mann-Whitney U (alternativa no paramétrica al t-test)
|
|
237
251
|
|
|
@@ -339,8 +353,8 @@ class InferentialStats:
|
|
|
339
353
|
)
|
|
340
354
|
|
|
341
355
|
def normality_test(self, column: str,
|
|
342
|
-
|
|
343
|
-
|
|
356
|
+
method: Literal['shapiro', 'ks', 'anderson', 'jarque_bera', 'all'] = 'shapiro',
|
|
357
|
+
test_statistic: Literal['mean', 'median', 'mode'] = 'mean') -> Union['TestResult', dict]:
|
|
344
358
|
"""
|
|
345
359
|
Prueba de normalidad con múltiples métodos y estadísticos
|
|
346
360
|
|
|
@@ -495,74 +509,88 @@ class InferentialStats:
|
|
|
495
509
|
column1: str = None,
|
|
496
510
|
column2: str = None,
|
|
497
511
|
alpha: float = 0.05,
|
|
498
|
-
homoscedasticity: Literal["levene", "bartlett", "var_test"] = "levene"
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
if column1 is None:
|
|
504
|
-
raise ValueError("Debes especificar 'column1'.")
|
|
505
|
-
|
|
506
|
-
x = data[column1].dropna()
|
|
507
|
-
|
|
508
|
-
if method in ["difference_mean", "variance"] and column2 is None:
|
|
509
|
-
raise ValueError("Para este método debes pasar 'column2'.")
|
|
510
|
-
|
|
511
|
-
y = data[column2].dropna() if column2 else None
|
|
512
|
-
|
|
513
|
-
# --- homoscedasticity test ---
|
|
514
|
-
homo_result = None
|
|
515
|
-
if method in ["difference_mean", "variance"]:
|
|
516
|
-
homo_result = self._homoscedasticity_test(x, y, homoscedasticity)
|
|
517
|
-
|
|
518
|
-
# --- MAIN HYPOTHESIS TESTS ---
|
|
519
|
-
if method == "mean":
|
|
520
|
-
# One-sample t-test
|
|
521
|
-
t_stat, p_value = stats.ttest_1samp(x, popmean=np.mean(x))
|
|
522
|
-
test_name = "One-sample t-test"
|
|
523
|
-
|
|
524
|
-
elif method == "difference_mean":
|
|
525
|
-
# Two-sample t-test
|
|
526
|
-
equal_var = homo_result["equal_var"]
|
|
527
|
-
t_stat, p_value = stats.ttest_ind(x, y, equal_var=equal_var)
|
|
528
|
-
test_name = "Two-sample t-test"
|
|
529
|
-
|
|
530
|
-
elif method == "proportion":
|
|
531
|
-
# Proportion test (z-test)
|
|
532
|
-
p_hat = np.mean(x)
|
|
533
|
-
n = len(x)
|
|
534
|
-
z_stat = (p_hat - 0.5) / np.sqrt(0.5 * 0.5 / n)
|
|
535
|
-
p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))
|
|
536
|
-
t_stat = z_stat
|
|
537
|
-
test_name = "Proportion Z-test"
|
|
538
|
-
|
|
539
|
-
elif method == "variance":
|
|
540
|
-
# Classic F-test
|
|
541
|
-
var_x = np.var(x, ddof=1)
|
|
542
|
-
var_y = np.var(y, ddof=1)
|
|
543
|
-
F = var_x / var_y
|
|
544
|
-
dfn = len(x) - 1
|
|
545
|
-
dfd = len(y) - 1
|
|
546
|
-
|
|
547
|
-
p_value = 2 * min(stats.f.cdf(F, dfn, dfd), 1 - stats.f.cdf(F, dfn, dfd))
|
|
548
|
-
t_stat = F
|
|
549
|
-
test_name = "Variance F-test"
|
|
512
|
+
homoscedasticity: Literal["levene", "bartlett", "var_test"] = "levene") -> Dict[str, Any]:
|
|
513
|
+
|
|
514
|
+
"""
|
|
515
|
+
Test de Hipotesis
|
|
550
516
|
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
517
|
+
Parameters:
|
|
518
|
+
-----------
|
|
519
|
+
method : str
|
|
520
|
+
'mean', 'difference_mean', 'proportion' o 'variance'
|
|
521
|
+
column1, column2 : str
|
|
522
|
+
Columnas numéricas a comparar
|
|
523
|
+
alpha : float
|
|
524
|
+
Nivel de significancia (default 0.05)
|
|
525
|
+
homoscedasticity : str
|
|
526
|
+
Método de homocedasticidad
|
|
527
|
+
'levene', 'bartlett' o 'var_test'
|
|
528
|
+
"""
|
|
529
|
+
|
|
530
|
+
data = self.data
|
|
531
|
+
|
|
532
|
+
if column1 is None:
|
|
533
|
+
raise ValueError("Debes especificar 'column1'.")
|
|
534
|
+
|
|
535
|
+
x = data[column1].dropna()
|
|
536
|
+
|
|
537
|
+
if method in ["difference_mean", "variance"] and column2 is None:
|
|
538
|
+
raise ValueError("Para este método debes pasar 'column2'.")
|
|
539
|
+
|
|
540
|
+
y = data[column2].dropna() if column2 else None
|
|
541
|
+
|
|
542
|
+
# --- homoscedasticity test ---
|
|
543
|
+
homo_result = None
|
|
544
|
+
if method in ["difference_mean", "variance"]:
|
|
545
|
+
homo_result = self._homoscedasticity_test(x, y, homoscedasticity)
|
|
546
|
+
|
|
547
|
+
# --- MAIN HYPOTHESIS TESTS ---
|
|
548
|
+
if method == "mean":
|
|
549
|
+
# One-sample t-test
|
|
550
|
+
t_stat, p_value = stats.ttest_1samp(x, popmean=np.mean(x))
|
|
551
|
+
test_name = "One-sample t-test"
|
|
552
|
+
|
|
553
|
+
elif method == "difference_mean":
|
|
554
|
+
# Two-sample t-test
|
|
555
|
+
equal_var = homo_result["equal_var"]
|
|
556
|
+
t_stat, p_value = stats.ttest_ind(x, y, equal_var=equal_var)
|
|
557
|
+
test_name = "Two-sample t-test"
|
|
558
|
+
|
|
559
|
+
elif method == "proportion":
|
|
560
|
+
# Proportion test (z-test)
|
|
561
|
+
p_hat = np.mean(x)
|
|
562
|
+
n = len(x)
|
|
563
|
+
z_stat = (p_hat - 0.5) / np.sqrt(0.5 * 0.5 / n)
|
|
564
|
+
p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))
|
|
565
|
+
t_stat = z_stat
|
|
566
|
+
test_name = "Proportion Z-test"
|
|
567
|
+
|
|
568
|
+
elif method == "variance":
|
|
569
|
+
# Classic F-test
|
|
570
|
+
var_x = np.var(x, ddof=1)
|
|
571
|
+
var_y = np.var(y, ddof=1)
|
|
572
|
+
F = var_x / var_y
|
|
573
|
+
dfn = len(x) - 1
|
|
574
|
+
dfd = len(y) - 1
|
|
575
|
+
|
|
576
|
+
p_value = 2 * min(stats.f.cdf(F, dfn, dfd), 1 - stats.f.cdf(F, dfn, dfd))
|
|
577
|
+
t_stat = F
|
|
578
|
+
test_name = "Variance F-test"
|
|
579
|
+
|
|
580
|
+
return {
|
|
581
|
+
"test": test_name,
|
|
582
|
+
"statistic": t_stat,
|
|
583
|
+
"p_value": p_value,
|
|
584
|
+
"alpha": alpha,
|
|
585
|
+
"reject_H0": p_value < alpha,
|
|
586
|
+
"homoscedasticity_test": homo_result
|
|
587
|
+
}
|
|
559
588
|
|
|
560
589
|
def _homoscedasticity_test(
|
|
561
590
|
self,
|
|
562
591
|
x,
|
|
563
592
|
y,
|
|
564
|
-
method: Literal["levene", "bartlett", "var_test"] = "levene"
|
|
565
|
-
) -> Dict[str, Any]:
|
|
593
|
+
method: Literal["levene", "bartlett", "var_test"] = "levene") -> Dict[str, Any]:
|
|
566
594
|
|
|
567
595
|
if method == "levene":
|
|
568
596
|
stat, p = stats.levene(x, y)
|