statslibx 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- statslibx/__init__.py +2 -2
- statslibx/datasets/__init__.py +1 -0
- statslibx/datasets/course_completion.csv +100001 -0
- statslibx/descriptive.py +274 -148
- statslibx/inferential.py +139 -72
- statslibx/utils.py +288 -82
- {statslibx-0.1.4.dist-info → statslibx-0.1.6.dist-info}/METADATA +1 -1
- statslibx-0.1.6.dist-info/RECORD +14 -0
- statslibx-0.1.4.dist-info/RECORD +0 -13
- {statslibx-0.1.4.dist-info → statslibx-0.1.6.dist-info}/WHEEL +0 -0
- {statslibx-0.1.4.dist-info → statslibx-0.1.6.dist-info}/top_level.txt +0 -0
statslibx/inferential.py
CHANGED
|
@@ -4,6 +4,7 @@ import pandas as pd
|
|
|
4
4
|
from typing import Optional, Union, Literal, List, Dict, Any
|
|
5
5
|
from datetime import datetime
|
|
6
6
|
from scipy import stats
|
|
7
|
+
import os
|
|
7
8
|
|
|
8
9
|
class InferentialStats:
|
|
9
10
|
"""
|
|
@@ -11,10 +12,21 @@ class InferentialStats:
|
|
|
11
12
|
"""
|
|
12
13
|
|
|
13
14
|
def __init__(self, data: Union[pd.DataFrame, np.ndarray],
|
|
14
|
-
|
|
15
|
+
backend: Literal['pandas', 'polars'] = 'pandas'):
|
|
15
16
|
"""
|
|
16
17
|
Inicializar con DataFrame o array numpy
|
|
18
|
+
|
|
19
|
+
Parameters:
|
|
20
|
+
-----------
|
|
21
|
+
data : DataFrame o ndarray
|
|
22
|
+
Datos a analizar
|
|
23
|
+
backend : str
|
|
24
|
+
'pandas' o 'polars' para procesamiento
|
|
17
25
|
"""
|
|
26
|
+
|
|
27
|
+
if isinstance(data, str) and os.path.exists(data):
|
|
28
|
+
data = InferentialStats.from_file(data).data
|
|
29
|
+
|
|
18
30
|
if isinstance(data, np.ndarray):
|
|
19
31
|
if data.ndim == 1:
|
|
20
32
|
data = pd.DataFrame({'var': data})
|
|
@@ -24,11 +36,45 @@ class InferentialStats:
|
|
|
24
36
|
self.data = data
|
|
25
37
|
self.backend = backend
|
|
26
38
|
self._numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
|
|
39
|
+
|
|
40
|
+
@classmethod
def from_file(cls, path: str):
    """
    Load a tabular file and return an instance built from its contents.

    The pandas reader is chosen from the file extension, compared
    case-insensitively:

    - ``.csv``            -> ``pd.read_csv``
    - ``.xlsx``, ``.xls`` -> ``pd.read_excel``
    - ``.txt``, ``.tsv``  -> ``pd.read_table``
    - ``.json``           -> ``pd.read_json``
    - ``.parquet``        -> ``pd.read_parquet``
    - ``.feather``        -> ``pd.read_feather``

    Parameters:
    -----------
    path : str
        Location of the file to load.

    Returns:
    --------
    An instance of the class this method is called on, constructed from
    the loaded DataFrame.

    Raises:
    -------
    FileNotFoundError
        If ``path`` does not exist.
    ValueError
        If the extension is not one of the supported formats.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Archivo no encontrado: {path}")

    ext = os.path.splitext(path)[1].lower()

    # Extension -> reader table; every reader is called with the path only,
    # i.e. with pandas' default parsing options.
    readers = {
        ".csv": pd.read_csv,
        ".xlsx": pd.read_excel,
        ".xls": pd.read_excel,
        ".txt": pd.read_table,
        ".tsv": pd.read_table,
        ".json": pd.read_json,
        ".parquet": pd.read_parquet,
        ".feather": pd.read_feather,
    }

    reader = readers.get(ext)
    if reader is None:
        raise ValueError(f"Formato no soportado: {ext}")

    # Build through ``cls`` (not a hard-coded class name) so that a subclass
    # calling ``Subclass.from_file(...)`` gets an instance of itself.
    return cls(reader(path))
|
|
27
73
|
|
|
28
74
|
# ============= INTERVALOS DE CONFIANZA =============
|
|
29
75
|
|
|
30
76
|
def confidence_interval(self, column: str, confidence: float = 0.95,
|
|
31
|
-
|
|
77
|
+
statistic: Literal['mean', 'median', 'proportion'] = 'mean') -> tuple:
|
|
32
78
|
"""
|
|
33
79
|
Intervalo de confianza para diferentes estadísticos
|
|
34
80
|
|
|
@@ -81,8 +127,8 @@ class InferentialStats:
|
|
|
81
127
|
# ============= PRUEBAS DE HIPÓTESIS =============
|
|
82
128
|
|
|
83
129
|
def t_test_1sample(self, column: str, popmean: float = None,
|
|
84
|
-
|
|
85
|
-
|
|
130
|
+
popmedian: float = None,
|
|
131
|
+
alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
|
|
86
132
|
"""
|
|
87
133
|
Prueba t de una muestra (para media o mediana)
|
|
88
134
|
|
|
@@ -137,8 +183,8 @@ class InferentialStats:
|
|
|
137
183
|
raise ValueError("Debe especificar popmean o popmedian")
|
|
138
184
|
|
|
139
185
|
def t_test_2sample(self, column1: str, column2: str,
|
|
140
|
-
|
|
141
|
-
|
|
186
|
+
equal_var: bool = True,
|
|
187
|
+
alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
|
|
142
188
|
"""
|
|
143
189
|
Prueba t de dos muestras independientes
|
|
144
190
|
|
|
@@ -172,9 +218,16 @@ class InferentialStats:
|
|
|
172
218
|
)
|
|
173
219
|
|
|
174
220
|
def t_test_paired(self, column1: str, column2: str,
|
|
175
|
-
|
|
221
|
+
alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
|
|
176
222
|
"""
|
|
177
223
|
Prueba t pareada
|
|
224
|
+
|
|
225
|
+
Parameters:
|
|
226
|
+
-----------
|
|
227
|
+
column1, column2:
|
|
228
|
+
Datos a analizar
|
|
229
|
+
alternative:
|
|
230
|
+
"two-sided", "less" o "greater"
|
|
178
231
|
"""
|
|
179
232
|
from scipy import stats
|
|
180
233
|
|
|
@@ -192,7 +245,7 @@ class InferentialStats:
|
|
|
192
245
|
)
|
|
193
246
|
|
|
194
247
|
def mann_whitney_test(self, column1: str, column2: str,
|
|
195
|
-
|
|
248
|
+
alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
|
|
196
249
|
"""
|
|
197
250
|
Prueba de Mann-Whitney U (alternativa no paramétrica al t-test)
|
|
198
251
|
|
|
@@ -300,8 +353,8 @@ class InferentialStats:
|
|
|
300
353
|
)
|
|
301
354
|
|
|
302
355
|
def normality_test(self, column: str,
|
|
303
|
-
|
|
304
|
-
|
|
356
|
+
method: Literal['shapiro', 'ks', 'anderson', 'jarque_bera', 'all'] = 'shapiro',
|
|
357
|
+
test_statistic: Literal['mean', 'median', 'mode'] = 'mean') -> Union['TestResult', dict]:
|
|
305
358
|
"""
|
|
306
359
|
Prueba de normalidad con múltiples métodos y estadísticos
|
|
307
360
|
|
|
@@ -456,74 +509,88 @@ class InferentialStats:
|
|
|
456
509
|
column1: str = None,
|
|
457
510
|
column2: str = None,
|
|
458
511
|
alpha: float = 0.05,
|
|
459
|
-
homoscedasticity: Literal["levene", "bartlett", "var_test"] = "levene"
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
if column1 is None:
|
|
465
|
-
raise ValueError("Debes especificar 'column1'.")
|
|
466
|
-
|
|
467
|
-
x = data[column1].dropna()
|
|
468
|
-
|
|
469
|
-
if method in ["difference_mean", "variance"] and column2 is None:
|
|
470
|
-
raise ValueError("Para este método debes pasar 'column2'.")
|
|
471
|
-
|
|
472
|
-
y = data[column2].dropna() if column2 else None
|
|
473
|
-
|
|
474
|
-
# --- homoscedasticity test ---
|
|
475
|
-
homo_result = None
|
|
476
|
-
if method in ["difference_mean", "variance"]:
|
|
477
|
-
homo_result = self._homoscedasticity_test(x, y, homoscedasticity)
|
|
478
|
-
|
|
479
|
-
# --- MAIN HYPOTHESIS TESTS ---
|
|
480
|
-
if method == "mean":
|
|
481
|
-
# One-sample t-test
|
|
482
|
-
t_stat, p_value = stats.ttest_1samp(x, popmean=np.mean(x))
|
|
483
|
-
test_name = "One-sample t-test"
|
|
484
|
-
|
|
485
|
-
elif method == "difference_mean":
|
|
486
|
-
# Two-sample t-test
|
|
487
|
-
equal_var = homo_result["equal_var"]
|
|
488
|
-
t_stat, p_value = stats.ttest_ind(x, y, equal_var=equal_var)
|
|
489
|
-
test_name = "Two-sample t-test"
|
|
490
|
-
|
|
491
|
-
elif method == "proportion":
|
|
492
|
-
# Proportion test (z-test)
|
|
493
|
-
p_hat = np.mean(x)
|
|
494
|
-
n = len(x)
|
|
495
|
-
z_stat = (p_hat - 0.5) / np.sqrt(0.5 * 0.5 / n)
|
|
496
|
-
p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))
|
|
497
|
-
t_stat = z_stat
|
|
498
|
-
test_name = "Proportion Z-test"
|
|
499
|
-
|
|
500
|
-
elif method == "variance":
|
|
501
|
-
# Classic F-test
|
|
502
|
-
var_x = np.var(x, ddof=1)
|
|
503
|
-
var_y = np.var(y, ddof=1)
|
|
504
|
-
F = var_x / var_y
|
|
505
|
-
dfn = len(x) - 1
|
|
506
|
-
dfd = len(y) - 1
|
|
507
|
-
|
|
508
|
-
p_value = 2 * min(stats.f.cdf(F, dfn, dfd), 1 - stats.f.cdf(F, dfn, dfd))
|
|
509
|
-
t_stat = F
|
|
510
|
-
test_name = "Variance F-test"
|
|
512
|
+
homoscedasticity: Literal["levene", "bartlett", "var_test"] = "levene") -> Dict[str, Any]:
|
|
513
|
+
|
|
514
|
+
"""
|
|
515
|
+
Test de Hipotesis
|
|
511
516
|
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
517
|
+
Parameters:
|
|
518
|
+
-----------
|
|
519
|
+
method : str
|
|
520
|
+
'mean', 'difference_mean', 'proportion' o 'variance'
|
|
521
|
+
column1, column2 : str
|
|
522
|
+
Columnas numéricas a comparar
|
|
523
|
+
alpha : float
|
|
524
|
+
Nivel de significancia (default 0.05)
|
|
525
|
+
homoscedasticity : str
|
|
526
|
+
Método de homocedasticidad
|
|
527
|
+
'levene', 'bartlett' o 'var_test'
|
|
528
|
+
"""
|
|
529
|
+
|
|
530
|
+
data = self.data
|
|
531
|
+
|
|
532
|
+
if column1 is None:
|
|
533
|
+
raise ValueError("Debes especificar 'column1'.")
|
|
534
|
+
|
|
535
|
+
x = data[column1].dropna()
|
|
536
|
+
|
|
537
|
+
if method in ["difference_mean", "variance"] and column2 is None:
|
|
538
|
+
raise ValueError("Para este método debes pasar 'column2'.")
|
|
539
|
+
|
|
540
|
+
y = data[column2].dropna() if column2 else None
|
|
541
|
+
|
|
542
|
+
# --- homoscedasticity test ---
|
|
543
|
+
homo_result = None
|
|
544
|
+
if method in ["difference_mean", "variance"]:
|
|
545
|
+
homo_result = self._homoscedasticity_test(x, y, homoscedasticity)
|
|
546
|
+
|
|
547
|
+
# --- MAIN HYPOTHESIS TESTS ---
|
|
548
|
+
if method == "mean":
|
|
549
|
+
# One-sample t-test
|
|
550
|
+
t_stat, p_value = stats.ttest_1samp(x, popmean=np.mean(x))
|
|
551
|
+
test_name = "One-sample t-test"
|
|
552
|
+
|
|
553
|
+
elif method == "difference_mean":
|
|
554
|
+
# Two-sample t-test
|
|
555
|
+
equal_var = homo_result["equal_var"]
|
|
556
|
+
t_stat, p_value = stats.ttest_ind(x, y, equal_var=equal_var)
|
|
557
|
+
test_name = "Two-sample t-test"
|
|
558
|
+
|
|
559
|
+
elif method == "proportion":
|
|
560
|
+
# Proportion test (z-test)
|
|
561
|
+
p_hat = np.mean(x)
|
|
562
|
+
n = len(x)
|
|
563
|
+
z_stat = (p_hat - 0.5) / np.sqrt(0.5 * 0.5 / n)
|
|
564
|
+
p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))
|
|
565
|
+
t_stat = z_stat
|
|
566
|
+
test_name = "Proportion Z-test"
|
|
567
|
+
|
|
568
|
+
elif method == "variance":
|
|
569
|
+
# Classic F-test
|
|
570
|
+
var_x = np.var(x, ddof=1)
|
|
571
|
+
var_y = np.var(y, ddof=1)
|
|
572
|
+
F = var_x / var_y
|
|
573
|
+
dfn = len(x) - 1
|
|
574
|
+
dfd = len(y) - 1
|
|
575
|
+
|
|
576
|
+
p_value = 2 * min(stats.f.cdf(F, dfn, dfd), 1 - stats.f.cdf(F, dfn, dfd))
|
|
577
|
+
t_stat = F
|
|
578
|
+
test_name = "Variance F-test"
|
|
579
|
+
|
|
580
|
+
return {
|
|
581
|
+
"test": test_name,
|
|
582
|
+
"statistic": t_stat,
|
|
583
|
+
"p_value": p_value,
|
|
584
|
+
"alpha": alpha,
|
|
585
|
+
"reject_H0": p_value < alpha,
|
|
586
|
+
"homoscedasticity_test": homo_result
|
|
587
|
+
}
|
|
520
588
|
|
|
521
589
|
def _homoscedasticity_test(
|
|
522
590
|
self,
|
|
523
591
|
x,
|
|
524
592
|
y,
|
|
525
|
-
method: Literal["levene", "bartlett", "var_test"] = "levene"
|
|
526
|
-
) -> Dict[str, Any]:
|
|
593
|
+
method: Literal["levene", "bartlett", "var_test"] = "levene") -> Dict[str, Any]:
|
|
527
594
|
|
|
528
595
|
if method == "levene":
|
|
529
596
|
stat, p = stats.levene(x, y)
|