statslibx-0.1.4-py3-none-any.whl → statslibx-0.1.6-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
statslibx/inferential.py CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
4
4
  from typing import Optional, Union, Literal, List, Dict, Any
5
5
  from datetime import datetime
6
6
  from scipy import stats
7
+ import os
7
8
 
8
9
  class InferentialStats:
9
10
  """
@@ -11,10 +12,21 @@ class InferentialStats:
11
12
  """
12
13
 
13
14
  def __init__(self, data: Union[pd.DataFrame, np.ndarray],
14
- backend: Literal['pandas', 'polars'] = 'pandas'):
15
+ backend: Literal['pandas', 'polars'] = 'pandas'):
15
16
  """
16
17
  Inicializar con DataFrame o array numpy
18
+
19
+ Parameters:
20
+ -----------
21
+ data : DataFrame o ndarray
22
+ Datos a analizar
23
+ backend : str
24
+ 'pandas' o 'polars' para procesamiento
17
25
  """
26
+
27
+ if isinstance(data, str) and os.path.exists(data):
28
+ data = InferentialStats.from_file(data).data
29
+
18
30
  if isinstance(data, np.ndarray):
19
31
  if data.ndim == 1:
20
32
  data = pd.DataFrame({'var': data})
@@ -24,11 +36,45 @@ class InferentialStats:
24
36
  self.data = data
25
37
  self.backend = backend
26
38
  self._numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
39
+
40
+ @staticmethod
41
+ def from_file(path: str):
42
+ """
43
+ Carga automática de archivos y devuelve instancia de Intelligence.
44
+ Soporta CSV, Excel, TXT, JSON, Parquet, Feather, TSV.
45
+ """
46
+ if not os.path.exists(path):
47
+ raise FileNotFoundError(f"Archivo no encontrado: {path}")
48
+
49
+ ext = os.path.splitext(path)[1].lower()
50
+
51
+ if ext == ".csv":
52
+ df = pd.read_csv(path)
53
+
54
+ elif ext in [".xlsx", ".xls"]:
55
+ df = pd.read_excel(path)
56
+
57
+ elif ext in [".txt", ".tsv"]:
58
+ df = pd.read_table(path)
59
+
60
+ elif ext == ".json":
61
+ df = pd.read_json(path)
62
+
63
+ elif ext == ".parquet":
64
+ df = pd.read_parquet(path)
65
+
66
+ elif ext == ".feather":
67
+ df = pd.read_feather(path)
68
+
69
+ else:
70
+ raise ValueError(f"Formato no soportado: {ext}")
71
+
72
+ return InferentialStats(df)
27
73
 
28
74
  # ============= INTERVALOS DE CONFIANZA =============
29
75
 
30
76
  def confidence_interval(self, column: str, confidence: float = 0.95,
31
- statistic: Literal['mean', 'median', 'proportion'] = 'mean') -> tuple:
77
+ statistic: Literal['mean', 'median', 'proportion'] = 'mean') -> tuple:
32
78
  """
33
79
  Intervalo de confianza para diferentes estadísticos
34
80
 
@@ -81,8 +127,8 @@ class InferentialStats:
81
127
  # ============= PRUEBAS DE HIPÓTESIS =============
82
128
 
83
129
  def t_test_1sample(self, column: str, popmean: float = None,
84
- popmedian: float = None,
85
- alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
130
+ popmedian: float = None,
131
+ alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
86
132
  """
87
133
  Prueba t de una muestra (para media o mediana)
88
134
 
@@ -137,8 +183,8 @@ class InferentialStats:
137
183
  raise ValueError("Debe especificar popmean o popmedian")
138
184
 
139
185
  def t_test_2sample(self, column1: str, column2: str,
140
- equal_var: bool = True,
141
- alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
186
+ equal_var: bool = True,
187
+ alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
142
188
  """
143
189
  Prueba t de dos muestras independientes
144
190
 
@@ -172,9 +218,16 @@ class InferentialStats:
172
218
  )
173
219
 
174
220
  def t_test_paired(self, column1: str, column2: str,
175
- alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
221
+ alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
176
222
  """
177
223
  Prueba t pareada
224
+
225
+ Parameters:
226
+ -----------
227
+ column1, column2:
228
+ Datos a analizar
229
+ alternative:
230
+ "two-sided", "less" o "greater"
178
231
  """
179
232
  from scipy import stats
180
233
 
@@ -192,7 +245,7 @@ class InferentialStats:
192
245
  )
193
246
 
194
247
  def mann_whitney_test(self, column1: str, column2: str,
195
- alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
248
+ alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
196
249
  """
197
250
  Prueba de Mann-Whitney U (alternativa no paramétrica al t-test)
198
251
 
@@ -300,8 +353,8 @@ class InferentialStats:
300
353
  )
301
354
 
302
355
  def normality_test(self, column: str,
303
- method: Literal['shapiro', 'ks', 'anderson', 'jarque_bera', 'all'] = 'shapiro',
304
- test_statistic: Literal['mean', 'median', 'mode'] = 'mean') -> Union['TestResult', dict]:
356
+ method: Literal['shapiro', 'ks', 'anderson', 'jarque_bera', 'all'] = 'shapiro',
357
+ test_statistic: Literal['mean', 'median', 'mode'] = 'mean') -> Union['TestResult', dict]:
305
358
  """
306
359
  Prueba de normalidad con múltiples métodos y estadísticos
307
360
 
@@ -456,74 +509,88 @@ class InferentialStats:
456
509
  column1: str = None,
457
510
  column2: str = None,
458
511
  alpha: float = 0.05,
459
- homoscedasticity: Literal["levene", "bartlett", "var_test"] = "levene"
460
- ) -> Dict[str, Any]:
461
-
462
- data = self.data
463
-
464
- if column1 is None:
465
- raise ValueError("Debes especificar 'column1'.")
466
-
467
- x = data[column1].dropna()
468
-
469
- if method in ["difference_mean", "variance"] and column2 is None:
470
- raise ValueError("Para este método debes pasar 'column2'.")
471
-
472
- y = data[column2].dropna() if column2 else None
473
-
474
- # --- homoscedasticity test ---
475
- homo_result = None
476
- if method in ["difference_mean", "variance"]:
477
- homo_result = self._homoscedasticity_test(x, y, homoscedasticity)
478
-
479
- # --- MAIN HYPOTHESIS TESTS ---
480
- if method == "mean":
481
- # One-sample t-test
482
- t_stat, p_value = stats.ttest_1samp(x, popmean=np.mean(x))
483
- test_name = "One-sample t-test"
484
-
485
- elif method == "difference_mean":
486
- # Two-sample t-test
487
- equal_var = homo_result["equal_var"]
488
- t_stat, p_value = stats.ttest_ind(x, y, equal_var=equal_var)
489
- test_name = "Two-sample t-test"
490
-
491
- elif method == "proportion":
492
- # Proportion test (z-test)
493
- p_hat = np.mean(x)
494
- n = len(x)
495
- z_stat = (p_hat - 0.5) / np.sqrt(0.5 * 0.5 / n)
496
- p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))
497
- t_stat = z_stat
498
- test_name = "Proportion Z-test"
499
-
500
- elif method == "variance":
501
- # Classic F-test
502
- var_x = np.var(x, ddof=1)
503
- var_y = np.var(y, ddof=1)
504
- F = var_x / var_y
505
- dfn = len(x) - 1
506
- dfd = len(y) - 1
507
-
508
- p_value = 2 * min(stats.f.cdf(F, dfn, dfd), 1 - stats.f.cdf(F, dfn, dfd))
509
- t_stat = F
510
- test_name = "Variance F-test"
512
+ homoscedasticity: Literal["levene", "bartlett", "var_test"] = "levene") -> Dict[str, Any]:
513
+
514
+ """
515
+ Test de Hipotesis
511
516
 
512
- return {
513
- "test": test_name,
514
- "statistic": t_stat,
515
- "p_value": p_value,
516
- "alpha": alpha,
517
- "reject_H0": p_value < alpha,
518
- "homoscedasticity_test": homo_result
519
- }
517
+ Parameters:
518
+ -----------
519
+ method : str
520
+ 'mean', 'difference_mean', 'proportion' o 'variance'
521
+ column1, column2 : str
522
+ Columnas numéricas a comparar
523
+ alpha : float
524
+ Nivel de significancia (default 0.05)
525
+ homoscedasticity : str
526
+ Método de homocedasticidad
527
+ 'levene', 'bartlett' o 'var_test'
528
+ """
529
+
530
+ data = self.data
531
+
532
+ if column1 is None:
533
+ raise ValueError("Debes especificar 'column1'.")
534
+
535
+ x = data[column1].dropna()
536
+
537
+ if method in ["difference_mean", "variance"] and column2 is None:
538
+ raise ValueError("Para este método debes pasar 'column2'.")
539
+
540
+ y = data[column2].dropna() if column2 else None
541
+
542
+ # --- homoscedasticity test ---
543
+ homo_result = None
544
+ if method in ["difference_mean", "variance"]:
545
+ homo_result = self._homoscedasticity_test(x, y, homoscedasticity)
546
+
547
+ # --- MAIN HYPOTHESIS TESTS ---
548
+ if method == "mean":
549
+ # One-sample t-test
550
+ t_stat, p_value = stats.ttest_1samp(x, popmean=np.mean(x))
551
+ test_name = "One-sample t-test"
552
+
553
+ elif method == "difference_mean":
554
+ # Two-sample t-test
555
+ equal_var = homo_result["equal_var"]
556
+ t_stat, p_value = stats.ttest_ind(x, y, equal_var=equal_var)
557
+ test_name = "Two-sample t-test"
558
+
559
+ elif method == "proportion":
560
+ # Proportion test (z-test)
561
+ p_hat = np.mean(x)
562
+ n = len(x)
563
+ z_stat = (p_hat - 0.5) / np.sqrt(0.5 * 0.5 / n)
564
+ p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))
565
+ t_stat = z_stat
566
+ test_name = "Proportion Z-test"
567
+
568
+ elif method == "variance":
569
+ # Classic F-test
570
+ var_x = np.var(x, ddof=1)
571
+ var_y = np.var(y, ddof=1)
572
+ F = var_x / var_y
573
+ dfn = len(x) - 1
574
+ dfd = len(y) - 1
575
+
576
+ p_value = 2 * min(stats.f.cdf(F, dfn, dfd), 1 - stats.f.cdf(F, dfn, dfd))
577
+ t_stat = F
578
+ test_name = "Variance F-test"
579
+
580
+ return {
581
+ "test": test_name,
582
+ "statistic": t_stat,
583
+ "p_value": p_value,
584
+ "alpha": alpha,
585
+ "reject_H0": p_value < alpha,
586
+ "homoscedasticity_test": homo_result
587
+ }
520
588
 
521
589
  def _homoscedasticity_test(
522
590
  self,
523
591
  x,
524
592
  y,
525
- method: Literal["levene", "bartlett", "var_test"] = "levene"
526
- ) -> Dict[str, Any]:
593
+ method: Literal["levene", "bartlett", "var_test"] = "levene") -> Dict[str, Any]:
527
594
 
528
595
  if method == "levene":
529
596
  stat, p = stats.levene(x, y)