statslibx 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
statslibx/inferential.py CHANGED
@@ -12,9 +12,16 @@ class InferentialStats:
12
12
  """
13
13
 
14
14
  def __init__(self, data: Union[pd.DataFrame, np.ndarray],
15
- backend: Literal['pandas', 'polars'] = 'pandas'):
15
+ backend: Literal['pandas', 'polars'] = 'pandas'):
16
16
  """
17
17
  Inicializar con DataFrame o array numpy
18
+
19
+ Parameters:
20
+ -----------
21
+ data : DataFrame o ndarray
22
+ Datos a analizar
23
+ backend : str
24
+ 'pandas' o 'polars' para procesamiento
18
25
  """
19
26
 
20
27
  if isinstance(data, str) and os.path.exists(data):
@@ -67,7 +74,7 @@ class InferentialStats:
67
74
  # ============= INTERVALOS DE CONFIANZA =============
68
75
 
69
76
  def confidence_interval(self, column: str, confidence: float = 0.95,
70
- statistic: Literal['mean', 'median', 'proportion'] = 'mean') -> tuple:
77
+ statistic: Literal['mean', 'median', 'proportion'] = 'mean') -> tuple:
71
78
  """
72
79
  Intervalo de confianza para diferentes estadísticos
73
80
 
@@ -120,8 +127,8 @@ class InferentialStats:
120
127
  # ============= PRUEBAS DE HIPÓTESIS =============
121
128
 
122
129
  def t_test_1sample(self, column: str, popmean: float = None,
123
- popmedian: float = None,
124
- alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
130
+ popmedian: float = None,
131
+ alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
125
132
  """
126
133
  Prueba t de una muestra (para media o mediana)
127
134
 
@@ -176,8 +183,8 @@ class InferentialStats:
176
183
  raise ValueError("Debe especificar popmean o popmedian")
177
184
 
178
185
  def t_test_2sample(self, column1: str, column2: str,
179
- equal_var: bool = True,
180
- alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
186
+ equal_var: bool = True,
187
+ alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
181
188
  """
182
189
  Prueba t de dos muestras independientes
183
190
 
@@ -211,9 +218,16 @@ class InferentialStats:
211
218
  )
212
219
 
213
220
  def t_test_paired(self, column1: str, column2: str,
214
- alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
221
+ alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
215
222
  """
216
223
  Prueba t pareada
224
+
225
+ Parameters:
226
+ -----------
227
+ column1, column2:
228
+ Datos a analizar
229
+ alternative:
230
+ "two-sided", "less" o "greater"
217
231
  """
218
232
  from scipy import stats
219
233
 
@@ -231,7 +245,7 @@ class InferentialStats:
231
245
  )
232
246
 
233
247
  def mann_whitney_test(self, column1: str, column2: str,
234
- alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
248
+ alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
235
249
  """
236
250
  Prueba de Mann-Whitney U (alternativa no paramétrica al t-test)
237
251
 
@@ -339,8 +353,8 @@ class InferentialStats:
339
353
  )
340
354
 
341
355
  def normality_test(self, column: str,
342
- method: Literal['shapiro', 'ks', 'anderson', 'jarque_bera', 'all'] = 'shapiro',
343
- test_statistic: Literal['mean', 'median', 'mode'] = 'mean') -> Union['TestResult', dict]:
356
+ method: Literal['shapiro', 'ks', 'anderson', 'jarque_bera', 'all'] = 'shapiro',
357
+ test_statistic: Literal['mean', 'median', 'mode'] = 'mean') -> Union['TestResult', dict]:
344
358
  """
345
359
  Prueba de normalidad con múltiples métodos y estadísticos
346
360
 
@@ -495,74 +509,88 @@ class InferentialStats:
495
509
  column1: str = None,
496
510
  column2: str = None,
497
511
  alpha: float = 0.05,
498
- homoscedasticity: Literal["levene", "bartlett", "var_test"] = "levene"
499
- ) -> Dict[str, Any]:
500
-
501
- data = self.data
502
-
503
- if column1 is None:
504
- raise ValueError("Debes especificar 'column1'.")
505
-
506
- x = data[column1].dropna()
507
-
508
- if method in ["difference_mean", "variance"] and column2 is None:
509
- raise ValueError("Para este método debes pasar 'column2'.")
510
-
511
- y = data[column2].dropna() if column2 else None
512
-
513
- # --- homoscedasticity test ---
514
- homo_result = None
515
- if method in ["difference_mean", "variance"]:
516
- homo_result = self._homoscedasticity_test(x, y, homoscedasticity)
517
-
518
- # --- MAIN HYPOTHESIS TESTS ---
519
- if method == "mean":
520
- # One-sample t-test
521
- t_stat, p_value = stats.ttest_1samp(x, popmean=np.mean(x))
522
- test_name = "One-sample t-test"
523
-
524
- elif method == "difference_mean":
525
- # Two-sample t-test
526
- equal_var = homo_result["equal_var"]
527
- t_stat, p_value = stats.ttest_ind(x, y, equal_var=equal_var)
528
- test_name = "Two-sample t-test"
529
-
530
- elif method == "proportion":
531
- # Proportion test (z-test)
532
- p_hat = np.mean(x)
533
- n = len(x)
534
- z_stat = (p_hat - 0.5) / np.sqrt(0.5 * 0.5 / n)
535
- p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))
536
- t_stat = z_stat
537
- test_name = "Proportion Z-test"
538
-
539
- elif method == "variance":
540
- # Classic F-test
541
- var_x = np.var(x, ddof=1)
542
- var_y = np.var(y, ddof=1)
543
- F = var_x / var_y
544
- dfn = len(x) - 1
545
- dfd = len(y) - 1
546
-
547
- p_value = 2 * min(stats.f.cdf(F, dfn, dfd), 1 - stats.f.cdf(F, dfn, dfd))
548
- t_stat = F
549
- test_name = "Variance F-test"
512
+ homoscedasticity: Literal["levene", "bartlett", "var_test"] = "levene") -> Dict[str, Any]:
513
+
514
+ """
515
+ Test de Hipotesis
550
516
 
551
- return {
552
- "test": test_name,
553
- "statistic": t_stat,
554
- "p_value": p_value,
555
- "alpha": alpha,
556
- "reject_H0": p_value < alpha,
557
- "homoscedasticity_test": homo_result
558
- }
517
+ Parameters:
518
+ -----------
519
+ method : str
520
+ 'mean', 'difference_mean', 'proportion' o 'variance'
521
+ column1, column2 : str
522
+ Columnas numéricas a comparar
523
+ alpha : float
524
+ Nivel de significancia (default 0.05)
525
+ homoscedasticity : str
526
+ Método de homocedasticidad
527
+ 'levene', 'bartlett' o 'var_test'
528
+ """
529
+
530
+ data = self.data
531
+
532
+ if column1 is None:
533
+ raise ValueError("Debes especificar 'column1'.")
534
+
535
+ x = data[column1].dropna()
536
+
537
+ if method in ["difference_mean", "variance"] and column2 is None:
538
+ raise ValueError("Para este método debes pasar 'column2'.")
539
+
540
+ y = data[column2].dropna() if column2 else None
541
+
542
+ # --- homoscedasticity test ---
543
+ homo_result = None
544
+ if method in ["difference_mean", "variance"]:
545
+ homo_result = self._homoscedasticity_test(x, y, homoscedasticity)
546
+
547
+ # --- MAIN HYPOTHESIS TESTS ---
548
+ if method == "mean":
549
+ # One-sample t-test
550
+ t_stat, p_value = stats.ttest_1samp(x, popmean=np.mean(x))
551
+ test_name = "One-sample t-test"
552
+
553
+ elif method == "difference_mean":
554
+ # Two-sample t-test
555
+ equal_var = homo_result["equal_var"]
556
+ t_stat, p_value = stats.ttest_ind(x, y, equal_var=equal_var)
557
+ test_name = "Two-sample t-test"
558
+
559
+ elif method == "proportion":
560
+ # Proportion test (z-test)
561
+ p_hat = np.mean(x)
562
+ n = len(x)
563
+ z_stat = (p_hat - 0.5) / np.sqrt(0.5 * 0.5 / n)
564
+ p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))
565
+ t_stat = z_stat
566
+ test_name = "Proportion Z-test"
567
+
568
+ elif method == "variance":
569
+ # Classic F-test
570
+ var_x = np.var(x, ddof=1)
571
+ var_y = np.var(y, ddof=1)
572
+ F = var_x / var_y
573
+ dfn = len(x) - 1
574
+ dfd = len(y) - 1
575
+
576
+ p_value = 2 * min(stats.f.cdf(F, dfn, dfd), 1 - stats.f.cdf(F, dfn, dfd))
577
+ t_stat = F
578
+ test_name = "Variance F-test"
579
+
580
+ return {
581
+ "test": test_name,
582
+ "statistic": t_stat,
583
+ "p_value": p_value,
584
+ "alpha": alpha,
585
+ "reject_H0": p_value < alpha,
586
+ "homoscedasticity_test": homo_result
587
+ }
559
588
 
560
589
  def _homoscedasticity_test(
561
590
  self,
562
591
  x,
563
592
  y,
564
- method: Literal["levene", "bartlett", "var_test"] = "levene"
565
- ) -> Dict[str, Any]:
593
+ method: Literal["levene", "bartlett", "var_test"] = "levene") -> Dict[str, Any]:
566
594
 
567
595
  if method == "levene":
568
596
  stat, p = stats.levene(x, y)
statslibx/io.py ADDED
@@ -0,0 +1,21 @@
1
+ import pandas as pd
2
+ import polars as pl
3
+ from pathlib import Path
4
+
5
+
6
def load_file(path: str):
    """Load a tabular data file into a pandas DataFrame.

    Parameters
    ----------
    path : str
        Path to a ``.csv``, ``.json``, ``.txt`` or ``.tsv`` file.
        The extension is matched case-insensitively, so ``.CSV``
        works as well.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    FileNotFoundError
        If *path* does not exist.
    ValueError
        If the file extension is not supported.
    """
    file_path = Path(path)

    if not file_path.exists():
        raise FileNotFoundError(f"{file_path} not found")

    # Normalize the extension so uppercase suffixes are accepted too.
    suffix = file_path.suffix.lower()

    if suffix == ".csv":
        return pd.read_csv(file_path)

    if suffix == ".json":
        return pd.read_json(file_path)

    if suffix in {".txt", ".tsv"}:
        # Plain-text and .tsv files are assumed to be tab-separated.
        return pd.read_csv(file_path, sep="\t")

    raise ValueError(f"Unsupported file type: {file_path.suffix}")
@@ -0,0 +1,221 @@
1
+ from typing import Optional, Union, List, Dict, Any
2
+ import pandas as pd
3
+ import polars as pl
4
+ import numpy as np
5
+
6
+
7
class Preprocessing:
    """Backend-agnostic preprocessing helper for pandas / polars DataFrames.

    The same API works on either backend; helpers dispatch on the stored
    frame's type. Mutating methods update ``self.data`` in place and
    return ``self`` so calls can be chained.
    """

    def __init__(self, data: Union[pd.DataFrame, pl.DataFrame]):
        """Store the DataFrame to preprocess.

        Raises
        ------
        TypeError
            If *data* is neither a pandas nor a polars DataFrame.
        """
        if not isinstance(data, (pd.DataFrame, pl.DataFrame)):
            raise TypeError("data must be a pandas or polars DataFrame")
        self.data = data

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _is_pandas(self) -> bool:
        # True when the stored frame is a pandas DataFrame.
        return isinstance(self.data, pd.DataFrame)

    def _is_polars(self) -> bool:
        # True when the stored frame is a polars DataFrame.
        return isinstance(self.data, pl.DataFrame)

    def _count_nulls(self, column: str) -> int:
        # Null count for a single column, normalized to a plain int.
        if self._is_pandas():
            return int(self.data[column].isna().sum())
        return int(self.data[column].null_count())

    def _get_columns(self, columns):
        # Normalize a None / str / list column spec to a list of names.
        if columns is None:
            return list(self.data.columns)
        if isinstance(columns, str):
            return [columns]
        return columns

    # ------------------------------------------------------------------
    # Inspection
    # ------------------------------------------------------------------

    def detect_nulls(
        self,
        columns: Optional[Union[str, List[str]]] = None
    ) -> pd.DataFrame:
        """Per-column null report.

        Parameters
        ----------
        columns : str, list of str, or None
            Columns to inspect; None means all columns.

        Returns
        -------
        pandas.DataFrame
            One row per column with ``column``, ``nulls``, ``non_nulls``
            and ``null_pct``.
        """
        columns = self._get_columns(columns)
        total = self.data.shape[0]

        rows = []
        for col in columns:
            nulls = self._count_nulls(col)
            rows.append({
                "column": col,
                "nulls": nulls,
                "non_nulls": total - nulls,
                # Guard against an empty frame (was a ZeroDivisionError).
                "null_pct": nulls / total if total else 0.0
            })

        return pd.DataFrame(rows)

    def check_uniqueness(self) -> pd.DataFrame:
        """Number of unique values per column, as a two-column DataFrame."""
        if self._is_pandas():
            unique = self.data.nunique()
            return pd.DataFrame({
                "column": unique.index,
                "unique_values": unique.values
            })

        unique = self.data.select(pl.all().n_unique())
        return unique.to_pandas().melt(
            var_name="column",
            value_name="unique_values"
        )

    def preview_data(self, n: int = 5):
        """Return the first *n* rows of the stored frame."""
        return self.data.head(n)

    # ------------------------------------------------------------------
    # Description
    # ------------------------------------------------------------------

    def describe_numeric(self):
        """Summary statistics for the numeric columns only."""
        if self._is_pandas():
            return self.data.select_dtypes(include=np.number).describe()

        # Fix: ``pl.all().filter(pl.col(pl.NUMERIC))`` is not a valid
        # polars selection (``pl.NUMERIC`` does not exist) and raised at
        # runtime; use the numeric column selector instead.
        return self.data.select(pl.selectors.numeric()).describe()

    def describe_categorical(self):
        """Summary statistics for the categorical/string columns only."""
        if self._is_pandas():
            return self.data.select_dtypes(include="object").describe()

        # Fix: ``pl.all().filter(pl.col(pl.Utf8))`` was not a valid
        # selection; use the string column selector instead.
        return self.data.select(pl.selectors.string()).describe()

    # ------------------------------------------------------------------
    # Transformations
    # ------------------------------------------------------------------

    def fill_nulls(
        self,
        fill_with: Any,
        columns: Optional[Union[str, List[str]]] = None
    ):
        """Fill nulls in *columns* (all columns when None) with *fill_with*.

        Returns ``self`` for chaining.
        """
        columns = self._get_columns(columns)

        if self._is_pandas():
            self.data[columns] = self.data[columns].fillna(fill_with)

        else:
            self.data = self.data.with_columns([
                pl.col(col).fill_null(fill_with) for col in columns
            ])

        return self

    def normalize(self, column: str):
        """Min-max scale *column* to the [0, 1] range in place.

        NOTE(review): a constant column (max == min) yields NaN/inf from
        the zero denominator — confirm whether callers ever pass one.
        Returns ``self`` for chaining.
        """
        if self._is_pandas():
            col = self.data[column]
            self.data[column] = (col - col.min()) / (col.max() - col.min())
        else:
            self.data = self.data.with_columns(
                ((pl.col(column) - pl.col(column).min()) /
                 (pl.col(column).max() - pl.col(column).min()))
                .alias(column)
            )
        return self

    def standardize(self, column: str):
        """Z-score *column* in place (subtract mean, divide by std).

        Returns ``self`` for chaining.
        """
        if self._is_pandas():
            col = self.data[column]
            self.data[column] = (col - col.mean()) / col.std()
        else:
            self.data = self.data.with_columns(
                ((pl.col(column) - pl.col(column).mean()) /
                 pl.col(column).std())
                .alias(column)
            )
        return self

    # ------------------------------------------------------------------
    # Filtering
    # ------------------------------------------------------------------

    def filter_rows(self, condition):
        """Keep only rows matching *condition*.

        *condition* is a boolean mask for pandas or a polars expression
        for polars. Returns ``self`` for chaining.
        """
        if self._is_pandas():
            self.data = self.data.loc[condition]
        else:
            self.data = self.data.filter(condition)
        return self

    def filter_columns(self, columns: List[str]):
        """Keep only the named *columns*. Returns ``self`` for chaining."""
        if self._is_pandas():
            self.data = self.data[columns]
        else:
            self.data = self.data.select(columns)
        return self

    def rename_columns(self, mapping: Dict[str, str]):
        """Rename columns via an ``{old: new}`` mapping.

        Returns ``self`` for chaining.
        """
        if self._is_pandas():
            self.data = self.data.rename(columns=mapping)
        else:
            self.data = self.data.rename(mapping)
        return self

    # ------------------------------------------------------------------
    # Outliers
    # ------------------------------------------------------------------

    def detect_outliers(
        self,
        column: str,
        method: str = "iqr"
    ) -> pd.DataFrame:
        """Return the rows whose *column* value is an outlier.

        Parameters
        ----------
        column : str
            Numeric column to test.
        method : str
            ``'iqr'`` (outside the 1.5*IQR fences) or
            ``'zscore'`` (|z| > 3).

        Raises
        ------
        ValueError
            If *method* is not one of the two supported names.
        """
        if self._is_pandas():
            series = self.data[column]
        else:
            # Compute the mask on a pandas view of the column.
            series = self.data[column].to_pandas()

        if method == "iqr":
            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            iqr = q3 - q1
            mask = (series < q1 - 1.5 * iqr) | (series > q3 + 1.5 * iqr)

        elif method == "zscore":
            z = (series - series.mean()) / series.std()
            mask = z.abs() > 3

        else:
            raise ValueError("method must be 'iqr' or 'zscore'")

        if self._is_pandas():
            return self.data[mask]

        # Fix: a polars DataFrame cannot be indexed with a pandas boolean
        # Series (the old ``self.data[mask]`` failed for this backend);
        # convert the mask and use filter() instead.
        return self.data.filter(pl.Series(mask.to_numpy()))

    # ------------------------------------------------------------------
    # Data Quality Report
    # ------------------------------------------------------------------

    def data_quality(self) -> pd.DataFrame:
        """Per-column quality report.

        Returns
        -------
        pandas.DataFrame
            One row per column with dtype, null counts/percentage,
            unique-value count and completeness percentage.
        """
        total_rows = self.data.shape[0]
        rows = []

        for col in self.data.columns:
            nulls = self._count_nulls(col)

            if self._is_pandas():
                dtype = str(self.data[col].dtype)
                unique = self.data[col].nunique()
            else:
                dtype = str(self.data.schema[col])
                unique = self.data[col].n_unique()

            # Guard against an empty frame (was a ZeroDivisionError).
            null_pct = nulls / total_rows if total_rows else 0.0

            rows.append({
                "column": col,
                "dtype": dtype,
                "nulls": nulls,
                "null_pct": null_pct,
                "unique_values": unique,
                "completeness_pct": 1 - null_pct
            })

        return pd.DataFrame(rows)
221
+