statslibx 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
statslibx/io.py ADDED
@@ -0,0 +1,21 @@
1
+ import pandas as pd
2
+ import polars as pl
3
+ from pathlib import Path
4
+
5
+
6
def load_file(path: str):
    """Load a tabular data file into a pandas DataFrame.

    The reader is picked from the file extension, which is compared
    case-insensitively: ``.csv`` is read as comma-separated text,
    ``.json`` through ``pandas.read_json`` and ``.txt``/``.tsv`` as
    tab-separated text.

    Args:
        path: Location of the file. A ``pathlib.Path`` works as well as a
            string because the value is passed through ``Path``.

    Returns:
        The ``pandas.DataFrame`` produced by the matching reader.

    Raises:
        FileNotFoundError: If nothing exists at ``path``.
        ValueError: If the extension is not one of the supported ones.
    """
    path = Path(path)

    if not path.exists():
        raise FileNotFoundError(f"{path} not found")

    # Normalise the case so "DATA.CSV" is handled exactly like "data.csv".
    suffix = path.suffix.lower()

    if suffix == ".csv":
        return pd.read_csv(path)

    if suffix == ".json":
        return pd.read_json(path)

    if suffix in {".txt", ".tsv"}:
        return pd.read_csv(path, sep="\t")

    # Report the suffix as the caller wrote it, not the lower-cased copy.
    raise ValueError(f"Unsupported file type: {path.suffix}")
@@ -0,0 +1,221 @@
1
+ from typing import Optional, Union, List, Dict, Any
2
+ import pandas as pd
3
+ import polars as pl
4
+ import numpy as np
5
+
6
+
7
class Preprocessing:
    """Inspection and cleaning helpers around one pandas or polars DataFrame.

    The wrapped frame is kept in ``self.data``. Transformation and filtering
    methods update ``self.data`` and return ``self`` so calls can be chained.
    On the pandas backend ``fill_nulls``, ``normalize`` and ``standardize``
    assign into the wrapped frame; on the polars backend they rebind
    ``self.data`` to a new frame.
    """

    # The annotation is quoted so it is not evaluated when the class is defined.
    def __init__(self, data: "Union[pd.DataFrame, pl.DataFrame]"):
        """Store ``data`` after checking that it is a supported frame type.

        Raises:
            TypeError: If ``data`` is neither a pandas nor a polars DataFrame.
        """
        if not isinstance(data, (pd.DataFrame, pl.DataFrame)):
            raise TypeError("data must be a pandas or polars DataFrame")
        self.data = data

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _is_pandas(self) -> bool:
        """Return True when the wrapped frame is a pandas DataFrame."""
        return isinstance(self.data, pd.DataFrame)

    def _is_polars(self) -> bool:
        """Return True when the wrapped frame is a polars DataFrame."""
        return isinstance(self.data, pl.DataFrame)

    def _count_nulls(self, column: str) -> int:
        """Return the number of missing values in ``column``."""
        if self._is_pandas():
            return int(self.data[column].isna().sum())
        return int(self.data[column].null_count())

    def _get_columns(self, columns) -> List[str]:
        """Normalise a column selection to a list of names.

        ``None`` selects every column and a single string selects one column.
        Any other iterable is copied into a list, so a tuple of names is not
        mistaken by pandas for a single (multi-level) key.
        """
        if columns is None:
            return list(self.data.columns)
        if isinstance(columns, str):
            return [columns]
        return list(columns)

    @staticmethod
    def _ratio(part: int, total: int) -> float:
        """Return ``part / total``, or ``0.0`` when ``total`` is zero."""
        return part / total if total else 0.0

    # ------------------------------------------------------------------
    # Inspection
    # ------------------------------------------------------------------

    def detect_nulls(
        self,
        columns: Optional[Union[str, List[str]]] = None
    ) -> pd.DataFrame:
        """Summarise missing values per column.

        Args:
            columns: Column name, list of names, or ``None`` for all columns.

        Returns:
            A pandas DataFrame with one row per column and the fields
            ``column``, ``nulls``, ``non_nulls`` and ``null_pct`` (a fraction
            between 0 and 1; ``0.0`` when the frame has no rows).
        """
        columns = self._get_columns(columns)
        total = self.data.shape[0]

        rows = []
        for col in columns:
            nulls = self._count_nulls(col)
            rows.append({
                "column": col,
                "nulls": nulls,
                "non_nulls": total - nulls,
                "null_pct": self._ratio(nulls, total),
            })

        # Explicit columns keep the schema stable even when no rows are built.
        return pd.DataFrame(
            rows, columns=["column", "nulls", "non_nulls", "null_pct"]
        )

    def check_uniqueness(self) -> pd.DataFrame:
        """Count distinct values per column.

        Returns:
            A pandas DataFrame with the fields ``column`` and
            ``unique_values``.
        """
        # NOTE(review): pandas ``nunique`` and polars ``n_unique`` may treat
        # missing values differently - confirm before comparing backends.
        if self._is_pandas():
            unique = self.data.nunique()
            return pd.DataFrame({
                "column": unique.index,
                "unique_values": unique.values
            })

        # Build the result directly instead of converting through to_pandas().
        names = list(self.data.columns)
        return pd.DataFrame({
            "column": names,
            "unique_values": [self.data[name].n_unique() for name in names],
        })

    def preview_data(self, n: int = 5):
        """Return the first ``n`` rows of the wrapped frame."""
        return self.data.head(n)

    # ------------------------------------------------------------------
    # Description
    # ------------------------------------------------------------------

    def describe_numeric(self):
        """Return ``describe()`` computed over the numeric columns only."""
        if self._is_pandas():
            return self.data.select_dtypes(include=np.number).describe()

        # Local import: only the polars branch needs the selectors module.
        import polars.selectors as cs

        return self.data.select(cs.numeric()).describe()

    def describe_categorical(self):
        """Return ``describe()`` computed over the text columns only.

        The pandas branch selects ``object`` columns and the polars branch
        selects string columns.
        """
        if self._is_pandas():
            return self.data.select_dtypes(include="object").describe()

        # Local import: only the polars branch needs the selectors module.
        import polars.selectors as cs

        return self.data.select(cs.string()).describe()

    # ------------------------------------------------------------------
    # Transformations
    # ------------------------------------------------------------------

    def fill_nulls(
        self,
        fill_with: Any,
        columns: Optional[Union[str, List[str]]] = None
    ):
        """Replace missing values with ``fill_with``.

        Args:
            fill_with: Replacement value.
            columns: Column name, list of names, or ``None`` for all columns.

        Returns:
            ``self``, to allow chaining.
        """
        columns = self._get_columns(columns)
        if not columns:
            # Nothing selected, so there is nothing to fill.
            return self

        if self._is_pandas():
            self.data[columns] = self.data[columns].fillna(fill_with)
        else:
            self.data = self.data.with_columns([
                pl.col(col).fill_null(fill_with) for col in columns
            ])

        return self

    def normalize(self, column: str):
        """Rescale ``column`` as ``(x - min) / (max - min)``.

        A constant column makes the denominator zero; that case is not
        handled here and the result is whatever the backend's division gives.

        Returns:
            ``self``, to allow chaining.
        """
        if self._is_pandas():
            col = self.data[column]
            self.data[column] = (col - col.min()) / (col.max() - col.min())
        else:
            value = pl.col(column)
            self.data = self.data.with_columns(
                ((value - value.min()) / (value.max() - value.min()))
                .alias(column)
            )
        return self

    def standardize(self, column: str):
        """Rescale ``column`` as ``(x - mean) / std``.

        Returns:
            ``self``, to allow chaining.
        """
        if self._is_pandas():
            col = self.data[column]
            self.data[column] = (col - col.mean()) / col.std()
        else:
            value = pl.col(column)
            self.data = self.data.with_columns(
                ((value - value.mean()) / value.std()).alias(column)
            )
        return self

    # ------------------------------------------------------------------
    # Filtering
    # ------------------------------------------------------------------

    def filter_rows(self, condition):
        """Keep only the rows selected by ``condition``.

        ``condition`` is passed to ``DataFrame.loc`` on pandas and to
        ``DataFrame.filter`` on polars.

        Returns:
            ``self``, to allow chaining.
        """
        if self._is_pandas():
            # Copy so later in-place column writes do not target a slice.
            self.data = self.data.loc[condition].copy()
        else:
            self.data = self.data.filter(condition)
        return self

    def filter_columns(self, columns: List[str]):
        """Keep only the listed columns.

        Returns:
            ``self``, to allow chaining.
        """
        if self._is_pandas():
            # Copy so later in-place column writes do not target a slice.
            self.data = self.data[columns].copy()
        else:
            self.data = self.data.select(columns)
        return self

    def rename_columns(self, mapping: Dict[str, str]):
        """Rename columns using an ``{old_name: new_name}`` mapping.

        Returns:
            ``self``, to allow chaining.
        """
        if self._is_pandas():
            self.data = self.data.rename(columns=mapping)
        else:
            self.data = self.data.rename(mapping)
        return self

    # ------------------------------------------------------------------
    # Outliers
    # ------------------------------------------------------------------

    def detect_outliers(
        self,
        column: str,
        method: str = "iqr"
    ) -> "Union[pd.DataFrame, pl.DataFrame]":
        """Return the rows whose ``column`` value is flagged as an outlier.

        Args:
            column: Name of the column to test.
            method: ``"iqr"`` flags values more than 1.5 interquartile ranges
                below the first or above the third quartile; ``"zscore"``
                flags values whose absolute z-score exceeds 3.

        Returns:
            The matching rows, as a frame of the same backend as the wrapped
            data.

        Raises:
            ValueError: If ``method`` is not ``"iqr"`` or ``"zscore"``.
        """
        if method not in ("iqr", "zscore"):
            raise ValueError("method must be 'iqr' or 'zscore'")

        if self._is_pandas():
            series = self.data[column]
            if method == "iqr":
                q1 = series.quantile(0.25)
                q3 = series.quantile(0.75)
                iqr = q3 - q1
                mask = (series < q1 - 1.5 * iqr) | (series > q3 + 1.5 * iqr)
            else:
                z = (series - series.mean()) / series.std()
                mask = z.abs() > 3
            return self.data[mask]

        # A polars frame cannot be indexed with a pandas mask, so the same
        # rule is expressed as a polars predicate instead.
        value = pl.col(column)
        if method == "iqr":
            # Linear interpolation mirrors the pandas quantile default.
            q1 = value.quantile(0.25, interpolation="linear")
            q3 = value.quantile(0.75, interpolation="linear")
            iqr = q3 - q1
            predicate = (value < q1 - 1.5 * iqr) | (value > q3 + 1.5 * iqr)
        else:
            predicate = ((value - value.mean()) / value.std()).abs() > 3
        return self.data.filter(predicate)

    # ------------------------------------------------------------------
    # Data Quality Report
    # ------------------------------------------------------------------

    def data_quality(self) -> pd.DataFrame:
        """Build a per-column data quality report.

        Returns:
            A pandas DataFrame with one row per column and the fields
            ``column``, ``dtype``, ``nulls``, ``null_pct``, ``unique_values``
            and ``completeness_pct``. The two ``*_pct`` fields are fractions
            between 0 and 1; for a frame without rows they are ``0.0`` and
            ``1.0`` respectively.
        """
        total_rows = self.data.shape[0]
        rows = []

        for col in self.data.columns:
            nulls = self._count_nulls(col)

            if self._is_pandas():
                dtype = str(self.data[col].dtype)
                unique = self.data[col].nunique()
            else:
                dtype = str(self.data.schema[col])
                unique = self.data[col].n_unique()

            null_pct = self._ratio(nulls, total_rows)
            rows.append({
                "column": col,
                "dtype": dtype,
                "nulls": nulls,
                "null_pct": null_pct,
                "unique_values": unique,
                "completeness_pct": 1 - null_pct,
            })

        # Explicit columns keep the schema stable even when no rows are built.
        return pd.DataFrame(
            rows,
            columns=[
                "column",
                "dtype",
                "nulls",
                "null_pct",
                "unique_values",
                "completeness_pct",
            ],
        )
221
+
@@ -1,47 +1,28 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: statslibx
3
- Version: 0.1.6
4
- Summary: Librería de estadística descriptiva e inferencial para Python
5
- Home-page: https://github.com/Immanuel3008/StatsLibX
6
- Author: Emmanuel Ascendra Perez
7
- Author-email: ascendraemmanuel@gmail.com
3
+ Version: 0.1.7
4
+ Summary: StatsLibx - Librería de estadística descriptiva e inferencial
5
+ Author-email: Emmanuel Ascendra Perez <ascendraemmanuel@gmail.com>
6
+ License: MIT
8
7
  Classifier: Development Status :: 3 - Alpha
9
8
  Classifier: Intended Audience :: Science/Research
10
9
  Classifier: Topic :: Scientific/Engineering :: Mathematics
11
10
  Classifier: License :: OSI Approved :: MIT License
12
11
  Classifier: Programming Language :: Python :: 3
13
- Classifier: Programming Language :: Python :: 3.8
14
12
  Classifier: Programming Language :: Python :: 3.9
15
13
  Classifier: Programming Language :: Python :: 3.10
16
14
  Classifier: Programming Language :: Python :: 3.11
17
15
  Classifier: Programming Language :: Python :: 3.12
18
16
  Requires-Python: >=3.8
19
17
  Description-Content-Type: text/markdown
20
- Requires-Dist: numpy>=1.20.0
21
- Requires-Dist: pandas>=1.3.0
22
- Requires-Dist: scipy>=1.7.0
23
- Requires-Dist: matplotlib>=3.4.0
18
+ Requires-Dist: pandas>=1.5
19
+ Requires-Dist: polars>=0.20
24
20
  Provides-Extra: viz
25
- Requires-Dist: seaborn>=0.11.0; extra == "viz"
26
- Requires-Dist: plotly>=5.0.0; extra == "viz"
21
+ Requires-Dist: seaborn>=0.11; extra == "viz"
22
+ Requires-Dist: plotly>=5.0; extra == "viz"
27
23
  Provides-Extra: advanced
28
- Requires-Dist: scikit-learn>=1.0.0; extra == "advanced"
29
- Requires-Dist: statsmodels>=0.13.0; extra == "advanced"
30
- Provides-Extra: all
31
- Requires-Dist: seaborn>=0.11.0; extra == "all"
32
- Requires-Dist: plotly>=5.0.0; extra == "all"
33
- Requires-Dist: scikit-learn>=1.0.0; extra == "all"
34
- Requires-Dist: statsmodels>=0.13.0; extra == "all"
35
- Dynamic: author
36
- Dynamic: author-email
37
- Dynamic: classifier
38
- Dynamic: description
39
- Dynamic: description-content-type
40
- Dynamic: home-page
41
- Dynamic: provides-extra
42
- Dynamic: requires-dist
43
- Dynamic: requires-python
44
- Dynamic: summary
24
+ Requires-Dist: scikit-learn>=1.0; extra == "advanced"
25
+ Requires-Dist: statsmodels>=0.13; extra == "advanced"
45
26
 
46
27
  # 📦 Descripción para PyPI (Plantilla Profesional)
47
28
 
@@ -0,0 +1,18 @@
1
+ statslibx/__init__.py,sha256=vXAOPdog5n_b64FRybiWI4VNA_eou7eQuZBcQiQz79E,1297
2
+ statslibx/cli.py,sha256=DqXaoP85n9xgLDlFnEkeqj-HJG0_IKX0uSqxRcHbzII,1122
3
+ statslibx/descriptive.py,sha256=UTb104Gho0uNeSALlukgrYwXrGMDwmIEy39-yvHuy8M,60184
4
+ statslibx/inferential.py,sha256=0lpVAp2SiKDgWkH3z3JoVFAjMaXW2VboxtA2vwPwq04,49947
5
+ statslibx/io.py,sha256=v7pxpmlEMeKyfXftl3WbkUtC9FOh1pymz7MmKPPNw98,493
6
+ statslibx/utils.py,sha256=qDqF_XgvEJbdQURA2v0gF0sw0nNQR4-MFXDvVTl_00s,68480
7
+ statslibx/datasets/__init__.py,sha256=HlOjJFalKVAycJEi7_J_OB7ss8jgSWpPQnsHTynt0uo,2273
8
+ statslibx/datasets/course_completion.csv,sha256=jaqyxAh4YCsYuH5OFsjvGV7KUyM_7vQt6LgnqnNAFsI,22422135
9
+ statslibx/datasets/iris.csv,sha256=xSdC5QMVqZ-Vajg_rt91dVUmdfZAnvD5pHB23QhHmTA,3858
10
+ statslibx/datasets/penguins.csv,sha256=4HY2vYr3QmAJnqL4Z44uq7813vV5lAzHb2cGHuFsBsE,13478
11
+ statslibx/datasets/sp500_companies.csv,sha256=WKS72YOGnAbyLR6kD95fOpIYZt5oXGjPryyFVqLRF_k,803820
12
+ statslibx/datasets/titanic.csv,sha256=5seOS8ybyBMBCCWhgKZrsbu06m_OWyKtD9l0YXOImXU,29474
13
+ statslibx/preprocessing/__init__.py,sha256=B6qI_KuqWf0FFnLLFafIaPOIM9ABo73InKCscSypdqI,7107
14
+ statslibx-0.1.7.dist-info/METADATA,sha256=GN3chKZ7qSdoAKeD54rCxiwRoWk0wiFpLxHmxtc6Skc,2321
15
+ statslibx-0.1.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
16
+ statslibx-0.1.7.dist-info/entry_points.txt,sha256=bkCY7JDWNCZFE3I4sjgJ2oGrUgoBBbCbYmWkBAymT70,49
17
+ statslibx-0.1.7.dist-info/top_level.txt,sha256=eeYZXyFm0hIjuI0ba3wF6XW938Mv9tv7Nk9qgjYfCtU,10
18
+ statslibx-0.1.7.dist-info/RECORD,,
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ statslibx = statslibx.cli:main
@@ -1,14 +0,0 @@
1
- statslibx/__init__.py,sha256=gA9uNJ7Th8mJunugVps8UWgBNJtMeo_mHqU-QSkEXQE,1173
2
- statslibx/descriptive.py,sha256=Hjti-Cs-7-SzrTb0k4s92c4nasLthVwhYU75GS56LAc,40124
3
- statslibx/inferential.py,sha256=0lpVAp2SiKDgWkH3z3JoVFAjMaXW2VboxtA2vwPwq04,49947
4
- statslibx/utils.py,sha256=qDqF_XgvEJbdQURA2v0gF0sw0nNQR4-MFXDvVTl_00s,68480
5
- statslibx/datasets/__init__.py,sha256=wQ4p8hXIhJqV-msWzTvvnbv-l7jyWz5Rn3JZyMSYJ44,452
6
- statslibx/datasets/course_completion.csv,sha256=jaqyxAh4YCsYuH5OFsjvGV7KUyM_7vQt6LgnqnNAFsI,22422135
7
- statslibx/datasets/iris.csv,sha256=xSdC5QMVqZ-Vajg_rt91dVUmdfZAnvD5pHB23QhHmTA,3858
8
- statslibx/datasets/penguins.csv,sha256=4HY2vYr3QmAJnqL4Z44uq7813vV5lAzHb2cGHuFsBsE,13478
9
- statslibx/datasets/sp500_companies.csv,sha256=WKS72YOGnAbyLR6kD95fOpIYZt5oXGjPryyFVqLRF_k,803820
10
- statslibx/datasets/titanic.csv,sha256=5seOS8ybyBMBCCWhgKZrsbu06m_OWyKtD9l0YXOImXU,29474
11
- statslibx-0.1.6.dist-info/METADATA,sha256=7djbcDCGKwPIIjMnF3hjrsjpgeJFUYxEO9zrVTayUj0,2943
12
- statslibx-0.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
13
- statslibx-0.1.6.dist-info/top_level.txt,sha256=eeYZXyFm0hIjuI0ba3wF6XW938Mv9tv7Nk9qgjYfCtU,10
14
- statslibx-0.1.6.dist-info/RECORD,,