statslibx 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- statslibx/__init__.py +3 -0
- statslibx/cli.py +47 -0
- statslibx/datasets/__init__.py +57 -2
- statslibx/descriptive.py +500 -157
- statslibx/io.py +21 -0
- statslibx/preprocessing/__init__.py +221 -0
- {statslibx-0.1.6.dist-info → statslibx-0.1.7.dist-info}/METADATA +10 -29
- statslibx-0.1.7.dist-info/RECORD +18 -0
- statslibx-0.1.7.dist-info/entry_points.txt +2 -0
- statslibx-0.1.6.dist-info/RECORD +0 -14
- {statslibx-0.1.6.dist-info → statslibx-0.1.7.dist-info}/WHEEL +0 -0
- {statslibx-0.1.6.dist-info → statslibx-0.1.7.dist-info}/top_level.txt +0 -0
statslibx/io.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import polars as pl
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def load_file(path: str):
    """Load a tabular data file into a pandas DataFrame.

    The reader is chosen from the file extension. The extension is compared
    case-insensitively, so ``DATA.CSV`` is handled exactly like ``data.csv``:

    * ``.csv``           -> ``pd.read_csv``
    * ``.json``          -> ``pd.read_json``
    * ``.txt`` / ``.tsv`` -> ``pd.read_csv`` with a tab separator

    Args:
        path: Location of the file; it is converted with ``pathlib.Path``.

    Returns:
        The DataFrame produced by the matching pandas reader.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the extension is not one of the supported types.
    """
    path = Path(path)

    if not path.exists():
        raise FileNotFoundError(f"{path} not found")

    # Normalise only for the comparison; the error message below still
    # reports the suffix exactly as the caller wrote it.
    suffix = path.suffix.lower()

    if suffix == ".csv":
        return pd.read_csv(path)

    if suffix == ".json":
        return pd.read_json(path)

    if suffix in {".txt", ".tsv"}:
        return pd.read_csv(path, sep="\t")

    raise ValueError(f"Unsupported file type: {path.suffix}")
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
from typing import Optional, Union, List, Dict, Any
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import polars as pl
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Preprocessing:
    """Fluent preprocessing helper around a pandas or polars DataFrame.

    The wrapped frame is stored in ``self.data``. Inspection, description
    and outlier methods return a result and leave ``self.data`` untouched.
    Transformation and filtering methods update ``self.data`` and return
    ``self`` so that calls can be chained.
    """

    def __init__(self, data: "Union[pd.DataFrame, pl.DataFrame]"):
        """Wrap ``data`` for preprocessing.

        Args:
            data: A pandas or polars DataFrame.

        Raises:
            TypeError: If ``data`` is neither a pandas nor a polars DataFrame.
        """
        if isinstance(data, pd.DataFrame):
            # The pandas branches below assign columns in place. Working on
            # a private copy keeps those assignments from leaking into the
            # caller's frame, matching the polars branches, which always
            # build a new frame.
            self.data = data.copy()
        elif isinstance(data, pl.DataFrame):
            self.data = data
        else:
            raise TypeError("data must be a pandas or polars DataFrame")

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _is_pandas(self) -> bool:
        """Return True when the wrapped frame is a pandas DataFrame."""
        return isinstance(self.data, pd.DataFrame)

    def _is_polars(self) -> bool:
        """Return True when the wrapped frame is a polars DataFrame."""
        return isinstance(self.data, pl.DataFrame)

    def _count_nulls(self, column: str) -> int:
        """Return the number of missing values in ``column`` as a plain int."""
        if self._is_pandas():
            return int(self.data[column].isna().sum())
        return int(self.data[column].null_count())

    def _get_columns(self, columns):
        """Normalise a column selection to a list of column names.

        ``None`` means every column, a single string becomes a one-element
        list, and anything else is returned unchanged.
        """
        if columns is None:
            return list(self.data.columns)
        if isinstance(columns, str):
            return [columns]
        return columns

    @staticmethod
    def _ratio(part: int, total: int) -> float:
        """Return ``part / total``, or 0.0 when ``total`` is zero.

        An empty frame has no rows to be null, so its null share is
        reported as 0.0 instead of raising ``ZeroDivisionError``.
        """
        return part / total if total else 0.0

    # ------------------------------------------------------------------
    # Inspection
    # ------------------------------------------------------------------

    def detect_nulls(
        self,
        columns: Optional[Union[str, List[str]]] = None
    ) -> pd.DataFrame:
        """Summarise missing values per column.

        Args:
            columns: A column name, a list of names, or ``None`` for all
                columns.

        Returns:
            A pandas DataFrame with one row per column and the fields
            ``column``, ``nulls``, ``non_nulls`` and ``null_pct``.
            ``null_pct`` is a fraction (nulls / rows), 0.0 for an empty
            frame.
        """
        columns = self._get_columns(columns)
        total = self.data.shape[0]

        rows = []
        for col in columns:
            nulls = self._count_nulls(col)
            rows.append({
                "column": col,
                "nulls": nulls,
                "non_nulls": total - nulls,
                "null_pct": self._ratio(nulls, total)
            })

        return pd.DataFrame(rows)

    def check_uniqueness(self) -> pd.DataFrame:
        """Count distinct values per column.

        Returns:
            A pandas DataFrame with the fields ``column`` and
            ``unique_values``.

        NOTE(review): pandas ``nunique()`` skips missing values by default,
        while polars ``n_unique()`` appears to count null as a value, so the
        two backends may differ by one on columns with nulls -- confirm.
        """
        if self._is_pandas():
            unique = self.data.nunique()
            return pd.DataFrame({
                "column": unique.index,
                "unique_values": unique.values
            })

        # One-row frame of counts, reshaped to one row per column.
        unique = self.data.select(pl.all().n_unique())
        return unique.to_pandas().melt(
            var_name="column",
            value_name="unique_values"
        )

    def preview_data(self, n: int = 5):
        """Return the first ``n`` rows of the wrapped frame."""
        return self.data.head(n)

    # ------------------------------------------------------------------
    # Description
    # ------------------------------------------------------------------

    def describe_numeric(self):
        """Return ``describe()`` statistics for the numeric columns only."""
        if self._is_pandas():
            return self.data.select_dtypes(include=np.number).describe()

        # Column selection by dtype needs a selector; ``Expr.filter`` would
        # filter rows, not columns. Imported locally because it is a
        # submodule of the already-imported polars package.
        import polars.selectors as cs

        return self.data.select(cs.numeric()).describe()

    def describe_categorical(self):
        """Return ``describe()`` statistics for the string columns only.

        pandas columns are selected by ``object`` dtype, polars columns by
        the ``Utf8`` dtype.
        """
        if self._is_pandas():
            return self.data.select_dtypes(include="object").describe()

        # ``pl.col(<dtype>)`` selects every column of that dtype.
        return self.data.select(pl.col(pl.Utf8)).describe()

    # ------------------------------------------------------------------
    # Transformations
    # ------------------------------------------------------------------

    def fill_nulls(
        self,
        fill_with: Any,
        columns: Optional[Union[str, List[str]]] = None
    ):
        """Replace missing values with ``fill_with``.

        Args:
            fill_with: Replacement value.
            columns: A column name, a list of names, or ``None`` for all
                columns.

        Returns:
            ``self``, to allow chaining.
        """
        columns = self._get_columns(columns)

        if self._is_pandas():
            self.data[columns] = self.data[columns].fillna(fill_with)

        else:
            self.data = self.data.with_columns([
                pl.col(col).fill_null(fill_with) for col in columns
            ])

        return self

    def normalize(self, column: str):
        """Min-max scale ``column``: ``(x - min) / (max - min)``.

        Returns:
            ``self``, to allow chaining.
        """
        if self._is_pandas():
            col = self.data[column]
            self.data[column] = (col - col.min()) / (col.max() - col.min())
        else:
            self.data = self.data.with_columns(
                ((pl.col(column) - pl.col(column).min()) /
                 (pl.col(column).max() - pl.col(column).min()))
                .alias(column)
            )
        return self

    def standardize(self, column: str):
        """Z-score scale ``column``: ``(x - mean) / std``.

        Returns:
            ``self``, to allow chaining.
        """
        if self._is_pandas():
            col = self.data[column]
            self.data[column] = (col - col.mean()) / col.std()
        else:
            self.data = self.data.with_columns(
                ((pl.col(column) - pl.col(column).mean()) /
                 pl.col(column).std())
                .alias(column)
            )
        return self

    # ------------------------------------------------------------------
    # Filtering
    # ------------------------------------------------------------------

    def filter_rows(self, condition):
        """Keep only the rows selected by ``condition``.

        Args:
            condition: Passed to ``DataFrame.loc`` for pandas frames and to
                ``DataFrame.filter`` for polars frames.

        Returns:
            ``self``, to allow chaining.
        """
        if self._is_pandas():
            # Copy so that a later in-place column assignment (normalize,
            # standardize, fill_nulls) works on an independent frame and
            # does not trigger pandas' chained-assignment warning.
            self.data = self.data.loc[condition].copy()
        else:
            self.data = self.data.filter(condition)
        return self

    def filter_columns(self, columns: List[str]):
        """Keep only the listed ``columns``.

        Returns:
            ``self``, to allow chaining.
        """
        if self._is_pandas():
            # Copy for the same reason as in ``filter_rows``.
            self.data = self.data[columns].copy()
        else:
            self.data = self.data.select(columns)
        return self

    def rename_columns(self, mapping: Dict[str, str]):
        """Rename columns according to ``mapping`` (old name -> new name).

        Returns:
            ``self``, to allow chaining.
        """
        if self._is_pandas():
            self.data = self.data.rename(columns=mapping)
        else:
            self.data = self.data.rename(mapping)
        return self

    # ------------------------------------------------------------------
    # Outliers
    # ------------------------------------------------------------------

    def detect_outliers(
        self,
        column: str,
        method: str = "iqr"
    ) -> pd.DataFrame:
        """Return the rows whose ``column`` value is an outlier.

        Args:
            column: Column to examine.
            method: ``"iqr"`` flags values below ``q1 - 1.5 * iqr`` or above
                ``q3 + 1.5 * iqr``; ``"zscore"`` flags values whose absolute
                z-score exceeds 3.

        Returns:
            A pandas DataFrame holding the outlier rows. A polars frame is
            converted to pandas for the result; ``self.data`` is not
            modified.

        Raises:
            ValueError: If ``method`` is neither ``"iqr"`` nor ``"zscore"``.
        """
        # The mask is a pandas boolean Series, so the frame it is applied to
        # must be pandas as well; a polars frame cannot be indexed with it.
        if self._is_pandas():
            frame = self.data
        else:
            frame = self.data.to_pandas()
        series = frame[column]

        if method == "iqr":
            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            iqr = q3 - q1
            mask = (series < q1 - 1.5 * iqr) | (series > q3 + 1.5 * iqr)

        elif method == "zscore":
            z = (series - series.mean()) / series.std()
            mask = z.abs() > 3

        else:
            raise ValueError("method must be 'iqr' or 'zscore'")

        return frame[mask]

    # ------------------------------------------------------------------
    # Data Quality Report
    # ------------------------------------------------------------------

    def data_quality(self) -> pd.DataFrame:
        """Build a per-column data quality report.

        Returns:
            A pandas DataFrame with one row per column and the fields
            ``column``, ``dtype``, ``nulls``, ``null_pct``,
            ``unique_values`` and ``completeness_pct``. Both percentages are
            fractions; for an empty frame ``null_pct`` is 0.0 and
            ``completeness_pct`` is 1.0.
        """
        total_rows = self.data.shape[0]
        rows = []

        for col in self.data.columns:
            nulls = self._count_nulls(col)
            null_pct = self._ratio(nulls, total_rows)

            if self._is_pandas():
                dtype = str(self.data[col].dtype)
                unique = self.data[col].nunique()
            else:
                dtype = str(self.data.schema[col])
                unique = self.data[col].n_unique()

            rows.append({
                "column": col,
                "dtype": dtype,
                "nulls": nulls,
                "null_pct": null_pct,
                "unique_values": unique,
                "completeness_pct": 1 - null_pct
            })

        return pd.DataFrame(rows)
|
|
221
|
+
|
|
@@ -1,47 +1,28 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: statslibx
|
|
3
|
-
Version: 0.1.6
|
|
4
|
-
Summary: Librería de estadística descriptiva e inferencial
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
Author-email: ascendraemmanuel@gmail.com
|
|
3
|
+
Version: 0.1.7
|
|
4
|
+
Summary: StatsLibx - Librería de estadística descriptiva e inferencial
|
|
5
|
+
Author-email: Emmanuel Ascendra Perez <ascendraemmanuel@gmail.com>
|
|
6
|
+
License: MIT
|
|
8
7
|
Classifier: Development Status :: 3 - Alpha
|
|
9
8
|
Classifier: Intended Audience :: Science/Research
|
|
10
9
|
Classifier: Topic :: Scientific/Engineering :: Mathematics
|
|
11
10
|
Classifier: License :: OSI Approved :: MIT License
|
|
12
11
|
Classifier: Programming Language :: Python :: 3
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
14
12
|
Classifier: Programming Language :: Python :: 3.9
|
|
15
13
|
Classifier: Programming Language :: Python :: 3.10
|
|
16
14
|
Classifier: Programming Language :: Python :: 3.11
|
|
17
15
|
Classifier: Programming Language :: Python :: 3.12
|
|
18
16
|
Requires-Python: >=3.8
|
|
19
17
|
Description-Content-Type: text/markdown
|
|
20
|
-
Requires-Dist:
|
|
21
|
-
Requires-Dist:
|
|
22
|
-
Requires-Dist: scipy>=1.7.0
|
|
23
|
-
Requires-Dist: matplotlib>=3.4.0
|
|
18
|
+
Requires-Dist: pandas>=1.5
|
|
19
|
+
Requires-Dist: polars>=0.20
|
|
24
20
|
Provides-Extra: viz
|
|
25
|
-
Requires-Dist: seaborn>=0.11
|
|
26
|
-
Requires-Dist: plotly>=5.0
|
|
21
|
+
Requires-Dist: seaborn>=0.11; extra == "viz"
|
|
22
|
+
Requires-Dist: plotly>=5.0; extra == "viz"
|
|
27
23
|
Provides-Extra: advanced
|
|
28
|
-
Requires-Dist: scikit-learn>=1.0
|
|
29
|
-
Requires-Dist: statsmodels>=0.13
|
|
30
|
-
Provides-Extra: all
|
|
31
|
-
Requires-Dist: seaborn>=0.11.0; extra == "all"
|
|
32
|
-
Requires-Dist: plotly>=5.0.0; extra == "all"
|
|
33
|
-
Requires-Dist: scikit-learn>=1.0.0; extra == "all"
|
|
34
|
-
Requires-Dist: statsmodels>=0.13.0; extra == "all"
|
|
35
|
-
Dynamic: author
|
|
36
|
-
Dynamic: author-email
|
|
37
|
-
Dynamic: classifier
|
|
38
|
-
Dynamic: description
|
|
39
|
-
Dynamic: description-content-type
|
|
40
|
-
Dynamic: home-page
|
|
41
|
-
Dynamic: provides-extra
|
|
42
|
-
Dynamic: requires-dist
|
|
43
|
-
Dynamic: requires-python
|
|
44
|
-
Dynamic: summary
|
|
24
|
+
Requires-Dist: scikit-learn>=1.0; extra == "advanced"
|
|
25
|
+
Requires-Dist: statsmodels>=0.13; extra == "advanced"
|
|
45
26
|
|
|
46
27
|
# 📦 Descripción para PyPI (Plantilla Profesional)
|
|
47
28
|
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
statslibx/__init__.py,sha256=vXAOPdog5n_b64FRybiWI4VNA_eou7eQuZBcQiQz79E,1297
|
|
2
|
+
statslibx/cli.py,sha256=DqXaoP85n9xgLDlFnEkeqj-HJG0_IKX0uSqxRcHbzII,1122
|
|
3
|
+
statslibx/descriptive.py,sha256=UTb104Gho0uNeSALlukgrYwXrGMDwmIEy39-yvHuy8M,60184
|
|
4
|
+
statslibx/inferential.py,sha256=0lpVAp2SiKDgWkH3z3JoVFAjMaXW2VboxtA2vwPwq04,49947
|
|
5
|
+
statslibx/io.py,sha256=v7pxpmlEMeKyfXftl3WbkUtC9FOh1pymz7MmKPPNw98,493
|
|
6
|
+
statslibx/utils.py,sha256=qDqF_XgvEJbdQURA2v0gF0sw0nNQR4-MFXDvVTl_00s,68480
|
|
7
|
+
statslibx/datasets/__init__.py,sha256=HlOjJFalKVAycJEi7_J_OB7ss8jgSWpPQnsHTynt0uo,2273
|
|
8
|
+
statslibx/datasets/course_completion.csv,sha256=jaqyxAh4YCsYuH5OFsjvGV7KUyM_7vQt6LgnqnNAFsI,22422135
|
|
9
|
+
statslibx/datasets/iris.csv,sha256=xSdC5QMVqZ-Vajg_rt91dVUmdfZAnvD5pHB23QhHmTA,3858
|
|
10
|
+
statslibx/datasets/penguins.csv,sha256=4HY2vYr3QmAJnqL4Z44uq7813vV5lAzHb2cGHuFsBsE,13478
|
|
11
|
+
statslibx/datasets/sp500_companies.csv,sha256=WKS72YOGnAbyLR6kD95fOpIYZt5oXGjPryyFVqLRF_k,803820
|
|
12
|
+
statslibx/datasets/titanic.csv,sha256=5seOS8ybyBMBCCWhgKZrsbu06m_OWyKtD9l0YXOImXU,29474
|
|
13
|
+
statslibx/preprocessing/__init__.py,sha256=B6qI_KuqWf0FFnLLFafIaPOIM9ABo73InKCscSypdqI,7107
|
|
14
|
+
statslibx-0.1.7.dist-info/METADATA,sha256=GN3chKZ7qSdoAKeD54rCxiwRoWk0wiFpLxHmxtc6Skc,2321
|
|
15
|
+
statslibx-0.1.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
16
|
+
statslibx-0.1.7.dist-info/entry_points.txt,sha256=bkCY7JDWNCZFE3I4sjgJ2oGrUgoBBbCbYmWkBAymT70,49
|
|
17
|
+
statslibx-0.1.7.dist-info/top_level.txt,sha256=eeYZXyFm0hIjuI0ba3wF6XW938Mv9tv7Nk9qgjYfCtU,10
|
|
18
|
+
statslibx-0.1.7.dist-info/RECORD,,
|
statslibx-0.1.6.dist-info/RECORD
DELETED
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
statslibx/__init__.py,sha256=gA9uNJ7Th8mJunugVps8UWgBNJtMeo_mHqU-QSkEXQE,1173
|
|
2
|
-
statslibx/descriptive.py,sha256=Hjti-Cs-7-SzrTb0k4s92c4nasLthVwhYU75GS56LAc,40124
|
|
3
|
-
statslibx/inferential.py,sha256=0lpVAp2SiKDgWkH3z3JoVFAjMaXW2VboxtA2vwPwq04,49947
|
|
4
|
-
statslibx/utils.py,sha256=qDqF_XgvEJbdQURA2v0gF0sw0nNQR4-MFXDvVTl_00s,68480
|
|
5
|
-
statslibx/datasets/__init__.py,sha256=wQ4p8hXIhJqV-msWzTvvnbv-l7jyWz5Rn3JZyMSYJ44,452
|
|
6
|
-
statslibx/datasets/course_completion.csv,sha256=jaqyxAh4YCsYuH5OFsjvGV7KUyM_7vQt6LgnqnNAFsI,22422135
|
|
7
|
-
statslibx/datasets/iris.csv,sha256=xSdC5QMVqZ-Vajg_rt91dVUmdfZAnvD5pHB23QhHmTA,3858
|
|
8
|
-
statslibx/datasets/penguins.csv,sha256=4HY2vYr3QmAJnqL4Z44uq7813vV5lAzHb2cGHuFsBsE,13478
|
|
9
|
-
statslibx/datasets/sp500_companies.csv,sha256=WKS72YOGnAbyLR6kD95fOpIYZt5oXGjPryyFVqLRF_k,803820
|
|
10
|
-
statslibx/datasets/titanic.csv,sha256=5seOS8ybyBMBCCWhgKZrsbu06m_OWyKtD9l0YXOImXU,29474
|
|
11
|
-
statslibx-0.1.6.dist-info/METADATA,sha256=7djbcDCGKwPIIjMnF3hjrsjpgeJFUYxEO9zrVTayUj0,2943
|
|
12
|
-
statslibx-0.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
13
|
-
statslibx-0.1.6.dist-info/top_level.txt,sha256=eeYZXyFm0hIjuI0ba3wF6XW938Mv9tv7Nk9qgjYfCtU,10
|
|
14
|
-
statslibx-0.1.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|