statslibx-0.1.7-py3-none-any.whl → statslibx-0.1.8-py3-none-any.whl
- statslibx/__init__.py +12 -8
- statslibx/computacional.py +2 -0
- statslibx/datasets/__init__.py +227 -54
- statslibx/descriptive.py +8 -9
- statslibx/inferential.py +746 -307
- statslibx/preprocessing/__init__.py +12 -5
- statslibx/probability.py +2 -0
- statslibx/utils.py +112 -150
- {statslibx-0.1.7.dist-info → statslibx-0.1.8.dist-info}/METADATA +17 -3
- statslibx-0.1.8.dist-info/RECORD +15 -0
- statslibx/datasets/course_completion.csv +0 -100001
- statslibx/datasets/iris.csv +0 -151
- statslibx/datasets/penguins.csv +0 -345
- statslibx/datasets/sp500_companies.csv +0 -504
- statslibx/datasets/titanic.csv +0 -419
- statslibx-0.1.7.dist-info/RECORD +0 -18
- {statslibx-0.1.7.dist-info → statslibx-0.1.8.dist-info}/WHEEL +0 -0
- {statslibx-0.1.7.dist-info → statslibx-0.1.8.dist-info}/entry_points.txt +0 -0
- {statslibx-0.1.7.dist-info → statslibx-0.1.8.dist-info}/top_level.txt +0 -0
statslibx/__init__.py
CHANGED
@@ -1,30 +1,32 @@
 """
 StatsLibx - Librería de Estadística para Python
 Autor: Emmanuel Ascendra
-Versión: 0.1.7
+Versión: 0.1.8
 """
 
-__version__ = "0.1.7"
+__version__ = "0.1.8"
 __author__ = "Emmanuel Ascendra"
 
 # Importar las clases principales
 from .descriptive import DescriptiveStats, DescriptiveSummary
 from .inferential import InferentialStats, TestResult
+from .probability import ProbabilityStats
+from .computacional import ComputacionalStats
 from .utils import UtilsStats
 from .preprocessing import Preprocessing
-from .datasets import load_dataset
+from .datasets import load_dataset, generate_dataset
 
 # Definir qué se expone cuando se hace: from statslib import *
 __all__ = [
     # Clases principales
     'DescriptiveStats',
-    'InferentialStats',
-    '
-    '
-    'TestResult',
+    'InferentialStats',
+    'ProbabilityStats',
+    'ComputacionalStats',
     'UtilsStats',
     'Preprocessing',
-    'load_dataset'
+    'load_dataset',
+    'generate_dataset'
 ]
 
 # Mensaje de bienvenida (opcional)
@@ -37,5 +39,7 @@ def welcome():
     print(f" - DescriptiveStats: Estadística descriptiva")
     print(f" - InferentialStats: Estadística inferencial")
     print(f" - UtilsStats: Utilidades Extras")
+    print(f"\nMódulos disponibles:")
+    print(f" - Datasets: Carga de Datasets")
     print(f" - Preprocessing: Preprocesamiento de datos")
     print(f"\nPara más información: help(statslibx)")
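
For orientation, this is how the changed import surface reads from the caller's side; a minimal sketch using only the names exported above, with nothing assumed beyond this diff:

# Sketch of the 0.1.8 top-level API implied by the __init__.py changes above.
import statslibx

print(statslibx.__version__)   # "0.1.8"

from statslibx import (
    DescriptiveStats,
    InferentialStats,
    ProbabilityStats,      # newly exported in 0.1.8
    ComputacionalStats,    # newly exported in 0.1.8
    load_dataset,
    generate_dataset,      # newly exported in 0.1.8
)

Note that TestResult is still imported from statslibx.inferential at the top of the module, but it is no longer listed in __all__, so "from statslibx import *" will not bring it in.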
statslibx/datasets/__init__.py
CHANGED
@@ -1,71 +1,244 @@
-from typing import Optional, Union, Literal, List
-import polars as pl
-import pandas as pd
-import pkgutil
+from typing import Optional, Union, Literal, List, Tuple
 import io
+import pkgutil
+
+import pandas as pd
+import polars as pl
+import numpy as np
+from numpy.typing import NDArray
+
+
+_SUPPORTED_BACKENDS = ("pandas", "polars")
+
+
+def _validate_columns(
+    df: Union[pd.DataFrame, pl.DataFrame],
+    X_columns: List[str],
+    y_column: str
+) -> None:
+    columns = set(df.columns)
+    missing = set(X_columns + [y_column]) - columns
+    if missing:
+        raise ValueError(f"Columnas no encontradas en el dataset: {missing}")
+
+
+def _X_y(
+    df: Union[pd.DataFrame, pl.DataFrame],
+    X_columns: List[str],
+    y_column: str
+) -> Tuple[NDArray, NDArray]:
+    """
+    Extrae X e y como arrays numpy desde pandas o polars.
+    """
+    _validate_columns(df, X_columns, y_column)
+
+    if isinstance(df, pd.DataFrame):
+        X = df[X_columns].to_numpy()
+        y = df[y_column].to_numpy().ravel()
+        return X, y
+
+    elif isinstance(df, pl.DataFrame):
+        X = df.select(X_columns).to_numpy()
+        y = df.select(y_column).to_numpy().ravel()
+        return X, y
+
+    else:
+        raise TypeError(
+            "Backend no soportado. Use pandas.DataFrame o polars.DataFrame."
+        )
+
 
 def load_dataset(
-
-
-
-
-
+    name: str,
+    backend: Literal["pandas", "polars"] = "pandas",
+    return_X_y: Optional[Tuple[List[str], str]] = None
+) -> Union[pd.DataFrame, pl.DataFrame, Tuple[NDArray, NDArray]]:
+    """
+    Carga un dataset interno del paquete.
+
+    Datasets disponibles:
     - iris.csv
     - penguins.csv
     - sp500_companies.csv
     - titanic.csv
     - course_completion.csv
+
+    Parámetros
+    ----------
+    name : str
+        Nombre del archivo CSV.
+    backend : {'pandas', 'polars'}, default='pandas'
+        Backend de DataFrame a utilizar.
+    return_X_y : tuple[list[str], str], optional
+        Si se especifica, devuelve (X, y) como arrays numpy,
+
+    Retorna
+    -------
+    DataFrame o (X, y)
     """
-
-    if data_bytes is None:
-        raise FileNotFoundError(f"Dataset '{name}' no encontrado.")
-
-    if backend == "pandas":
-        return pd.read_csv(io.BytesIO(data_bytes))
-    elif backend == "polars":
-        return pl.read_csv(io.BytesIO(data_bytes))
-    else:
+    if backend not in _SUPPORTED_BACKENDS:
         raise ValueError(
-            "Backend no soportado.
+            f"Backend '{backend}' no soportado. "
+            f"Use uno de {_SUPPORTED_BACKENDS}."
         )
-
-
-
-
-
-
-
-
-
-
-    if backend == "pandas":
-        return pd.read_csv(io.BytesIO(data_bytes))
-    elif backend == "polars":
-        raise ValueError(
-            "Backend no soportado aun. Use 'pandas'."
+
+    # ---------- 1️⃣ Intentar cargar desde el paquete ----------
+    data_bytes = pkgutil.get_data("statslibx.datasets", name)
+
+    if data_bytes is not None:
+        df = (
+            pd.read_csv(io.BytesIO(data_bytes))
+            if backend == "pandas"
+            else pl.read_csv(io.BytesIO(data_bytes))
         )
+
+    # ---------- 2️⃣ Si no está en el paquete, buscar en ruta actual ----------
     else:
-
-
-
-
+        try:
+            df = (
+                pd.read_csv(name)
+                if backend == "pandas"
+                else pl.read_csv(name)
+            )
+        except FileNotFoundError:
+            raise FileNotFoundError(
+                f"Dataset '{name}' no encontrado "
+                f"ni en statslibx.datasets ni en la ruta actual."
+            )
+
+    # ---------- 3️⃣ Devolver X, y si se solicita ----------
+    if return_X_y is not None:
+        X_columns, y_column = return_X_y
+        return _X_y(df, X_columns, y_column)
+
+    return df
+
+# =========================
+# Datasets específicos
+# =========================
+
+def load_iris(
+    backend: Literal["pandas", "polars"] = "pandas",
+    return_X_y: Optional[Tuple[List[str], str]] = None
+):
+    return load_dataset(
+        "iris.csv",
+        backend=backend,
+        return_X_y=return_X_y
+    )
+
+
 def load_penguins(
-
-
-
-
-
-
-
+    backend: Literal["pandas", "polars"] = "pandas",
+    return_X_y: Optional[Tuple[List[str], str]] = None
+):
+    return load_dataset(
+        "penguins.csv",
+        backend=backend,
+        return_X_y=return_X_y
+    )
+
+
+from typing import Optional
+
+def generate_dataset(n_rows, schema, seed=None, save: Optional[bool] = False, filename: Optional[str] = None):
+    if seed is not None:
+        if not isinstance(seed, int):
+            raise TypeError("seed debe ser un entero o None")
+        np.random.seed(seed)
+    else:
+        np.random.seed(42)
+
+    if not isinstance(schema, dict):
+        raise TypeError("schema debe ser un diccionario")
 
-
-
-
-
-
-
+
+
+    data = {}
+
+    for col, config in schema.items():
+        if "dist" not in config:
+            raise ValueError(f"La columna '{col}' no tiene 'dist' definido")
+
+        dist = config["dist"]
+        dtype = config.get("type", "float")
+        nround = config.get("round", 0)
+
+        # ---------- DISTRIBUCIONES ----------
+        if dist == "normal":
+            values = np.random.normal(
+                loc=config.get("mean", 0),
+                scale=config.get("std", 1),
+                size=n_rows
+            )
+
+        elif dist == "uniform":
+            values = np.random.uniform(
+                low=config.get("low", 0),
+                high=config.get("high", 1),
+                size=n_rows
+            )
+
+        elif dist == "exponential":
+            values = np.random.exponential(
+                scale=config.get("scale", 1),
+                size=n_rows
+            )
+
+        elif dist == "lognormal":
+            values = np.random.lognormal(
+                mean=config.get("mean", 0),
+                sigma=config.get("std", 1),
+                size=n_rows
+            )
+
+        elif dist == "poisson":
+            values = np.random.poisson(
+                lam=config.get("lam", 1),
+                size=n_rows
+            )
+
+        elif dist == "binomial":
+            values = np.random.binomial(
+                n=config.get("n", 1),
+                p=config.get("p", 0.5),
+                size=n_rows
+            )
+
+        elif dist == "categorical":
+            if "choices" not in config:
+                raise ValueError(f"'choices' es requerido para categorical ({col})")
+            values = np.random.choice(
+                config["choices"],
+                size=n_rows
+            )
+            data[col] = values
+            continue
+
+        else:
+            raise ValueError(f"Distribución no soportada: {dist}")
+
+        # ---------- CASTEO DE TIPO ----------
+        if dtype == "int":
+            values = np.round(values).astype(int)
+        elif dtype == "float":
+            values = values.astype(float)
+        else:
+            raise ValueError(f"Tipo no soportado: {dtype}")
+
+        # ---------- REDONDEO ----------
+        if nround > 0:
+            values = np.round(values, nround)
+        else:
+            values = np.round(values, 2)
+
+        data[col] = values
+
+    if save and filename:
+        df = pd.DataFrame(data)
+        df.to_csv(f"{filename}.csv", index=False)
     else:
-
-
-    )
+        df = pd.DataFrame(data)
+        df.to_csv("dataset.csv", index=False)
 
+    return pd.DataFrame(data)
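
To make the new dataset API concrete, a short usage sketch based only on the signatures and schema keys visible in the hunk above. The iris feature and target column names are assumptions about the bundled CSV, not something this diff shows:

from statslibx.datasets import load_dataset, generate_dataset

# Plain DataFrame load (pandas backend is the default).
iris_df = load_dataset("iris.csv")

# (X, y) form: return_X_y takes (feature_columns, target_column).
# These column names are assumed for illustration only.
X, y = load_dataset(
    "iris.csv",
    backend="pandas",
    return_X_y=(["sepal_length", "sepal_width"], "species"),
)

# Synthetic data: one config dict per column, keyed by "dist",
# using only distributions handled in the code above.
df = generate_dataset(
    n_rows=1_000,
    schema={
        "age":    {"dist": "normal", "mean": 35, "std": 8, "type": "int"},
        "income": {"dist": "lognormal", "mean": 10, "std": 0.5, "round": 2},
        "group":  {"dist": "categorical", "choices": ["A", "B", "C"]},
    },
    seed=123,
)

Two caveats follow directly from this diff: generate_dataset writes a CSV in both branches ("{filename}.csv" when save and filename are given, "dataset.csv" otherwise) before returning the DataFrame, and the bundled CSVs listed in the docstring appear to be removed from the 0.1.8 wheel (see the file list at the top), so pkgutil.get_data may return None and load_dataset would then fall back to reading the name as a local path.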
statslibx/descriptive.py
CHANGED
@@ -13,7 +13,6 @@ import plotly.express as px
 
 class DescriptiveStats:
     """
-    Clase para estadística descriptiva univariada y multivariada
     Class for univariate and multivariate descriptive statistics
     """
 
@@ -23,7 +22,7 @@ class DescriptiveStats:
                  thousand: str = None,
                  backend: Literal['pandas', 'polars'] = 'pandas'):
         """
-        #
+        # Initialize DataFrame
 
         ## **Parameters:**
 
@@ -54,7 +53,8 @@ class DescriptiveStats:
             if data.ndim == 1:
                 data = pd.DataFrame({'var': data})
             else:
-                data = pd.DataFrame(data, columns=[f'var_{i}' for i in range(data.shape[1])])
+                data = pd.DataFrame(data, columns=[f'var_{i}' for i in range(data.shape[1])]) \
+                    if isinstance(data, pd.DataFrame) else pl.DataFrame(data, )
 
         self.data = data
         self.backend = backend
@@ -394,7 +394,7 @@ class DescriptiveStats:
 
 
 
-    def help(self, lang="es-
+    def help(self, lang="es-ES"):
         """
         Muestra ayuda completa de la clase DescriptiveStats
 
@@ -407,7 +407,8 @@ class DescriptiveStats:
         if lang in ["en-US", "English", "english"]:
             lang = "en-US"
         else:
-            lang = ""
+            lang = "es-ES"
+        help_text = " "
 
         match lang:
             case "es-ES":
@@ -614,10 +615,9 @@ class DescriptiveStats:
 ╚════════════════════════════════════════════════════════════════════════════╝
 """
             case "en-US":
-                # --- Falta por traducir
                 help_text = """
 ╔════════════════════════════════════════════════════════════════════════════╗
-║ 📊
+║ 📊 CLASS DescriptiveStats - COMPLETE HELP ║
 ╚════════════════════════════════════════════════════════════════════════════╝
 
 📝 DESCRIPTION:
@@ -821,8 +821,7 @@ class DescriptiveStats:
 
         print(help_text)
 
-
-
+
 class DescriptiveSummary:
     """Clase para formatear salida de estadística descriptiva"""
 
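
A minimal sketch of the bilingual help call after this change, assuming the constructor accepts the data as its first positional argument (its full signature is not shown in these hunks; the backend keyword is taken from the hunk above):

import pandas as pd
from statslibx import DescriptiveStats

df = pd.DataFrame({"x": [1.2, 3.4, 5.6]})

stats = DescriptiveStats(df, backend="pandas")  # first argument assumed to be the data
stats.help(lang="en-US")   # English help text, now filled in above
stats.help()               # any other value, including the default, falls back to "es-ES"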