statslibx 0.1.7__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- statslibx/__init__.py +12 -8
- statslibx/computacional.py +126 -0
- statslibx/datasets/__init__.py +243 -54
- statslibx/descriptive.py +80 -15
- statslibx/inferential.py +812 -312
- statslibx/preprocessing/__init__.py +12 -5
- statslibx/utils.py +183 -163
- {statslibx-0.1.7.dist-info → statslibx-0.2.0.dist-info}/METADATA +19 -5
- statslibx-0.2.0.dist-info/RECORD +19 -0
- {statslibx-0.1.7.dist-info → statslibx-0.2.0.dist-info}/WHEEL +1 -1
- statslibx-0.1.7.dist-info/RECORD +0 -18
- {statslibx-0.1.7.dist-info → statslibx-0.2.0.dist-info}/entry_points.txt +0 -0
- {statslibx-0.1.7.dist-info → statslibx-0.2.0.dist-info}/top_level.txt +0 -0
statslibx/__init__.py
CHANGED
|
@@ -1,30 +1,31 @@
|
|
|
1
1
|
"""
|
|
2
2
|
StatsLibx - Librería de Estadística para Python
|
|
3
3
|
Autor: Emmanuel Ascendra
|
|
4
|
-
Versión: 0.
|
|
4
|
+
Versión: 0.2.0
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
-
__version__ = "0.
|
|
7
|
+
__version__ = "0.2.0"
|
|
8
8
|
__author__ = "Emmanuel Ascendra"
|
|
9
9
|
|
|
10
10
|
# Importar las clases principales
|
|
11
11
|
from .descriptive import DescriptiveStats, DescriptiveSummary
|
|
12
12
|
from .inferential import InferentialStats, TestResult
|
|
13
|
+
from .computacional import ComputationalStats
|
|
13
14
|
from .utils import UtilsStats
|
|
14
15
|
from .preprocessing import Preprocessing
|
|
15
|
-
from .datasets import load_dataset
|
|
16
|
+
from .datasets import load_dataset, generate_dataset
|
|
16
17
|
|
|
17
18
|
# Definir qué se expone cuando se hace: from statslib import *
|
|
18
19
|
# Public names exported by ``from statslibx import *``.
# Every entry must be bound in this module: a name that was never imported
# makes the star-import raise AttributeError. 'ProbabilityStats' was listed
# here without a matching import, so it is left out.
__all__ = [
    # Main classes
    'DescriptiveStats',
    'InferentialStats',
    'ComputationalStats',
    'UtilsStats',
    'Preprocessing',
    # Dataset helpers
    'load_dataset',
    'generate_dataset',
]
|
|
29
30
|
|
|
30
31
|
# Mensaje de bienvenida (opcional)
|
|
@@ -36,6 +37,9 @@ def welcome():
|
|
|
36
37
|
print(f"\nClases disponibles:")
|
|
37
38
|
print(f" - DescriptiveStats: Estadística descriptiva")
|
|
38
39
|
print(f" - InferentialStats: Estadística inferencial")
|
|
40
|
+
print(f" - ComputacionalStats: En desarrollo")
|
|
39
41
|
print(f" - UtilsStats: Utilidades Extras")
|
|
42
|
+
print(f"\nMódulos disponibles:")
|
|
43
|
+
print(f" - Datasets: Carga de Datasets")
|
|
40
44
|
print(f" - Preprocessing: Preprocesamiento de datos")
|
|
41
45
|
print(f"\nPara más información: help(statslibx)")
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
from typing import Union, Optional, Literal
|
|
2
|
+
import numpy as np
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import polars as pl
|
|
5
|
+
import os
|
|
6
|
+
|
|
7
|
+
class ComputationalStats:
    """
    Class for computational statistics.

    Stores a pandas DataFrame in ``self.data`` and offers simulation helpers
    such as ``monte_carlo``.
    """

    def __init__(self, data: Union[pd.DataFrame, np.ndarray, str],
                 sep: str = None,
                 decimal: str = None,
                 thousand: str = None,
                 backend: Literal['pandas', 'polars'] = 'pandas'):
        """
        # Initialize DataFrame

        ## **Parameters:**

        - **data** : Data to analyze: a pandas DataFrame, a 1-D or 2-D numpy
          array, or the path of an existing file (loaded via ``from_file``)
        - **sep** : Column separator
        - **decimal** : Decimal separator
        - **thousand** : Thousand separator
        - **backend** : 'pandas' or 'polars' for processing
          (the value is only stored; polars DataFrames are rejected for now)

        ## **Raises:**

        - **TypeError** : if ``data`` is a polars DataFrame or any other
          unsupported type
        - **ValueError** : if ``data`` is a numpy array that is not 1-D or 2-D

        **Examples:**

        ``Example 1:
        stats = ComputationalStats(data)
        ``
        """

        if isinstance(data, str) and os.path.exists(data):
            # Forward the parsing options so they are honoured while reading.
            data = ComputationalStats.from_file(
                data, sep=sep, decimal=decimal, thousand=thousand
            ).data

        if isinstance(data, pl.DataFrame):
            raise TypeError(
                "Polars aún no soportado. Use pandas.DataFrame."
            )

        if isinstance(data, np.ndarray):
            if data.ndim == 1:
                data = pd.DataFrame({'var': data})
            elif data.ndim == 2:
                # Always build a pandas frame: the attributes below rely on
                # the pandas API (select_dtypes).
                data = pd.DataFrame(
                    data,
                    columns=[f'var_{i}' for i in range(data.shape[1])]
                )
            else:
                raise ValueError(
                    "El arreglo debe ser 1-D o 2-D / "
                    "The array must be 1-D or 2-D"
                )

        if not isinstance(data, pd.DataFrame):
            raise TypeError(
                "data debe ser un pandas.DataFrame, un numpy.ndarray o la ruta "
                "de un archivo existente / data must be a pandas.DataFrame, "
                "a numpy.ndarray or the path of an existing file"
            )

        self.data = data
        self.backend = backend
        self._numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
        self.sep = sep
        self.decimal = decimal
        self.thousand = thousand

    @classmethod
    def from_file(cls, path: str,
                  sep: Optional[str] = None,
                  decimal: Optional[str] = None,
                  thousand: Optional[str] = None):
        """
        Load a file and return an instance built from its contents.

        The reader is chosen from the file extension: CSV, Excel
        (.xlsx/.xls), TXT/TSV, JSON, Parquet and Feather are supported.

        Parameters
        ----------
        path : str
            File path.
        sep : str, optional
            Column separator, used for CSV/TXT/TSV files.
        decimal : str, optional
            Decimal separator, used for CSV/TXT/TSV/Excel files.
        thousand : str, optional
            Thousands separator, used for CSV/TXT/TSV/Excel files.

        Raises
        ------
        FileNotFoundError
            If ``path`` does not exist.
        ValueError
            If the extension is not one of the supported formats.
        """
        if not os.path.exists(path):
            raise FileNotFoundError(f"Archivo no encontrado / File not found: {path}")

        ext = os.path.splitext(path)[1].lower()

        # Pass only the options the caller actually set, so pandas keeps its
        # own defaults otherwise. The pandas keyword is ``thousands``.
        number_opts = {}
        if decimal is not None:
            number_opts["decimal"] = decimal
        if thousand is not None:
            number_opts["thousands"] = thousand
        text_opts = dict(number_opts)
        if sep is not None:
            text_opts["sep"] = sep

        if ext == ".csv":
            df = pd.read_csv(path, **text_opts)
        elif ext in (".xlsx", ".xls"):
            df = pd.read_excel(path, **number_opts)
        elif ext in (".txt", ".tsv"):
            df = pd.read_table(path, **text_opts)
        elif ext == ".json":
            df = pd.read_json(path)
        elif ext == ".parquet":
            df = pd.read_parquet(path)
        elif ext == ".feather":
            df = pd.read_feather(path)
        else:
            raise ValueError(f"Formato no soportado / Unsupported format: {ext}")

        return cls(df, sep=sep, decimal=decimal, thousand=thousand)

    def monte_carlo(self, function, n: int = 100, return_simulations: bool = False, **kwargs) -> dict:
        """
        Run a Monte Carlo simulation of ``function``.

        ``function(**kwargs)`` is called ``n`` times and each result is
        converted to ``float``.

        Parameters
        ----------
        function : callable
            Called as ``function(**kwargs)``; must return a value accepted by
            ``float()``.
        n : int
            Number of simulations; must be positive.
        return_simulations : bool
            When True the individual samples are included in the result.
        **kwargs
            Keyword arguments forwarded to ``function`` on every call.

        Returns
        -------
        dict
            ``{"mean": ..., "std": ...}`` plus ``"samples"`` when requested.
            ``std`` is the population standard deviation (divides by ``n``).

        Raises
        ------
        ValueError
            If ``n`` is not positive (the mean would divide by zero).
        """
        if n <= 0:
            raise ValueError(
                "n debe ser un entero positivo / n must be a positive integer"
            )

        samples = [float(function(**kwargs)) for _ in range(n)]

        mean = sum(samples) / n
        variance = sum((x - mean) ** 2 for x in samples) / n
        std = variance ** 0.5

        result = {
            "mean": float(mean),
            "std": float(std),
        }
        if return_simulations:
            result["samples"] = samples
        return result
|
statslibx/datasets/__init__.py
CHANGED
|
@@ -1,71 +1,260 @@
|
|
|
1
|
-
from typing import Optional, Union, Literal, List
|
|
2
|
-
import
|
|
3
|
-
import pandas as pd
|
|
1
|
+
from typing import Optional, Union, Literal, List, Tuple
|
|
2
|
+
import io
|
|
4
3
|
import pkgutil
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import polars as pl
|
|
7
|
+
import numpy as np
|
|
8
|
+
from numpy.typing import NDArray
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
_SUPPORTED_BACKENDS = ("pandas", "polars")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _validate_columns(
|
|
15
|
+
df: Union[pd.DataFrame, pl.DataFrame],
|
|
16
|
+
X_columns: List[str],
|
|
17
|
+
y_column: str
|
|
18
|
+
) -> None:
|
|
19
|
+
columns = set(df.columns)
|
|
20
|
+
missing = set(X_columns + [y_column]) - columns
|
|
21
|
+
if missing:
|
|
22
|
+
raise ValueError(f"Columnas no encontradas en el dataset: {missing}")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _X_y(
    df: Union[pd.DataFrame, pl.DataFrame],
    X_columns: List[str],
    y_column: str
) -> Tuple[NDArray, NDArray]:
    """
    Split a pandas or polars DataFrame into numpy arrays ``(X, y)``.

    The requested columns are validated first with ``_validate_columns``.
    ``X`` is the numpy conversion of the ``X_columns`` selection and ``y`` is
    the flattened (``ravel``) numpy conversion of ``y_column``.

    Raises
    ------
    TypeError
        If ``df`` is neither a pandas nor a polars DataFrame.
    """
    _validate_columns(df, X_columns, y_column)

    if isinstance(df, pd.DataFrame):
        features = df[X_columns]
        target = df[y_column]
    elif isinstance(df, pl.DataFrame):
        features = df.select(X_columns)
        target = df.select(y_column)
    else:
        raise TypeError(
            "Backend no soportado. Use pandas.DataFrame o polars.DataFrame."
        )

    return features.to_numpy(), target.to_numpy().ravel()
|
|
49
|
+
|
|
50
|
+
|
|
5
51
|
import io
|
|
52
|
+
import pkgutil
|
|
53
|
+
import pandas as pd
|
|
54
|
+
import polars as pl
|
|
55
|
+
from typing import Literal, Optional, Tuple, List, Union
|
|
56
|
+
from numpy.typing import NDArray
|
|
57
|
+
|
|
58
|
+
_SUPPORTED_BACKENDS = {"pandas", "polars"}
|
|
6
59
|
|
|
7
60
|
def load_dataset(
    name: str,
    backend: Literal["pandas", "polars"] = "pandas",
    return_X_y: Optional[Tuple[List[str], str]] = None,
    save: Optional[bool] = False,
    filename: Optional[str] = None
) -> Union[pd.DataFrame, pl.DataFrame, Tuple[NDArray, NDArray]]:
    """
    Load a CSV dataset bundled with the package, or from a local path.

    The resource ``name`` is first looked up inside ``statslibx.datasets``;
    if it is not found there, ``name`` is read as a CSV file path.

    Available datasets:
    - iris.csv
    - penguins.csv
    - sp500_companies.csv
    - titanic.csv
    - course_completion.csv

    Parameters
    ----------
    name : str
        Name of the CSV file (or a path to a local CSV file).
    backend : {'pandas', 'polars'}, default='pandas'
        DataFrame backend to use.
    return_X_y : tuple[list[str], str], optional
        ``(X_columns, y_column)``. When given, ``(X, y)`` is returned as
        numpy arrays instead of a DataFrame.
    save : bool, optional
        Accepted for API compatibility; not read by this function.
    filename : str, optional
        Accepted for API compatibility; not read by this function.

    Returns
    -------
    DataFrame or (X, y)

    Raises
    ------
    ValueError
        If ``backend`` is not supported.
    FileNotFoundError
        If the dataset is found neither in the package nor at the given path.
    """
    if backend not in _SUPPORTED_BACKENDS:
        raise ValueError(
            f"Backend '{backend}' no soportado. "
            f"Use uno de {_SUPPORTED_BACKENDS}."
        )

    read_csv = pd.read_csv if backend == "pandas" else pl.read_csv
    df = None

    # ---------- 1) Try the resource bundled in the package ----------
    # Loaders signal a missing resource with OSError (FileNotFoundError is
    # only one subclass of it), so catch the base class and fall through to
    # the local path instead of aborting.
    try:
        data_bytes = pkgutil.get_data("statslibx.datasets", name)
    except OSError:
        data_bytes = None

    if data_bytes is not None:
        df = read_csv(io.BytesIO(data_bytes))

    # ---------- 2) Fall back to a local path ----------
    if df is None:
        try:
            df = read_csv(name)
        except FileNotFoundError as exc:
            raise FileNotFoundError(
                f"Dataset '{name}' no encontrado "
                f"ni en statslibx.datasets ni en la ruta actual."
            ) from exc

    # ---------- 3) Return X, y when requested ----------
    if return_X_y is not None:
        X_columns, y_column = return_X_y
        return _X_y(df, X_columns, y_column)

    return df
|
|
131
|
+
|
|
132
|
+
# =========================
|
|
133
|
+
# Datasets específicos
|
|
134
|
+
# =========================
|
|
135
|
+
|
|
32
136
|
def load_iris(
    backend: Literal["pandas", "polars"] = "pandas",
    return_X_y: Optional[Tuple[List[str], str]] = None
):
    """
    Load the ``iris.csv`` dataset.

    Thin wrapper that forwards ``backend`` and ``return_X_y`` unchanged to
    ``load_dataset`` and returns its result.
    """
    options = {"backend": backend, "return_X_y": return_X_y}
    return load_dataset("iris.csv", **options)
|
|
145
|
+
|
|
146
|
+
|
|
52
147
|
def load_penguins(
    backend: Literal["pandas", "polars"] = "pandas",
    return_X_y: Optional[Tuple[List[str], str]] = None
):
    """
    Load the ``penguins.csv`` dataset.

    Thin wrapper that forwards ``backend`` and ``return_X_y`` unchanged to
    ``load_dataset`` and returns its result.
    """
    options = {"backend": backend, "return_X_y": return_X_y}
    return load_dataset("penguins.csv", **options)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
from typing import Optional
|
|
159
|
+
|
|
160
|
+
def generate_dataset(n_rows, schema, seed=None, save: Optional[bool] = False, filename: Optional[str] = None):
    """
    Build a synthetic pandas DataFrame from a per-column schema.

    Parameters
    ----------
    n_rows : int
        Number of rows to draw for every column.
    schema : dict
        Maps each column name to a config dict. Keys read from the config:

        - ``"dist"`` (required): one of ``"normal"`` (``mean``, ``std``),
          ``"uniform"`` (``low``, ``high``), ``"exponential"`` (``scale``),
          ``"lognormal"`` (``mean``, ``std``), ``"poisson"`` (``lam``),
          ``"binomial"`` (``n``, ``p``) or ``"categorical"`` (``choices``,
          required).
        - ``"type"``: ``"float"`` (default) or ``"int"``; ignored for
          categorical columns.
        - ``"round"``: number of decimals; values that are not positive fall
          back to 2 decimals. Ignored for categorical columns.
    seed : int or None
        Seed for the random generator. ``None`` uses the fixed seed 42, so
        the output is reproducible by default.
    save : bool, optional
        When true, the DataFrame is also written as CSV (without the index).
    filename : str, optional
        Base name of the CSV file; ``".csv"`` is appended. When ``save`` is
        true and no filename is given, ``"dataset.csv"`` is used.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    TypeError
        If ``seed`` is not an integer/None or ``schema`` is not a dict.
    ValueError
        If a column has no ``"dist"``, an unsupported distribution or type,
        or a categorical column without ``"choices"``.
    """
    if seed is not None and not isinstance(seed, (int, np.integer)):
        raise TypeError("seed debe ser un entero o None")

    if not isinstance(schema, dict):
        raise TypeError("schema debe ser un diccionario")

    # A private legacy RandomState yields the same stream as
    # np.random.seed(seed) followed by the np.random.* functions, without
    # clobbering NumPy's global random state for the caller.
    rng = np.random.RandomState(42 if seed is None else seed)

    data = {}

    for col, config in schema.items():
        if "dist" not in config:
            raise ValueError(f"La columna '{col}' no tiene 'dist' definido")

        dist = config["dist"]
        dtype = config.get("type", "float")
        nround = config.get("round", 0)

        # ---------- DISTRIBUTIONS ----------
        if dist == "categorical":
            if "choices" not in config:
                raise ValueError(f"'choices' es requerido para categorical ({col})")
            # Categorical values skip the numeric cast and rounding below.
            data[col] = rng.choice(config["choices"], size=n_rows)
            continue

        if dist == "normal":
            values = rng.normal(
                loc=config.get("mean", 0),
                scale=config.get("std", 1),
                size=n_rows
            )
        elif dist == "uniform":
            values = rng.uniform(
                low=config.get("low", 0),
                high=config.get("high", 1),
                size=n_rows
            )
        elif dist == "exponential":
            values = rng.exponential(
                scale=config.get("scale", 1),
                size=n_rows
            )
        elif dist == "lognormal":
            values = rng.lognormal(
                mean=config.get("mean", 0),
                sigma=config.get("std", 1),
                size=n_rows
            )
        elif dist == "poisson":
            values = rng.poisson(
                lam=config.get("lam", 1),
                size=n_rows
            )
        elif dist == "binomial":
            values = rng.binomial(
                n=config.get("n", 1),
                p=config.get("p", 0.5),
                size=n_rows
            )
        else:
            raise ValueError(f"Distribución no soportada: {dist}")

        # ---------- TYPE CAST ----------
        if dtype == "int":
            values = np.round(values).astype(int)
        elif dtype == "float":
            values = values.astype(float)
        else:
            raise ValueError(f"Tipo no soportado: {dtype}")

        # ---------- ROUNDING ----------
        data[col] = np.round(values, nround if nround > 0 else 2)

    df = pd.DataFrame(data)

    # Write to disk only on request; previously a "dataset.csv" file was
    # created in the working directory even when save was False.
    if save:
        target = f"{filename}.csv" if filename else "dataset.csv"
        df.to_csv(target, index=False)

    return df
|
statslibx/descriptive.py
CHANGED
|
@@ -12,18 +12,82 @@ import base64
|
|
|
12
12
|
import plotly.express as px
|
|
13
13
|
|
|
14
14
|
class DescriptiveStats:
|
|
15
|
-
"""
|
|
16
|
-
|
|
17
|
-
|
|
15
|
+
"""
|
|
16
|
+
DescriptiveStats
|
|
17
|
+
A class for performing univariate and multivariate descriptive statistical analysis.
|
|
18
|
+
It provides tools for exploratory data analysis, measures of central tendency,
|
|
19
|
+
dispersion, distribution shape, and linear regression.
|
|
20
|
+
Attributes:
|
|
21
|
+
-----------
|
|
22
|
+
data : pd.DataFrame
|
|
23
|
+
The dataset to analyze.
|
|
24
|
+
sep : str, optional
|
|
25
|
+
Column separator for file input.
|
|
26
|
+
decimal : str, optional
|
|
27
|
+
Decimal separator for file input.
|
|
28
|
+
thousand : str, optional
|
|
29
|
+
Thousand separator for file input.
|
|
30
|
+
backend : str, optional
|
|
31
|
+
Backend to use for processing ('pandas' or 'polars'). Default is 'pandas'.
|
|
32
|
+
lang : str, optional
|
|
33
|
+
Language for output ('es-ES' or 'en-US'). Default is 'es-ES'.
|
|
34
|
+
|
|
35
|
+
Methods:
|
|
36
|
+
--------
|
|
37
|
+
from_file(path: str)
|
|
38
|
+
Load data from a file and return an instance of DescriptiveStats.
|
|
39
|
+
|
|
40
|
+
mean(column: Optional[str] = None) -> Union[float, pd.Series]
|
|
41
|
+
Calculate the arithmetic mean of a column or all numeric columns.
|
|
42
|
+
|
|
43
|
+
median(column: Optional[str] = None) -> Union[float, pd.Series]
|
|
44
|
+
Calculate the median of a column or all numeric columns.
|
|
45
|
+
|
|
46
|
+
mode(column: Optional[str] = None)
|
|
47
|
+
Calculate the mode of a column or all numeric columns.
|
|
48
|
+
|
|
49
|
+
variance(column: Optional[str] = None) -> Union[float, pd.Series]
|
|
50
|
+
Calculate the variance of a column or all numeric columns.
|
|
51
|
+
|
|
52
|
+
std(column: Optional[str] = None) -> Union[float, pd.Series]
|
|
53
|
+
Calculate the standard deviation of a column or all numeric columns.
|
|
54
|
+
|
|
55
|
+
skewness(column: Optional[str] = None) -> Union[float, pd.Series]
|
|
56
|
+
Calculate the skewness of a column or all numeric columns.
|
|
57
|
+
|
|
58
|
+
kurtosis(column: Optional[str] = None) -> Union[float, pd.Series]
|
|
59
|
+
Calculate the kurtosis of a column or all numeric columns.
|
|
60
|
+
|
|
61
|
+
quantile(q: Union[float, List[float]], column: Optional[str] = None)
|
|
62
|
+
Calculate quantiles for a column or all numeric columns.
|
|
63
|
+
|
|
64
|
+
outliers(column: str, method: Literal['iqr', 'zscore'] = 'iqr', threshold: float = 1.5) -> pd.Series
|
|
65
|
+
Detect outliers in a column using IQR or z-score methods.
|
|
66
|
+
|
|
67
|
+
correlation(method: Literal['pearson', 'spearman', 'kendall'] = 'pearson', columns: Optional[List[str]] = None) -> pd.DataFrame
|
|
68
|
+
Compute the correlation matrix for specified columns or all numeric columns.
|
|
69
|
+
|
|
70
|
+
covariance(columns: Optional[List[str]] = None) -> pd.DataFrame
|
|
71
|
+
Compute the covariance matrix for specified columns or all numeric columns.
|
|
72
|
+
|
|
73
|
+
summary(columns: Optional[List[str]] = None, show_plot: bool = False, plot_backend: str = 'seaborn') -> 'DescriptiveSummary'
|
|
74
|
+
Generate a complete descriptive statistics summary for specified columns or all numeric columns.
|
|
75
|
+
|
|
76
|
+
linear_regression(X: Union[str, List[str]], y: str, engine: Literal['statsmodels', 'scikit-learn'] = 'statsmodels', fit_intercept: bool = True, show_plot: bool = False, plot_backend: str = 'seaborn', handle_missing: Literal['drop', 'error', 'warn'] = 'drop') -> tuple
|
|
77
|
+
Perform simple or multiple linear regression with optional visualization.
|
|
78
|
+
|
|
79
|
+
help()
|
|
80
|
+
Display the complete help documentation for the DescriptiveStats class.
|
|
18
81
|
"""
|
|
19
82
|
|
|
20
83
|
def __init__(self, data: Union[pd.DataFrame, np.ndarray],
|
|
21
84
|
sep: str = None,
|
|
22
85
|
decimal: str = None,
|
|
23
86
|
thousand: str = None,
|
|
24
|
-
backend: Literal['pandas', 'polars'] = 'pandas'
|
|
87
|
+
backend: Literal['pandas', 'polars'] = 'pandas',
|
|
88
|
+
lang: Literal['es-ES', 'en-US'] = 'es-ES'):
|
|
25
89
|
"""
|
|
26
|
-
#
|
|
90
|
+
# Initialize DataFrame
|
|
27
91
|
|
|
28
92
|
## **Parameters:**
|
|
29
93
|
|
|
@@ -54,7 +118,8 @@ class DescriptiveStats:
|
|
|
54
118
|
if data.ndim == 1:
|
|
55
119
|
data = pd.DataFrame({'var': data})
|
|
56
120
|
else:
|
|
57
|
-
data = pd.DataFrame(data, columns=[f'var_{i}' for i in range(data.shape[1])])
|
|
121
|
+
data = pd.DataFrame(data, columns=[f'var_{i}' for i in range(data.shape[1])]) \
|
|
122
|
+
if isinstance(data, pd.DataFrame) else pl.DataFrame(data, )
|
|
58
123
|
|
|
59
124
|
self.data = data
|
|
60
125
|
self.backend = backend
|
|
@@ -62,6 +127,7 @@ class DescriptiveStats:
|
|
|
62
127
|
self.sep = sep
|
|
63
128
|
self.decimal = decimal
|
|
64
129
|
self.thousand = thousand
|
|
130
|
+
self.lang = lang
|
|
65
131
|
|
|
66
132
|
@classmethod
|
|
67
133
|
def from_file(self, path: str):
|
|
@@ -394,7 +460,7 @@ class DescriptiveStats:
|
|
|
394
460
|
|
|
395
461
|
|
|
396
462
|
|
|
397
|
-
def help(self
|
|
463
|
+
def help(self):
|
|
398
464
|
"""
|
|
399
465
|
Muestra ayuda completa de la clase DescriptiveStats
|
|
400
466
|
|
|
@@ -404,12 +470,13 @@ class DescriptiveStats:
|
|
|
404
470
|
Idioma Usuario: Codigo de Idioma (es-Es) o "Español"
|
|
405
471
|
User Language: Languaje Code (en-Us) or "English"
|
|
406
472
|
"""
|
|
407
|
-
if lang in ["en-US", "English", "english"]:
|
|
408
|
-
lang = "en-US"
|
|
473
|
+
if self.lang in ["en-US", "English", "english"]:
|
|
474
|
+
self.lang = "en-US"
|
|
409
475
|
else:
|
|
410
|
-
lang = ""
|
|
476
|
+
self.lang = "es-ES"
|
|
477
|
+
help_text = " "
|
|
411
478
|
|
|
412
|
-
match lang:
|
|
479
|
+
match self.lang:
|
|
413
480
|
case "es-ES":
|
|
414
481
|
help_text = """
|
|
415
482
|
╔════════════════════════════════════════════════════════════════════════════╗
|
|
@@ -614,10 +681,9 @@ class DescriptiveStats:
|
|
|
614
681
|
╚════════════════════════════════════════════════════════════════════════════╝
|
|
615
682
|
"""
|
|
616
683
|
case "en-US":
|
|
617
|
-
# --- Falta por traducir
|
|
618
684
|
help_text = """
|
|
619
685
|
╔════════════════════════════════════════════════════════════════════════════╗
|
|
620
|
-
║ 📊
|
|
686
|
+
║ 📊 CLASS DescriptiveStats - COMPLETE HELP ║
|
|
621
687
|
╚════════════════════════════════════════════════════════════════════════════╝
|
|
622
688
|
|
|
623
689
|
📝 DESCRIPTION:
|
|
@@ -821,8 +887,7 @@ class DescriptiveStats:
|
|
|
821
887
|
|
|
822
888
|
print(help_text)
|
|
823
889
|
|
|
824
|
-
|
|
825
|
-
|
|
890
|
+
|
|
826
891
|
class DescriptiveSummary:
|
|
827
892
|
"""Clase para formatear salida de estadística descriptiva"""
|
|
828
893
|
|