statslibx 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- statslibx/__init__.py +5 -2
- statslibx/cli.py +47 -0
- statslibx/datasets/__init__.py +57 -2
- statslibx/descriptive.py +716 -286
- statslibx/inferential.py +100 -72
- statslibx/io.py +21 -0
- statslibx/preprocessing/__init__.py +221 -0
- statslibx/utils.py +427 -60
- {statslibx-0.1.5.dist-info → statslibx-0.1.7.dist-info}/METADATA +10 -29
- statslibx-0.1.7.dist-info/RECORD +18 -0
- statslibx-0.1.7.dist-info/entry_points.txt +2 -0
- statslibx-0.1.5.dist-info/RECORD +0 -14
- {statslibx-0.1.5.dist-info → statslibx-0.1.7.dist-info}/WHEEL +0 -0
- {statslibx-0.1.5.dist-info → statslibx-0.1.7.dist-info}/top_level.txt +0 -0
statslibx/descriptive.py
CHANGED
|
@@ -1,30 +1,55 @@
|
|
|
1
1
|
import numpy as np
|
|
2
2
|
import pandas as pd
|
|
3
|
+
import polars as pl
|
|
3
4
|
from typing import Optional, Union, Literal, List
|
|
4
5
|
from datetime import datetime
|
|
6
|
+
import flet as ft
|
|
5
7
|
import os
|
|
8
|
+
import matplotlib.pyplot as plt
|
|
9
|
+
import seaborn as sns
|
|
10
|
+
import io
|
|
11
|
+
import base64
|
|
12
|
+
import plotly.express as px
|
|
6
13
|
|
|
7
14
|
class DescriptiveStats:
|
|
8
15
|
"""
|
|
9
16
|
Clase para estadística descriptiva univariada y multivariada
|
|
17
|
+
Class for univariate and multivariate descriptive statistics
|
|
10
18
|
"""
|
|
11
19
|
|
|
12
|
-
def __init__(self, data: Union[pd.DataFrame, np.ndarray],
|
|
13
|
-
|
|
20
|
+
def __init__(self, data: Union[pd.DataFrame, np.ndarray],
|
|
21
|
+
sep: str = None,
|
|
22
|
+
decimal: str = None,
|
|
23
|
+
thousand: str = None,
|
|
24
|
+
backend: Literal['pandas', 'polars'] = 'pandas'):
|
|
14
25
|
"""
|
|
15
|
-
|
|
26
|
+
# Initialize DataFrame
|
|
16
27
|
|
|
17
|
-
Parameters
|
|
18
|
-
|
|
19
|
-
data :
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
28
|
+
## **Parameters:**
|
|
29
|
+
|
|
30
|
+
- **data** : Data to analyze
|
|
31
|
+
- **sep** : Column separator
|
|
32
|
+
- **decimal** : Decimal separator
|
|
33
|
+
- **thousand** : Thousand separator
|
|
34
|
+
- **backend** : 'pandas' or 'polars' for processing
|
|
35
|
+
(Próximamente estará habilitado polars para big data / Polars support for big data is coming soon)
|
|
36
|
+
|
|
37
|
+
**Examples:**
|
|
38
|
+
|
|
39
|
+
``Example 1:
|
|
40
|
+
stats = DescriptiveStats(data)
|
|
41
|
+
``
|
|
23
42
|
"""
|
|
24
43
|
|
|
25
44
|
if isinstance(data, str) and os.path.exists(data):
|
|
26
45
|
data = DescriptiveStats.from_file(data).data
|
|
27
46
|
|
|
47
|
+
if isinstance(data, pl.DataFrame):
|
|
48
|
+
raise TypeError(
|
|
49
|
+
"Polars aún no soportado. Use pandas.DataFrame."
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
|
|
28
53
|
if isinstance(data, np.ndarray):
|
|
29
54
|
if data.ndim == 1:
|
|
30
55
|
data = pd.DataFrame({'var': data})
|
|
@@ -34,26 +59,37 @@ class DescriptiveStats:
|
|
|
34
59
|
self.data = data
|
|
35
60
|
self.backend = backend
|
|
36
61
|
self._numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
|
|
62
|
+
self.sep = sep
|
|
63
|
+
self.decimal = decimal
|
|
64
|
+
self.thousand = thousand
|
|
37
65
|
|
|
38
|
-
@
|
|
39
|
-
def from_file(path: str):
|
|
66
|
+
@classmethod
|
|
67
|
+
def from_file(self, path: str):
|
|
40
68
|
"""
|
|
41
69
|
Carga automática de archivos y devuelve instancia de Intelligence.
|
|
42
70
|
Soporta CSV, Excel, TXT, JSON, Parquet, Feather, TSV.
|
|
71
|
+
Automatically loads a file and returns a DescriptiveStats instance.
|
|
72
|
+
Supports CSV, Excel, TXT, JSON, Parquet, Feather, TSV.
|
|
73
|
+
|
|
74
|
+
Parametros / Parameters:
|
|
75
|
+
------------------------
|
|
76
|
+
path : str
|
|
77
|
+
Ruta del archivo
|
|
78
|
+
File path
|
|
43
79
|
"""
|
|
44
80
|
if not os.path.exists(path):
|
|
45
|
-
raise FileNotFoundError(f"Archivo no encontrado: {path}")
|
|
81
|
+
raise FileNotFoundError(f"Archivo no encontrado / File not found: {path}")
|
|
46
82
|
|
|
47
83
|
ext = os.path.splitext(path)[1].lower()
|
|
48
84
|
|
|
49
85
|
if ext == ".csv":
|
|
50
|
-
df = pd.read_csv(path)
|
|
86
|
+
df = pd.read_csv(path, sep=self.sep, decimal=self.decimal, thousand=self.thousand)
|
|
51
87
|
|
|
52
88
|
elif ext in [".xlsx", ".xls"]:
|
|
53
|
-
df = pd.read_excel(path)
|
|
89
|
+
df = pd.read_excel(path, decimal=self.decimal, thousand=self.thousand)
|
|
54
90
|
|
|
55
91
|
elif ext in [".txt", ".tsv"]:
|
|
56
|
-
df = pd.read_table(path)
|
|
92
|
+
df = pd.read_table(path, sep=self.sep, decimal=self.decimal, thousand=self.thousand)
|
|
57
93
|
|
|
58
94
|
elif ext == ".json":
|
|
59
95
|
df = pd.read_json(path)
|
|
@@ -65,56 +101,124 @@ class DescriptiveStats:
|
|
|
65
101
|
df = pd.read_feather(path)
|
|
66
102
|
|
|
67
103
|
else:
|
|
68
|
-
raise ValueError(f"Formato no soportado: {ext}")
|
|
104
|
+
raise ValueError(f"Formato no soportado / Unsupported format: {ext}")
|
|
69
105
|
|
|
70
106
|
return DescriptiveStats(df)
|
|
71
107
|
|
|
72
108
|
# ============= MÉTODOS UNIVARIADOS =============
|
|
73
109
|
|
|
74
110
|
def mean(self, column: Optional[str] = None) -> Union[float, pd.Series]:
|
|
75
|
-
"""
|
|
111
|
+
"""
|
|
112
|
+
Media aritmética / Arithmetic mean
|
|
113
|
+
|
|
114
|
+
Parametros / Parameters:
|
|
115
|
+
------------------------
|
|
116
|
+
**column** : str
|
|
117
|
+
Nombre de la columna
|
|
118
|
+
Name of the column
|
|
119
|
+
"""
|
|
76
120
|
if column:
|
|
77
121
|
return self.data[column].mean()
|
|
78
122
|
return self.data[self._numeric_cols].mean()
|
|
79
123
|
|
|
80
124
|
def median(self, column: Optional[str] = None) -> Union[float, pd.Series]:
|
|
81
|
-
"""
|
|
125
|
+
"""
|
|
126
|
+
Mediana / Median
|
|
127
|
+
|
|
128
|
+
Parametros / Parameters:
|
|
129
|
+
------------------------
|
|
130
|
+
**column** : str
|
|
131
|
+
Nombre de la columna
|
|
132
|
+
Name of the column
|
|
133
|
+
"""
|
|
82
134
|
if column:
|
|
83
135
|
return self.data[column].median()
|
|
84
136
|
return self.data[self._numeric_cols].median()
|
|
85
137
|
|
|
86
138
|
def mode(self, column: Optional[str] = None):
|
|
87
|
-
"""
|
|
139
|
+
"""
|
|
140
|
+
Moda / Mode
|
|
141
|
+
|
|
142
|
+
Parametros / Parameters:
|
|
143
|
+
------------------------
|
|
144
|
+
column : str
|
|
145
|
+
Nombre de la columna
|
|
146
|
+
Name of the column
|
|
147
|
+
"""
|
|
88
148
|
if column:
|
|
89
149
|
return self.data[column].mode()[0]
|
|
90
150
|
return self.data[self._numeric_cols].mode().iloc[0]
|
|
91
151
|
|
|
92
152
|
def variance(self, column: Optional[str] = None) -> Union[float, pd.Series]:
|
|
93
|
-
"""
|
|
153
|
+
"""
|
|
154
|
+
Varianza / Variance
|
|
155
|
+
|
|
156
|
+
Parametros / Parameters:
|
|
157
|
+
------------------------
|
|
158
|
+
column : str
|
|
159
|
+
Nombre de la columna
|
|
160
|
+
Name of the column
|
|
161
|
+
"""
|
|
94
162
|
if column:
|
|
95
163
|
return self.data[column].var()
|
|
96
164
|
return self.data[self._numeric_cols].var()
|
|
97
165
|
|
|
98
166
|
def std(self, column: Optional[str] = None) -> Union[float, pd.Series]:
|
|
99
|
-
"""
|
|
167
|
+
"""
|
|
168
|
+
Desviación estándar / Standard deviation
|
|
169
|
+
|
|
170
|
+
Parametros / Parameters:
|
|
171
|
+
------------------------
|
|
172
|
+
column : str
|
|
173
|
+
Nombre de la columna
|
|
174
|
+
Name of the column
|
|
175
|
+
|
|
176
|
+
"""
|
|
100
177
|
if column:
|
|
101
178
|
return self.data[column].std()
|
|
102
179
|
return self.data[self._numeric_cols].std()
|
|
103
180
|
|
|
104
181
|
def skewness(self, column: Optional[str] = None) -> Union[float, pd.Series]:
|
|
105
|
-
"""
|
|
182
|
+
"""
|
|
183
|
+
Asimetría / Skewness
|
|
184
|
+
|
|
185
|
+
Parametros / Parameters:
|
|
186
|
+
------------------------
|
|
187
|
+
column : str
|
|
188
|
+
Nombre de la columna
|
|
189
|
+
Name of the column
|
|
190
|
+
"""
|
|
106
191
|
if column:
|
|
107
192
|
return self.data[column].skew()
|
|
108
193
|
return self.data[self._numeric_cols].skew()
|
|
109
194
|
|
|
110
195
|
def kurtosis(self, column: Optional[str] = None) -> Union[float, pd.Series]:
|
|
111
|
-
"""
|
|
196
|
+
"""
|
|
197
|
+
Curtosis / Kurtosis
|
|
198
|
+
|
|
199
|
+
Parametros / Parameters:
|
|
200
|
+
------------------------
|
|
201
|
+
column : str
|
|
202
|
+
Nombre de la columna
|
|
203
|
+
Name of the column
|
|
204
|
+
"""
|
|
112
205
|
if column:
|
|
113
206
|
return self.data[column].kurtosis()
|
|
114
207
|
return self.data[self._numeric_cols].kurtosis()
|
|
115
208
|
|
|
116
209
|
def quantile(self, q: Union[float, List[float]], column: Optional[str] = None):
|
|
117
|
-
"""
|
|
210
|
+
"""
|
|
211
|
+
Cuantiles - Percentiles / Quantiles - Percentiles
|
|
212
|
+
|
|
213
|
+
Parametros / Parameters:
|
|
214
|
+
------------------------
|
|
215
|
+
q : float / List[float]
|
|
216
|
+
Cuantiles a calcular
|
|
217
|
+
Quantiles to calculate
|
|
218
|
+
column : str
|
|
219
|
+
Nombre de la columna
|
|
220
|
+
Name of the column
|
|
221
|
+
"""
|
|
118
222
|
if column:
|
|
119
223
|
return self.data[column].quantile(q)
|
|
120
224
|
return self.data[self._numeric_cols].quantile(q)
|
|
@@ -122,16 +226,19 @@ class DescriptiveStats:
|
|
|
122
226
|
def outliers(self, column: str, method: Literal['iqr', 'zscore'] = 'iqr',
|
|
123
227
|
threshold: float = 1.5) -> pd.Series:
|
|
124
228
|
"""
|
|
125
|
-
Detectar outliers en una columna
|
|
229
|
+
Detectar outliers en una columna / Detecting outliers in a column
|
|
230
|
+
|
|
126
231
|
|
|
127
|
-
Parameters:
|
|
128
|
-
|
|
232
|
+
Parametros / Parameters:
|
|
233
|
+
------------------------
|
|
129
234
|
column : str
|
|
130
235
|
Nombre de la columna
|
|
236
|
+
Name of the column
|
|
131
237
|
method : str
|
|
132
238
|
'iqr' o 'zscore'
|
|
133
239
|
threshold : float
|
|
134
240
|
1.5 para IQR, 3 para zscore típicamente
|
|
241
|
+
1.5 for IQR, 3 for zscore typically
|
|
135
242
|
"""
|
|
136
243
|
col_data = self.data[column]
|
|
137
244
|
|
|
@@ -151,22 +258,31 @@ class DescriptiveStats:
|
|
|
151
258
|
# ============= MÉTODOS MULTIVARIADOS =============
|
|
152
259
|
|
|
153
260
|
def correlation(self, method: Literal['pearson', 'spearman', 'kendall'] = 'pearson',
|
|
154
|
-
|
|
261
|
+
columns: Optional[List[str]] = None) -> pd.DataFrame:
|
|
155
262
|
"""
|
|
156
|
-
Matriz de correlación
|
|
263
|
+
Matriz de correlación / Correlation matrix
|
|
157
264
|
|
|
158
|
-
Parameters:
|
|
159
|
-
|
|
265
|
+
Parametros / Parameters:
|
|
266
|
+
------------------------
|
|
160
267
|
method : str
|
|
161
268
|
'pearson', 'spearman' o 'kendall'
|
|
162
269
|
columns : list, optional
|
|
163
270
|
Lista de columnas a incluir
|
|
271
|
+
List of columns to include
|
|
164
272
|
"""
|
|
165
273
|
data_subset = self.data[columns] if columns else self.data[self._numeric_cols]
|
|
166
274
|
return data_subset.corr(method=method)
|
|
167
275
|
|
|
168
276
|
def covariance(self, columns: Optional[List[str]] = None) -> pd.DataFrame:
|
|
169
|
-
"""
|
|
277
|
+
"""
|
|
278
|
+
Matriz de covarianza / Covariance matrix
|
|
279
|
+
|
|
280
|
+
Parametros / Parameters:
|
|
281
|
+
------------------------
|
|
282
|
+
columns: list, optional
|
|
283
|
+
Lista de columnas a incluir
|
|
284
|
+
List of columns to include
|
|
285
|
+
"""
|
|
170
286
|
data_subset = self.data[columns] if columns else self.data[self._numeric_cols]
|
|
171
287
|
return data_subset.cov()
|
|
172
288
|
|
|
@@ -176,14 +292,16 @@ class DescriptiveStats:
|
|
|
176
292
|
show_plot: bool = False,
|
|
177
293
|
plot_backend: str = 'seaborn') -> 'DescriptiveSummary':
|
|
178
294
|
"""
|
|
179
|
-
Resumen completo de estadísticas descriptivas
|
|
295
|
+
Resumen completo de estadísticas descriptivas / Complete descriptive statistics summary
|
|
180
296
|
|
|
181
|
-
Parameters:
|
|
182
|
-
|
|
297
|
+
Parametros / Parameters:
|
|
298
|
+
------------------------
|
|
183
299
|
columns : list, optional
|
|
184
300
|
Columnas específicas a resumir
|
|
301
|
+
Specific columns to summarize
|
|
185
302
|
show_plot : bool
|
|
186
303
|
Si mostrar gráficos
|
|
304
|
+
Whether to show plots
|
|
187
305
|
plot_backend : str
|
|
188
306
|
'seaborn', 'plotly' o 'matplotlib'
|
|
189
307
|
"""
|
|
@@ -213,143 +331,87 @@ class DescriptiveStats:
|
|
|
213
331
|
# ============= REGRESIÓN LINEAL =============
|
|
214
332
|
|
|
215
333
|
def linear_regression(self,
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
334
|
+
X: Union[str, List[str]],
|
|
335
|
+
y: str,
|
|
336
|
+
engine: Literal['statsmodels', 'scikit-learn'] = 'statsmodels',
|
|
337
|
+
fit_intercept: bool = True,
|
|
338
|
+
show_plot: bool = False,
|
|
339
|
+
plot_backend: str = 'seaborn',
|
|
340
|
+
handle_missing: Literal['drop', 'error', 'warn'] = 'drop') -> tuple:
|
|
223
341
|
"""
|
|
224
|
-
Regresión lineal simple o múltiple
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
342
|
+
Regresión lineal simple o múltiple con opción de mostrar gráfico / Simple or multiple \
|
|
343
|
+
linear regression with option to show graph
|
|
344
|
+
|
|
345
|
+
Parametros / Parameters:
|
|
346
|
+
------------------------
|
|
347
|
+
X: str, list, optional
|
|
348
|
+
Nombre de la variable independiente
|
|
349
|
+
|
|
350
|
+
y: str
|
|
351
|
+
Nombre de la variable dependiente
|
|
352
|
+
|
|
353
|
+
engine: str
|
|
354
|
+
Motor de la regresion
|
|
355
|
+
|
|
356
|
+
fit_intercept: bool
|
|
357
|
+
Intercepto de la regresion
|
|
358
|
+
|
|
359
|
+
show_plot: bool
|
|
360
|
+
Visualizar la regresion (recomendable, solo [X,y])
|
|
361
|
+
|
|
362
|
+
handle_missing:
|
|
363
|
+
'drop', 'error' o 'warn'
|
|
245
364
|
"""
|
|
246
365
|
if isinstance(X, str):
|
|
247
366
|
X = [X]
|
|
248
|
-
|
|
249
|
-
# Verificar
|
|
250
|
-
missing_columns = []
|
|
251
|
-
if y not in self.data.columns:
|
|
252
|
-
missing_columns.append(y)
|
|
253
|
-
for x_col in X:
|
|
254
|
-
if x_col not in self.data.columns:
|
|
255
|
-
missing_columns.append(x_col)
|
|
256
|
-
|
|
367
|
+
|
|
368
|
+
# Verificar columnas
|
|
369
|
+
missing_columns = [col for col in [y] + X if col not in self.data.columns]
|
|
257
370
|
if missing_columns:
|
|
258
371
|
raise ValueError(f"Columnas no encontradas: {missing_columns}")
|
|
259
|
-
|
|
260
|
-
#
|
|
372
|
+
|
|
373
|
+
# Preparar datos
|
|
261
374
|
regression_data = self.data[[y] + X].copy()
|
|
262
|
-
|
|
263
|
-
# Manejar valores infinitos
|
|
264
375
|
numeric_cols = regression_data.select_dtypes(include=[np.number]).columns
|
|
265
376
|
for col in numeric_cols:
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
regression_data[col] = regression_data[col].replace([np.inf, -np.inf], np.nan)
|
|
271
|
-
|
|
272
|
-
# Manejar valores faltantes
|
|
273
|
-
missing_before = regression_data.isnull().sum()
|
|
274
|
-
total_missing = missing_before.sum()
|
|
275
|
-
|
|
276
|
-
if total_missing > 0:
|
|
277
|
-
missing_info = "\n".join([f" - {col}: {missing_before[col]} missing"
|
|
278
|
-
for col in missing_before[missing_before > 0].index])
|
|
279
|
-
|
|
377
|
+
regression_data[col] = regression_data[col].replace([np.inf, -np.inf], np.nan)
|
|
378
|
+
|
|
379
|
+
# Manejo de valores faltantes
|
|
380
|
+
if regression_data.isnull().any().any():
|
|
280
381
|
if handle_missing == 'error':
|
|
281
|
-
raise ValueError(
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
regression_data_clean = regression_data.dropna()
|
|
290
|
-
|
|
291
|
-
else:
|
|
292
|
-
raise ValueError(f"Método de manejo de missing values no reconocido: {handle_missing}")
|
|
293
|
-
|
|
294
|
-
# Informar sobre la limpieza
|
|
295
|
-
rows_before = len(regression_data)
|
|
296
|
-
rows_after = len(regression_data_clean)
|
|
297
|
-
rows_removed = rows_before - rows_after
|
|
298
|
-
|
|
299
|
-
if rows_removed > 0:
|
|
300
|
-
print(f"Limpieza de datos: {rows_removed} filas eliminadas ({rows_after} filas restantes)")
|
|
301
|
-
|
|
302
|
-
if rows_after < len(X) + 1: # +1 para el intercepto
|
|
303
|
-
raise ValueError(
|
|
304
|
-
f"Muy pocas filas después de limpieza: {rows_after}. "
|
|
305
|
-
f"Se necesitan al menos {len(X) + 1} filas para regresión."
|
|
306
|
-
)
|
|
307
|
-
else:
|
|
308
|
-
regression_data_clean = regression_data
|
|
309
|
-
|
|
310
|
-
# Extraer datos limpios
|
|
311
|
-
X_data = regression_data_clean[X].values
|
|
312
|
-
y_data = regression_data_clean[y].values
|
|
313
|
-
|
|
314
|
-
# Validar que los datos son numéricos
|
|
315
|
-
if not np.issubdtype(X_data.dtype, np.number):
|
|
316
|
-
raise ValueError("Las variables independientes deben ser numéricas")
|
|
317
|
-
if not np.issubdtype(y_data.dtype, np.number):
|
|
318
|
-
raise ValueError("La variable dependiente debe ser numérica")
|
|
319
|
-
|
|
320
|
-
# Validar que no hay más missing values
|
|
321
|
-
if np.isnan(X_data).any() or np.isnan(y_data).any():
|
|
322
|
-
raise ValueError("Todavía hay valores NaN después de la limpieza")
|
|
323
|
-
|
|
324
|
-
# Validar que no hay valores infinitos
|
|
325
|
-
if np.isinf(X_data).any() or np.isinf(y_data).any():
|
|
326
|
-
raise ValueError("Todavía hay valores infinitos después de la limpieza")
|
|
327
|
-
|
|
328
|
-
# Crear y ajustar el modelo
|
|
329
|
-
result = LinearRegressionResult(
|
|
330
|
-
X_data, y_data, X, y,
|
|
331
|
-
engine=engine,
|
|
332
|
-
fit_intercept=fit_intercept
|
|
333
|
-
)
|
|
382
|
+
raise ValueError("Datos contienen valores faltantes")
|
|
383
|
+
regression_data = regression_data.dropna()
|
|
384
|
+
|
|
385
|
+
X_data = regression_data[X].values
|
|
386
|
+
y_data = regression_data[y].values
|
|
387
|
+
|
|
388
|
+
# Ajustar modelo
|
|
389
|
+
result = LinearRegressionResult(X_data, y_data, X, y, engine=engine, fit_intercept=fit_intercept)
|
|
334
390
|
result.fit()
|
|
335
391
|
result.show_plot = show_plot
|
|
336
392
|
result.plot_backend = plot_backend
|
|
337
|
-
|
|
338
|
-
# Agregar información de limpieza al resultado
|
|
339
|
-
result.data_info = {
|
|
340
|
-
'original_rows': len(self.data),
|
|
341
|
-
'clean_rows': len(regression_data_clean),
|
|
342
|
-
'rows_removed': len(self.data) - len(regression_data_clean),
|
|
343
|
-
'missing_handled': total_missing > 0
|
|
344
|
-
}
|
|
345
|
-
|
|
346
393
|
return result
|
|
394
|
+
|
|
395
|
+
|
|
347
396
|
|
|
348
|
-
def help(self):
|
|
397
|
+
def help(self, lang="es-Es"):
|
|
349
398
|
"""
|
|
350
399
|
Muestra ayuda completa de la clase DescriptiveStats
|
|
400
|
+
|
|
401
|
+
Parametros / Parameters:
|
|
402
|
+
------------------------
|
|
403
|
+
lang: str
|
|
404
|
+
Idioma del usuario: código de idioma (es-ES) o "Español"
|
|
405
|
+
User language: language code (en-US) or "English"
|
|
351
406
|
"""
|
|
352
|
-
|
|
407
|
+
if lang in ["en-US", "English", "english"]:
|
|
408
|
+
lang = "en-US"
|
|
409
|
+
else:
|
|
410
|
+
lang = ""
|
|
411
|
+
|
|
412
|
+
match lang:
|
|
413
|
+
case "es-ES":
|
|
414
|
+
help_text = """
|
|
353
415
|
╔════════════════════════════════════════════════════════════════════════════╗
|
|
354
416
|
║ 📊 CLASE DescriptiveStats - AYUDA COMPLETA ║
|
|
355
417
|
╚════════════════════════════════════════════════════════════════════════════╝
|
|
@@ -405,6 +467,15 @@ class DescriptiveStats:
|
|
|
405
467
|
|
|
406
468
|
Incluye: conteo, media, mediana, moda, desv. est., varianza,
|
|
407
469
|
mínimo, Q1, Q3, máximo, IQR, asimetría, curtosis
|
|
470
|
+
• .summary().to_dataframe(format)
|
|
471
|
+
Format:
|
|
472
|
+
- Wide
|
|
473
|
+
- Long
|
|
474
|
+
- Compact
|
|
475
|
+
|
|
476
|
+
• .summary().to_categorical_summary()
|
|
477
|
+
• .summary().to_styled_df()
|
|
478
|
+
|
|
408
479
|
|
|
409
480
|
┌────────────────────────────────────────────────────────────────────────────┐
|
|
410
481
|
│ 4. 📈 REGRESIÓN LINEAL │
|
|
@@ -428,121 +499,329 @@ class DescriptiveStats:
|
|
|
428
499
|
|
|
429
500
|
💡 EJEMPLOS DE USO:
|
|
430
501
|
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
502
|
+
┌─ Ejemplo 1: Inicialización ─────────────────────────────────────────────┐
|
|
503
|
+
│ import pandas as pd │
|
|
504
|
+
│ from descriptive import DescriptiveStats │
|
|
505
|
+
│ │
|
|
506
|
+
│ # Con DataFrame │
|
|
507
|
+
│ df = pd.read_csv('datos.csv') │
|
|
508
|
+
│ stats = DescriptiveStats(df) │
|
|
509
|
+
│ │
|
|
510
|
+
│ # Con array numpy │
|
|
511
|
+
│ import numpy as np │
|
|
512
|
+
│ datos = np.random.normal(0, 1, 1000) │
|
|
513
|
+
│ stats = DescriptiveStats(datos) │
|
|
514
|
+
└─────────────────────────────────────────────────────────────────────────┘
|
|
515
|
+
|
|
516
|
+
┌─ Ejemplo 2: Análisis Univariado ────────────────────────────────────────┐
|
|
517
|
+
│ # Estadísticas de una columna │
|
|
518
|
+
│ media = stats.mean('edad') │
|
|
519
|
+
│ mediana = stats.median('edad') │
|
|
520
|
+
│ desv_est = stats.std('edad') │
|
|
521
|
+
│ │
|
|
522
|
+
│ # Cuartiles │
|
|
523
|
+
│ q25 = stats.quantile(0.25, 'edad') │
|
|
524
|
+
│ q75 = stats.quantile(0.75, 'edad') │
|
|
525
|
+
│ │
|
|
526
|
+
│ # Detectar outliers │
|
|
527
|
+
│ outliers_mask = stats.outliers('edad', method='iqr', threshold=1.5) │
|
|
528
|
+
│ print(f"Outliers detectados: {outliers_mask.sum()}") │
|
|
529
|
+
└─────────────────────────────────────────────────────────────────────────┘
|
|
530
|
+
|
|
531
|
+
┌─ Ejemplo 3: Resumen Completo ───────────────────────────────────────────┐
|
|
532
|
+
│ # Resumen de todas las variables numéricas │
|
|
533
|
+
│ resumen = stats.summary() │
|
|
534
|
+
│ print(resumen) │
|
|
535
|
+
│ │
|
|
536
|
+
│ # Resumen de columnas específicas con visualización │
|
|
537
|
+
│ resumen = stats.summary( │
|
|
538
|
+
│ columns=['edad', 'salario', 'experiencia'], │
|
|
539
|
+
│ show_plot=True, │
|
|
540
|
+
│ plot_backend='seaborn' │
|
|
541
|
+
│ ) │
|
|
542
|
+
└─────────────────────────────────────────────────────────────────────────┘
|
|
543
|
+
|
|
544
|
+
┌─ Ejemplo 4: Análisis Multivariado ──────────────────────────────────────┐
|
|
545
|
+
│ # Matriz de correlación │
|
|
546
|
+
│ corr_pearson = stats.correlation(method='pearson') │
|
|
547
|
+
│ corr_spearman = stats.correlation(method='spearman') │
|
|
548
|
+
│ │
|
|
549
|
+
│ # Matriz de covarianza │
|
|
550
|
+
│ cov_matrix = stats.covariance() │
|
|
551
|
+
│ │
|
|
552
|
+
│ # Correlación entre variables específicas │
|
|
553
|
+
│ corr_subset = stats.correlation( │
|
|
554
|
+
│ method='pearson', │
|
|
555
|
+
│ columns=['edad', 'salario', 'experiencia'] │
|
|
556
|
+
│ ) │
|
|
557
|
+
└─────────────────────────────────────────────────────────────────────────┘
|
|
558
|
+
|
|
559
|
+
┌─ Ejemplo 5: Regresión Lineal Simple ────────────────────────────────────┐
|
|
560
|
+
│ # Regresión simple: salario ~ experiencia │
|
|
561
|
+
│ modelo = stats.linear_regression( │
|
|
562
|
+
│ y='salario', │
|
|
563
|
+
│ X='experiencia', │
|
|
564
|
+
│ engine='statsmodels', │
|
|
565
|
+
│ show_plot=True │
|
|
566
|
+
│ ) │
|
|
567
|
+
│ │
|
|
568
|
+
│ # Ver resultados │
|
|
569
|
+
│ print(modelo.summary()) │
|
|
570
|
+
│ │
|
|
571
|
+
│ # Acceder a coeficientes │
|
|
572
|
+
│ print(f"Intercepto: {modelo.intercept_}") │
|
|
573
|
+
│ print(f"Pendiente: {modelo.coef_[0]}") │
|
|
574
|
+
│ print(f"R²: {modelo.r_squared}") │
|
|
575
|
+
└─────────────────────────────────────────────────────────────────────────┘
|
|
576
|
+
|
|
577
|
+
┌─ Ejemplo 6: Regresión Lineal Múltiple ──────────────────────────────────┐
|
|
578
|
+
│ # Regresión múltiple: salario ~ experiencia + edad + educacion │
|
|
579
|
+
│ modelo = stats.linear_regression( │
|
|
580
|
+
│ y='salario', │
|
|
581
|
+
│ X=['experiencia', 'edad', 'educacion'], │
|
|
582
|
+
│ engine='statsmodels', │
|
|
583
|
+
│ fit_intercept=True, │
|
|
584
|
+
│ handle_missing='drop' │
|
|
585
|
+
│ ) │
|
|
586
|
+
│ │
|
|
587
|
+
│ print(modelo.summary()) │
|
|
588
|
+
│ │
|
|
589
|
+
│ # Hacer predicciones │
|
|
590
|
+
│ import numpy as np │
|
|
591
|
+
│ X_nuevo = np.array([[5, 30, 16], [10, 35, 18]]) # experiencia, edad │
|
|
592
|
+
│ predicciones = modelo.predict(X_nuevo) │
|
|
593
|
+
└─────────────────────────────────────────────────────────────────────────┘
|
|
594
|
+
|
|
595
|
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
596
|
+
|
|
597
|
+
🎯 CARACTERÍSTICAS CLAVE:
|
|
598
|
+
|
|
599
|
+
✓ Análisis univariado completo
|
|
600
|
+
✓ Análisis multivariado (correlación, covarianza)
|
|
601
|
+
✓ Detección de outliers con múltiples métodos
|
|
602
|
+
✓ Regresión lineal con statsmodels o scikit-learn
|
|
603
|
+
✓ Manejo automático de valores faltantes
|
|
604
|
+
✓ Soporte para pandas DataFrame y numpy arrays
|
|
605
|
+
✓ Salidas formateadas profesionales
|
|
606
|
+
✓ Visualizaciones opcionales
|
|
607
|
+
|
|
608
|
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
609
|
+
|
|
610
|
+
📚 DOCUMENTACIÓN ADICIONAL:
|
|
611
|
+
Para más información sobre métodos específicos, use:
|
|
612
|
+
help(DescriptiveStats.nombre_metodo)
|
|
613
|
+
|
|
614
|
+
╚════════════════════════════════════════════════════════════════════════════╝
|
|
615
|
+
"""
|
|
616
|
+
case "en-US":
|
|
617
|
+
# --- TODO: translation still incomplete
|
|
618
|
+
help_text = """
|
|
619
|
+
╔════════════════════════════════════════════════════════════════════════════╗
|
|
620
|
+
║ 📊 DescriptiveStats CLASS - COMPLETE HELP ║
|
|
621
|
+
╚════════════════════════════════════════════════════════════════════════════╝
|
|
622
|
+
|
|
623
|
+
📝 DESCRIPTION:
|
|
624
|
+
Class for univariate and multivariate descriptive statistical analysis.
|
|
625
|
+
Provides tools for exploratory data analysis, measures of
|
|
626
|
+
central tendency, dispersion, shape of distribution and linear regression.
|
|
627
|
+
|
|
628
|
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
629
|
+
|
|
630
|
+
📋 MAIN METHODS:
|
|
631
|
+
|
|
632
|
+
┌────────────────────────────────────────────────────────────────────────────┐
|
|
633
|
+
│ 1. 📊 UNIVARIATE STATISTICS │
|
|
634
|
+
└────────────────────────────────────────────────────────────────────────────┘
|
|
635
|
+
|
|
636
|
+
🔹 Measures of Central Tendency:
|
|
637
|
+
• .mean(column=None) → Arithmetic mean
|
|
638
|
+
• .median(column=None) → Median (center value)
|
|
639
|
+
• .mode(column=None) → Mode (most frequent value)
|
|
640
|
+
|
|
641
|
+
🔹 Dispersion Measurements:
|
|
642
|
+
• .std(column=None) → Standard deviation
|
|
643
|
+
• .variance(column=None) → Variance
|
|
644
|
+
• .quantile(q, column=None) → Quantiles/Percentiles
|
|
645
|
+
|
|
646
|
+
🔹 Shape Measurements:
|
|
647
|
+
• .skewness(column=None) → Skewness (asymmetry)
|
|
648
|
+
• .kurtosis(column=None) → Kurtosis (peakedness)
|
|
649
|
+
|
|
650
|
+
🔹 Outlier Detection:
|
|
651
|
+
• .outliers(column, method='iqr', threshold=1.5)
|
|
652
|
+
Methods: 'iqr' (interquartile range) or 'zscore' (z-score)
|
|
653
|
+
|
|
654
|
+
┌────────────────────────────────────────────────────────────────────────────┐
|
|
655
|
+
│ 2. 🔗 MULTIVARIATE STATISTICS │
|
|
656
|
+
└────────────────────────────────────────────────────────────────────────────┘
|
|
657
|
+
|
|
658
|
+
🔹 .correlation(method='pearson', columns=None)
|
|
659
|
+
Correlation matrix between variables
|
|
660
|
+
Methods: 'pearson', 'spearman', 'kendall'
|
|
661
|
+
|
|
662
|
+
🔹 .covariance(columns=None)
|
|
663
|
+
Covariance matrix between variables
|
|
664
|
+
|
|
665
|
+
┌────────────────────────────────────────────────────────────────────────────┐
|
|
666
|
+
│ 3. 📋 COMPLETE SUMMARY │
|
|
667
|
+
└────────────────────────────────────────────────────────────────────────────┘
|
|
668
|
+
|
|
669
|
+
🔹 .summary(columns=None, show_plot=False, plot_backend='seaborn')
|
|
670
|
+
Complete descriptive summary with all statistics
|
|
671
|
+
|
|
672
|
+
Includes: count, mean, median, mode, std. dev., variance,
|
|
673
|
+
minimum, Q1, Q3, maximum, IQR, skewness, kurtosis
|
|
674
|
+
|
|
675
|
+
🔹 .summary().to_dataframe(format)
|
|
676
|
+
Format:
|
|
677
|
+
- Wide
|
|
678
|
+
- Long
|
|
679
|
+
- Compact
|
|
680
|
+
🔹 .summary().to_categorical_summary()
|
|
681
|
+
🔹 .summary().to_styled_df()
|
|
682
|
+
|
|
683
|
+
|
|
684
|
+
┌────────────────────────────────────────────────────────────────────────────┐
|
|
685
|
+
│ 4. 📈 LINEAR REGRESSION │
|
|
686
|
+
└────────────────────────────────────────────────────────────────────────────┘
|
|
687
|
+
|
|
688
|
+
🔹 .linear_regression(y, X, engine='statsmodels',
|
|
689
|
+
fit_intercept=True, show_plot=False,
|
|
690
|
+
plot_backend='seaborn', handle_missing='drop')
|
|
691
|
+
|
|
692
|
+
Simple or multiple linear regression with full analysis
|
|
693
|
+
|
|
694
|
+
Parameters:
|
|
695
|
+
X : Independent variable(s) (str or list)
|
|
696
|
+
y: Dependent variable (str)
|
|
697
|
+
engine: 'statsmodels' or 'scikit-learn'
|
|
698
|
+
fit_intercept : Include intercept (bool)
|
|
699
|
+
show_plot : Show diagnostic plots (bool)
|
|
700
|
+
handle_missing : 'drop', 'error', 'warn'
|
|
701
|
+
|
|
702
|
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
703
|
+
|
|
704
|
+
💡 EXAMPLES OF USE:
|
|
705
|
+
|
|
706
|
+
┌─ Example 1: Initialization ─────────────────────────────────────────────┐
|
|
707
|
+
│ import pandas as pd │
|
|
708
|
+
│ from statslibx.descriptive import DescriptiveStats │
|
|
709
|
+
│ from statslibx.datasets import load_dataset │
|
|
710
|
+
│ │
|
|
711
|
+
│ # With DataFrame │
|
|
712
|
+
│ df = load_dataset('datos.csv') │
|
|
713
|
+
│ stats = DescriptiveStats(df) │
|
|
714
|
+
│ │
|
|
715
|
+
│ # With array numpy │
|
|
716
|
+
│ import numpy as np │
|
|
717
|
+
│ datos = np.random.normal(0, 1, 1000) │
|
|
718
|
+
│ stats = DescriptiveStats(datos) │
|
|
719
|
+
└─────────────────────────────────────────────────────────────────────────┘
|
|
720
|
+
|
|
721
|
+
┌─ Example 2: Univariate Analysis ────────────────────────────────────────┐
|
|
722
|
+
│ # Statistics of a column │
|
|
723
|
+
│ mean = stats.mean('edad') │
|
|
724
|
+
│ median = stats.median('edad') │
|
|
725
|
+
│ desv_est = stats.std('edad') │
|
|
726
|
+
│ │
|
|
727
|
+
│ # Quartiles │
|
|
728
|
+
│ q25 = stats.quantile(0.25, 'edad') │
|
|
729
|
+
│ q75 = stats.quantile(0.75, 'edad') │
|
|
730
|
+
│ │
|
|
731
|
+
│ # Detect outliers │
|
|
732
|
+
│ outliers_mask = stats.outliers('edad', method='iqr', threshold=1.5) │
|
|
733
|
+
│ print(f"Outliers detected: {outliers_mask.sum()}") │
|
|
734
|
+
└─────────────────────────────────────────────────────────────────────────┘
|
|
735
|
+
|
|
736
|
+
┌─ Example 3: Complete Summary ───────────────────────────────────────────┐
|
|
737
|
+
│ # Summary of all numerical variables │
|
|
738
|
+
│ summary = stats.summary() │
|
|
739
|
+
│ print(summary) │
|
|
740
|
+
│ │
|
|
741
|
+
│ # Resumen de columnas específicas con visualización │
|
|
742
|
+
│ resumen = stats.summary( │
|
|
743
|
+
│ columns=['edad', 'salario', 'experiencia'], │
|
|
744
|
+
│ show_plot=True, │
|
|
745
|
+
│ plot_backend='seaborn' │
|
|
746
|
+
│ ) │
|
|
747
|
+
└─────────────────────────────────────────────────────────────────────────┘
|
|
748
|
+
|
|
749
|
+
┌─ Ejemplo 4: Análisis Multivariado ──────────────────────────────────────┐
|
|
750
|
+
│ # Matriz de correlación │
|
|
751
|
+
│ corr_pearson = stats.correlation(method='pearson') │
|
|
752
|
+
│ corr_spearman = stats.correlation(method='spearman') │
|
|
753
|
+
│ │
|
|
754
|
+
│ # Matriz de covarianza │
|
|
755
|
+
│ cov_matrix = stats.covariance() │
|
|
756
|
+
│ │
|
|
757
|
+
│ # Correlación entre variables específicas │
|
|
758
|
+
│ corr_subset = stats.correlation( │
|
|
759
|
+
│ method='pearson', │
|
|
760
|
+
│ columns=['edad', 'salario', 'experiencia'] │
|
|
761
|
+
│ ) │
|
|
762
|
+
└──────────────────────────────────────────────────────────────────────────┘
|
|
763
|
+
|
|
764
|
+
┌─ Ejemplo 5: Regresión Lineal Simple ────────────────────────────────────┐
|
|
765
|
+
│ # Regresión simple: salario ~ experiencia │
|
|
766
|
+
│ modelo = stats.linear_regression( │
|
|
767
|
+
│ y='salario', │
|
|
768
|
+
│ X='experiencia', │
|
|
769
|
+
│ engine='statsmodels', │
|
|
770
|
+
│ show_plot=True │
|
|
771
|
+
│ ) │
|
|
772
|
+
│ │
|
|
773
|
+
│ # Ver resultados │
|
|
774
|
+
│ print(modelo.summary()) │
|
|
775
|
+
│ │
|
|
776
|
+
│ # Acceder a coeficientes │
|
|
777
|
+
│ print(f"Intercepto: {modelo.intercept_}") │
|
|
778
|
+
│ print(f"Pendiente: {modelo.coef_[0]}") │
|
|
779
|
+
│ print(f"R²: {modelo.r_squared}") │
|
|
780
|
+
└──────────────────────────────────────────────────────────────────────────┘
|
|
781
|
+
|
|
782
|
+
┌─ Ejemplo 6: Regresión Lineal Múltiple ──────────────────────────────────┐
|
|
783
|
+
│ # Regresión múltiple: salario ~ experiencia + edad + educacion │
|
|
784
|
+
│ modelo = stats.linear_regression( │
|
|
785
|
+
│ y='salario', │
|
|
786
|
+
│ X=['experiencia', 'edad', 'educacion'], │
|
|
787
|
+
│ engine='statsmodels', │
|
|
788
|
+
│ fit_intercept=True, │
|
|
789
|
+
│ handle_missing='drop' │
|
|
790
|
+
│ ) │
|
|
791
|
+
│ │
|
|
792
|
+
│ print(modelo.summary()) │
|
|
793
|
+
│ │
|
|
794
|
+
│ # Hacer predicciones │
|
|
795
|
+
│ import numpy as np │
|
|
796
|
+
│ X_nuevo = np.array([[5, 30, 16], [10, 35, 18]]) # experiencia, edad │
|
|
797
|
+
│ predicciones = modelo.predict(X_nuevo) │
|
|
798
|
+
└──────────────────────────────────────────────────────────────────────────┘
|
|
523
799
|
|
|
524
800
|
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
525
801
|
|
|
526
802
|
🎯 CARACTERÍSTICAS CLAVE:
|
|
527
803
|
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
804
|
+
✓ Análisis univariado completo
|
|
805
|
+
✓ Análisis multivariado (correlación, covarianza)
|
|
806
|
+
✓ Detección de outliers con múltiples métodos
|
|
807
|
+
✓ Regresión lineal con statsmodels o scikit-learn
|
|
808
|
+
✓ Manejo automático de valores faltantes
|
|
809
|
+
✓ Soporte para pandas DataFrame y numpy arrays
|
|
810
|
+
✓ Salidas formateadas profesionales
|
|
811
|
+
✓ Visualizaciones opcionales
|
|
536
812
|
|
|
537
813
|
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
538
814
|
|
|
539
815
|
📚 DOCUMENTACIÓN ADICIONAL:
|
|
540
|
-
|
|
541
|
-
|
|
816
|
+
Para más información sobre métodos específicos, use:
|
|
817
|
+
help(DescriptiveStats.nombre_metodo)
|
|
542
818
|
|
|
543
819
|
╚════════════════════════════════════════════════════════════════════════════╝
|
|
544
820
|
"""
|
|
821
|
+
|
|
545
822
|
print(help_text)
|
|
823
|
+
|
|
824
|
+
|
|
546
825
|
|
|
547
826
|
class DescriptiveSummary:
|
|
548
827
|
"""Clase para formatear salida de estadística descriptiva"""
|
|
@@ -599,14 +878,151 @@ class DescriptiveSummary:
|
|
|
599
878
|
output.append("=" * 100)
|
|
600
879
|
return "\n".join(output)
|
|
601
880
|
|
|
881
|
+
def to_dataframe(self, format='wide'):
    """
    Return the summary results as a DataFrame in the requested layout.

    Parameters:
    -----------
    format : str, default 'wide'
        - 'wide': variables as columns, statistics as rows
        - 'long': long format (variable, statistic, value)
        - 'compact': variables as rows, statistics as columns

    Raises:
    -------
    ValueError
        If ``format`` is not one of the three supported layouts.
    """
    # Layout name -> name of the private builder method, checked in order.
    layout_builders = (
        ('wide', '_to_wide_df'),
        ('long', '_to_long_df'),
        ('compact', '_to_compact_df'),
    )
    for layout, builder_name in layout_builders:
        if format == layout:
            return getattr(self, builder_name)()

    raise ValueError("format debe ser 'wide', 'long' o 'compact'")
|
|
900
|
+
|
|
901
|
+
def _to_wide_df(self):
|
|
902
|
+
"""
|
|
903
|
+
Formato ancho: Variables en columnas, estadísticas en filas.
|
|
904
|
+
|
|
905
|
+
Ejemplo:
|
|
906
|
+
Variable1 Variable2 Variable3
|
|
907
|
+
count 150.0 150.0 150.0
|
|
908
|
+
mean 5.8 3.1 3.8
|
|
909
|
+
median 5.8 3.0 4.0
|
|
910
|
+
...
|
|
911
|
+
"""
|
|
912
|
+
df = pd.DataFrame(self.results)
|
|
913
|
+
|
|
914
|
+
# Ordenar índice por categorías
|
|
915
|
+
order = [
|
|
916
|
+
'count', 'mean', 'median', 'mode', # Tendencia central
|
|
917
|
+
'std', 'variance', 'iqr', # Dispersión
|
|
918
|
+
'min', 'q1', 'q3', 'max', # Cuartiles
|
|
919
|
+
'skewness', 'kurtosis' # Forma
|
|
920
|
+
]
|
|
921
|
+
|
|
922
|
+
# Reordenar filas según el orden definido
|
|
923
|
+
df = df.reindex([stat for stat in order if stat in df.index])
|
|
924
|
+
|
|
925
|
+
return df
|
|
926
|
+
|
|
927
|
+
def _to_compact_df(self):
|
|
928
|
+
"""
|
|
929
|
+
Formato compacto: Variables en filas, estadísticas en columnas.
|
|
930
|
+
|
|
931
|
+
Ejemplo:
|
|
932
|
+
count mean median mode std variance ...
|
|
933
|
+
Var1 150.0 5.8 5.8 5.0 0.8 0.68 ...
|
|
934
|
+
Var2 150.0 3.1 3.0 3.0 0.4 0.19 ...
|
|
935
|
+
Var3 150.0 3.8 4.0 1.0 1.8 3.11 ...
|
|
936
|
+
"""
|
|
937
|
+
df_data = []
|
|
938
|
+
|
|
939
|
+
for var_name, stats in self.results.items():
|
|
940
|
+
row = {'Variable': var_name}
|
|
941
|
+
row.update(stats)
|
|
942
|
+
df_data.append(row)
|
|
943
|
+
|
|
944
|
+
df = pd.DataFrame(df_data)
|
|
945
|
+
df = df.set_index('Variable')
|
|
946
|
+
|
|
947
|
+
# Ordenar columnas por categorías
|
|
948
|
+
order = [
|
|
949
|
+
'count', 'mean', 'median', 'mode',
|
|
950
|
+
'std', 'variance', 'iqr',
|
|
951
|
+
'min', 'q1', 'q3', 'max',
|
|
952
|
+
'skewness', 'kurtosis'
|
|
953
|
+
]
|
|
954
|
+
|
|
955
|
+
df = df[[col for col in order if col in df.columns]]
|
|
956
|
+
|
|
957
|
+
return df
|
|
958
|
+
|
|
959
|
+
def _to_long_df(self):
|
|
960
|
+
"""
|
|
961
|
+
Formato largo: Una fila por cada combinación variable-estadística.
|
|
962
|
+
|
|
963
|
+
Ejemplo:
|
|
964
|
+
Variable Estadistica Valor
|
|
965
|
+
0 Var1 count 150.00
|
|
966
|
+
1 Var1 mean 5.84
|
|
967
|
+
2 Var1 median 5.80
|
|
968
|
+
...
|
|
969
|
+
"""
|
|
970
|
+
data = []
|
|
971
|
+
|
|
972
|
+
for var_name, stats in self.results.items():
|
|
973
|
+
for stat_name, value in stats.items():
|
|
974
|
+
data.append({
|
|
975
|
+
'Variable': var_name,
|
|
976
|
+
'Estadistica': stat_name,
|
|
977
|
+
'Valor': value
|
|
978
|
+
})
|
|
979
|
+
|
|
980
|
+
return pd.DataFrame(data)
|
|
981
|
+
|
|
982
|
+
def to_styled_df(self):
|
|
983
|
+
"""
|
|
984
|
+
Devuelve un DataFrame con formato wide y estilo aplicado.
|
|
985
|
+
Útil para notebooks de Jupyter.
|
|
986
|
+
"""
|
|
987
|
+
df = self._to_wide_df()
|
|
988
|
+
|
|
989
|
+
styled = df.style.format("{:.4f}") \
|
|
990
|
+
.background_gradient(cmap='YlOrRd', axis=1) \
|
|
991
|
+
.set_caption(f"Estadística Descriptiva - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
|
992
|
+
|
|
993
|
+
return styled
|
|
994
|
+
|
|
995
|
+
def to_categorical_summary(self):
    """
    Split the wide-format table into one DataFrame per statistic category.

    Statistics missing from the results are left out of their category
    instead of raising KeyError, matching how the wide table itself only
    keeps the statistics that are present.

    Returns:
    --------
    dict of DataFrames
        Keys are 'Tendencia Central', 'Dispersión', 'Cuartiles' and 'Forma'.
    """
    categories = {
        'Tendencia Central': ['count', 'mean', 'median', 'mode'],
        'Dispersión': ['std', 'variance', 'iqr'],
        'Cuartiles': ['min', 'q1', 'q3', 'max'],
        'Forma': ['skewness', 'kurtosis'],
    }

    df_wide = self._to_wide_df()

    return {
        title: df_wide.loc[[stat for stat in stats if stat in df_wide.index]]
        for title, stats in categories.items()
    }
|
|
1011
|
+
|
|
602
1012
|
|
|
603
1013
|
import numpy as np
|
|
604
1014
|
from datetime import datetime
|
|
605
1015
|
|
|
606
1016
|
|
|
1017
|
+
import numpy as np
|
|
1018
|
+
import pandas as pd
|
|
1019
|
+
from datetime import datetime
|
|
1020
|
+
import matplotlib.pyplot as plt
|
|
1021
|
+
import seaborn as sns
|
|
1022
|
+
|
|
607
1023
|
class LinearRegressionResult:
|
|
608
1024
|
"""Clase para resultados de regresión lineal"""
|
|
609
|
-
|
|
1025
|
+
|
|
610
1026
|
def __init__(self, X, y, X_names, y_name, engine='statsmodels', fit_intercept=True):
|
|
611
1027
|
self.X = X
|
|
612
1028
|
self.y = y
|
|
@@ -618,7 +1034,7 @@ class LinearRegressionResult:
|
|
|
618
1034
|
self.results = None
|
|
619
1035
|
self.show_plot = False
|
|
620
1036
|
self.plot_backend = 'seaborn'
|
|
621
|
-
|
|
1037
|
+
|
|
622
1038
|
# Atributos que se llenarán después del fit
|
|
623
1039
|
self.coef_ = None
|
|
624
1040
|
self.intercept_ = None
|
|
@@ -633,7 +1049,7 @@ class LinearRegressionResult:
|
|
|
633
1049
|
self.std_errors = None
|
|
634
1050
|
self.t_values = None
|
|
635
1051
|
self.p_values = None
|
|
636
|
-
|
|
1052
|
+
|
|
637
1053
|
def fit(self):
|
|
638
1054
|
"""Ajustar el modelo"""
|
|
639
1055
|
if self.engine == 'statsmodels':
|
|
@@ -643,7 +1059,7 @@ class LinearRegressionResult:
|
|
|
643
1059
|
X = sm.add_constant(X)
|
|
644
1060
|
self.model = sm.OLS(self.y, X)
|
|
645
1061
|
self.results = self.model.fit()
|
|
646
|
-
|
|
1062
|
+
|
|
647
1063
|
# Extraer atributos
|
|
648
1064
|
if self.fit_intercept:
|
|
649
1065
|
self.intercept_ = self.results.params[0]
|
|
@@ -657,7 +1073,7 @@ class LinearRegressionResult:
|
|
|
657
1073
|
self.std_errors = self.results.bse
|
|
658
1074
|
self.t_values = self.results.tvalues
|
|
659
1075
|
self.p_values = self.results.pvalues
|
|
660
|
-
|
|
1076
|
+
|
|
661
1077
|
self.r_squared = self.results.rsquared
|
|
662
1078
|
self.adj_r_squared = self.results.rsquared_adj
|
|
663
1079
|
self.f_statistic = self.results.fvalue
|
|
@@ -666,24 +1082,24 @@ class LinearRegressionResult:
|
|
|
666
1082
|
self.bic = self.results.bic
|
|
667
1083
|
self.residuals = self.results.resid
|
|
668
1084
|
self.predictions = self.results.fittedvalues
|
|
669
|
-
|
|
1085
|
+
|
|
670
1086
|
else: # scikit-learn
|
|
671
1087
|
from sklearn.linear_model import LinearRegression
|
|
672
1088
|
self.model = LinearRegression(fit_intercept=self.fit_intercept)
|
|
673
1089
|
self.model.fit(self.X, self.y)
|
|
674
|
-
|
|
1090
|
+
|
|
675
1091
|
self.coef_ = self.model.coef_
|
|
676
1092
|
self.intercept_ = self.model.intercept_
|
|
677
|
-
self.r_squared = self.model.score(self.X, self.y)
|
|
678
1093
|
self.predictions = self.model.predict(self.X)
|
|
679
1094
|
self.residuals = self.y - self.predictions
|
|
680
|
-
|
|
681
|
-
|
|
1095
|
+
self.r_squared = self.model.score(self.X, self.y)
|
|
1096
|
+
|
|
1097
|
+
# Calcular R^2 ajustado
|
|
682
1098
|
n, k = self.X.shape
|
|
683
1099
|
self.adj_r_squared = 1 - (1 - self.r_squared) * (n - 1) / (n - k - 1)
|
|
684
|
-
|
|
1100
|
+
|
|
685
1101
|
return self
|
|
686
|
-
|
|
1102
|
+
|
|
687
1103
|
def predict(self, X_new):
|
|
688
1104
|
"""Hacer predicciones con nuevos datos"""
|
|
689
1105
|
if self.engine == 'statsmodels':
|
|
@@ -693,16 +1109,12 @@ class LinearRegressionResult:
|
|
|
693
1109
|
return self.results.predict(X_new)
|
|
694
1110
|
else:
|
|
695
1111
|
return self.model.predict(X_new)
|
|
696
|
-
|
|
1112
|
+
|
|
697
1113
|
def summary(self):
    """Return the OLS-style summary text by delegating to ``__repr__``."""
    report = self.__repr__()
    return report
|
|
700
|
-
|
|
1116
|
+
|
|
701
1117
|
def __repr__(self):
|
|
702
|
-
return self._format_output()
|
|
703
|
-
|
|
704
|
-
def _format_output(self):
|
|
705
|
-
"""Formato estilo OLS de statsmodels"""
|
|
706
1118
|
output = []
|
|
707
1119
|
output.append("=" * 100)
|
|
708
1120
|
output.append("RESULTADOS DE REGRESIÓN LINEAL".center(100))
|
|
@@ -712,7 +1124,7 @@ class LinearRegressionResult:
|
|
|
712
1124
|
output.append(f"Motor: {self.engine}")
|
|
713
1125
|
output.append(f"Fecha: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
|
714
1126
|
output.append("-" * 100)
|
|
715
|
-
|
|
1127
|
+
|
|
716
1128
|
# Información del modelo
|
|
717
1129
|
output.append("\nINFORMACIÓN DEL MODELO:")
|
|
718
1130
|
output.append("-" * 100)
|
|
@@ -720,24 +1132,22 @@ class LinearRegressionResult:
|
|
|
720
1132
|
output.append("-" * 100)
|
|
721
1133
|
output.append(f"{'R-cuadrado':<50} {self.r_squared:>20.6f}")
|
|
722
1134
|
output.append(f"{'R-cuadrado Ajustado':<50} {self.adj_r_squared:>20.6f}")
|
|
723
|
-
|
|
1135
|
+
|
|
724
1136
|
if self.f_statistic is not None:
|
|
725
1137
|
output.append(f"{'Estadístico F':<50} {self.f_statistic:>20.6f}")
|
|
726
1138
|
output.append(f"{'Prob (F-estadístico)':<50} {self.f_pvalue:>20.6e}")
|
|
727
|
-
|
|
1139
|
+
|
|
728
1140
|
if self.aic is not None:
|
|
729
1141
|
output.append(f"{'AIC':<50} {self.aic:>20.6f}")
|
|
730
1142
|
output.append(f"{'BIC':<50} {self.bic:>20.6f}")
|
|
731
|
-
|
|
1143
|
+
|
|
732
1144
|
# Coeficientes
|
|
733
1145
|
output.append("\nCOEFICIENTES:")
|
|
734
1146
|
output.append("-" * 100)
|
|
735
|
-
|
|
736
1147
|
if self.std_errors is not None:
|
|
737
1148
|
output.append(f"{'Variable':<20} {'Coef.':>15} {'Std Err':>15} {'t':>15} {'P>|t|':>15}")
|
|
738
1149
|
output.append("-" * 100)
|
|
739
1150
|
output.append(f"{'const':<20} {self.intercept_:>15.6f} {'-':>15} {'-':>15} {'-':>15}")
|
|
740
|
-
|
|
741
1151
|
for i, name in enumerate(self.X_names):
|
|
742
1152
|
output.append(
|
|
743
1153
|
f"{name:<20} {self.coef_[i]:>15.6f} {self.std_errors[i]:>15.6f} "
|
|
@@ -747,10 +1157,9 @@ class LinearRegressionResult:
|
|
|
747
1157
|
output.append(f"{'Variable':<20} {'Coeficiente':>20}")
|
|
748
1158
|
output.append("-" * 100)
|
|
749
1159
|
output.append(f"{'const':<20} {self.intercept_:>20.6f}")
|
|
750
|
-
|
|
751
1160
|
for i, name in enumerate(self.X_names):
|
|
752
1161
|
output.append(f"{name:<20} {self.coef_[i]:>20.6f}")
|
|
753
|
-
|
|
1162
|
+
|
|
754
1163
|
# Análisis de residuos
|
|
755
1164
|
output.append("\nANÁLISIS DE RESIDUOS:")
|
|
756
1165
|
output.append("-" * 100)
|
|
@@ -760,10 +1169,31 @@ class LinearRegressionResult:
|
|
|
760
1169
|
output.append(f"{'Desv. Std. de Residuos':<50} {np.std(self.residuals):>20.6f}")
|
|
761
1170
|
output.append(f"{'Mínimo Residuo':<50} {np.min(self.residuals):>20.6f}")
|
|
762
1171
|
output.append(f"{'Máximo Residuo':<50} {np.max(self.residuals):>20.6f}")
|
|
763
|
-
|
|
764
1172
|
output.append("=" * 100)
|
|
765
|
-
|
|
1173
|
+
|
|
766
1174
|
if self.show_plot:
|
|
1175
|
+
self.plot()
|
|
767
1176
|
output.append("\n[Gráficos diagnósticos generados]")
|
|
768
|
-
|
|
769
|
-
return "\n".join(output)
|
|
1177
|
+
|
|
1178
|
+
return "\n".join(output)
|
|
1179
|
+
|
|
1180
|
+
def plot(self):
    """
    Draw and show a diagnostic figure for the model.

    With exactly one predictor, a seaborn scatter plot with its regression
    line (no confidence band) is shown. With several predictors, only a
    residuals-vs-predictions scatter with a dashed zero line is shown.

    Reads ``self.predictions`` (and ``self.residuals`` in the multi-predictor
    case), so those attributes must already be populated.
    """
    if len(self.X_names) == 1:
        predictor = self.X_names[0]
        # Scatter + regression line.
        # np.asarray(...).ravel() instead of self.X.flatten(): flatten() is
        # an ndarray-only method, so a single-column DataFrame, a Series or
        # a plain list of predictors would raise AttributeError here.
        df_plot = pd.DataFrame({
            predictor: np.asarray(self.X).ravel(),
            self.y_name: self.y,
            'Predicciones': self.predictions
        })
        sns.lmplot(x=predictor, y=self.y_name, data=df_plot, ci=None)
        plt.title(f"Regresión lineal: {self.y_name} ~ {predictor}")
        plt.show()
    else:
        # Multiple regression: only the residuals-vs-predictions plot
        plt.scatter(self.predictions, self.residuals)
        plt.axhline(0, color='red', linestyle='--')
        plt.xlabel("Predicciones")
        plt.ylabel("Residuos")
        plt.title("Residuos vs Predicciones")
        plt.show()
|