statslibx 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- statslibx/__init__.py +2 -2
- statslibx/datasets/__init__.py +1 -0
- statslibx/datasets/course_completion.csv +100001 -0
- statslibx/descriptive.py +274 -148
- statslibx/inferential.py +139 -72
- statslibx/utils.py +288 -82
- {statslibx-0.1.4.dist-info → statslibx-0.1.6.dist-info}/METADATA +1 -1
- statslibx-0.1.6.dist-info/RECORD +14 -0
- statslibx-0.1.4.dist-info/RECORD +0 -13
- {statslibx-0.1.4.dist-info → statslibx-0.1.6.dist-info}/WHEEL +0 -0
- {statslibx-0.1.4.dist-info → statslibx-0.1.6.dist-info}/top_level.txt +0 -0
statslibx/descriptive.py
CHANGED
|
@@ -2,6 +2,13 @@ import numpy as np
|
|
|
2
2
|
import pandas as pd
|
|
3
3
|
from typing import Optional, Union, Literal, List
|
|
4
4
|
from datetime import datetime
|
|
5
|
+
import flet as ft
|
|
6
|
+
import os
|
|
7
|
+
import matplotlib.pyplot as plt
|
|
8
|
+
import seaborn as sns
|
|
9
|
+
import io
|
|
10
|
+
import base64
|
|
11
|
+
import plotly.express as px
|
|
5
12
|
|
|
6
13
|
class DescriptiveStats:
|
|
7
14
|
"""
|
|
@@ -20,6 +27,10 @@ class DescriptiveStats:
|
|
|
20
27
|
backend : str
|
|
21
28
|
'pandas' o 'polars' para procesamiento
|
|
22
29
|
"""
|
|
30
|
+
|
|
31
|
+
if isinstance(data, str) and os.path.exists(data):
|
|
32
|
+
data = DescriptiveStats.from_file(data).data
|
|
33
|
+
|
|
23
34
|
if isinstance(data, np.ndarray):
|
|
24
35
|
if data.ndim == 1:
|
|
25
36
|
data = pd.DataFrame({'var': data})
|
|
@@ -29,6 +40,40 @@ class DescriptiveStats:
|
|
|
29
40
|
self.data = data
|
|
30
41
|
self.backend = backend
|
|
31
42
|
self._numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
|
|
43
|
+
|
|
44
|
+
@staticmethod
|
|
45
|
+
def from_file(path: str):
|
|
46
|
+
"""
|
|
47
|
+
Carga automática de archivos y devuelve instancia de Intelligence.
|
|
48
|
+
Soporta CSV, Excel, TXT, JSON, Parquet, Feather, TSV.
|
|
49
|
+
"""
|
|
50
|
+
if not os.path.exists(path):
|
|
51
|
+
raise FileNotFoundError(f"Archivo no encontrado: {path}")
|
|
52
|
+
|
|
53
|
+
ext = os.path.splitext(path)[1].lower()
|
|
54
|
+
|
|
55
|
+
if ext == ".csv":
|
|
56
|
+
df = pd.read_csv(path)
|
|
57
|
+
|
|
58
|
+
elif ext in [".xlsx", ".xls"]:
|
|
59
|
+
df = pd.read_excel(path)
|
|
60
|
+
|
|
61
|
+
elif ext in [".txt", ".tsv"]:
|
|
62
|
+
df = pd.read_table(path)
|
|
63
|
+
|
|
64
|
+
elif ext == ".json":
|
|
65
|
+
df = pd.read_json(path)
|
|
66
|
+
|
|
67
|
+
elif ext == ".parquet":
|
|
68
|
+
df = pd.read_parquet(path)
|
|
69
|
+
|
|
70
|
+
elif ext == ".feather":
|
|
71
|
+
df = pd.read_feather(path)
|
|
72
|
+
|
|
73
|
+
else:
|
|
74
|
+
raise ValueError(f"Formato no soportado: {ext}")
|
|
75
|
+
|
|
76
|
+
return DescriptiveStats(df)
|
|
32
77
|
|
|
33
78
|
# ============= MÉTODOS UNIVARIADOS =============
|
|
34
79
|
|
|
@@ -174,137 +219,58 @@ class DescriptiveStats:
|
|
|
174
219
|
# ============= REGRESIÓN LINEAL =============
|
|
175
220
|
|
|
176
221
|
def linear_regression(self,
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
222
|
+
X: Union[str, List[str]],
|
|
223
|
+
y: str,
|
|
224
|
+
engine: Literal['statsmodels', 'scikit-learn'] = 'statsmodels',
|
|
225
|
+
fit_intercept: bool = True,
|
|
226
|
+
show_plot: bool = False,
|
|
227
|
+
plot_backend: str = 'seaborn',
|
|
228
|
+
handle_missing: Literal['drop', 'error', 'warn'] = 'drop') -> tuple:
|
|
184
229
|
"""
|
|
185
|
-
Regresión lineal simple o múltiple
|
|
186
|
-
|
|
187
|
-
Parameters:
|
|
188
|
-
-----------
|
|
189
|
-
y : str
|
|
190
|
-
Variable dependiente
|
|
191
|
-
X : str o list
|
|
192
|
-
Variable(s) independiente(s)
|
|
193
|
-
engine : str
|
|
194
|
-
'statsmodels' o 'scikit-learn'
|
|
195
|
-
fit_intercept : bool
|
|
196
|
-
Si incluir intercepto
|
|
197
|
-
show_plot : bool
|
|
198
|
-
Mostrar gráficos diagnósticos
|
|
199
|
-
plot_backend : str
|
|
200
|
-
Backend para visualización
|
|
201
|
-
|
|
202
|
-
Returns:
|
|
203
|
-
--------
|
|
204
|
-
LinearRegressionResult
|
|
205
|
-
Objeto con resultados y método summary()
|
|
230
|
+
Regresión lineal simple o múltiple con opción de mostrar gráfico.
|
|
231
|
+
Siempre devuelve un tuple: (LinearRegressionResult, figura o None)
|
|
206
232
|
"""
|
|
207
233
|
if isinstance(X, str):
|
|
208
234
|
X = [X]
|
|
209
|
-
|
|
210
|
-
# Verificar
|
|
211
|
-
missing_columns = []
|
|
212
|
-
if y not in self.data.columns:
|
|
213
|
-
missing_columns.append(y)
|
|
214
|
-
for x_col in X:
|
|
215
|
-
if x_col not in self.data.columns:
|
|
216
|
-
missing_columns.append(x_col)
|
|
217
|
-
|
|
235
|
+
|
|
236
|
+
# Verificar columnas
|
|
237
|
+
missing_columns = [col for col in [y] + X if col not in self.data.columns]
|
|
218
238
|
if missing_columns:
|
|
219
239
|
raise ValueError(f"Columnas no encontradas: {missing_columns}")
|
|
220
|
-
|
|
221
|
-
#
|
|
240
|
+
|
|
241
|
+
# Preparar datos
|
|
222
242
|
regression_data = self.data[[y] + X].copy()
|
|
223
|
-
|
|
224
|
-
# Manejar valores infinitos
|
|
225
243
|
numeric_cols = regression_data.select_dtypes(include=[np.number]).columns
|
|
226
244
|
for col in numeric_cols:
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
regression_data[col] = regression_data[col].replace([np.inf, -np.inf], np.nan)
|
|
232
|
-
|
|
233
|
-
# Manejar valores faltantes
|
|
234
|
-
missing_before = regression_data.isnull().sum()
|
|
235
|
-
total_missing = missing_before.sum()
|
|
236
|
-
|
|
237
|
-
if total_missing > 0:
|
|
238
|
-
missing_info = "\n".join([f" - {col}: {missing_before[col]} missing"
|
|
239
|
-
for col in missing_before[missing_before > 0].index])
|
|
240
|
-
|
|
245
|
+
regression_data[col] = regression_data[col].replace([np.inf, -np.inf], np.nan)
|
|
246
|
+
|
|
247
|
+
# Manejo de valores faltantes
|
|
248
|
+
if regression_data.isnull().any().any():
|
|
241
249
|
if handle_missing == 'error':
|
|
242
|
-
raise ValueError(
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
regression_data_clean = regression_data.dropna()
|
|
251
|
-
|
|
252
|
-
else:
|
|
253
|
-
raise ValueError(f"Método de manejo de missing values no reconocido: {handle_missing}")
|
|
254
|
-
|
|
255
|
-
# Informar sobre la limpieza
|
|
256
|
-
rows_before = len(regression_data)
|
|
257
|
-
rows_after = len(regression_data_clean)
|
|
258
|
-
rows_removed = rows_before - rows_after
|
|
259
|
-
|
|
260
|
-
if rows_removed > 0:
|
|
261
|
-
print(f"Limpieza de datos: {rows_removed} filas eliminadas ({rows_after} filas restantes)")
|
|
262
|
-
|
|
263
|
-
if rows_after < len(X) + 1: # +1 para el intercepto
|
|
264
|
-
raise ValueError(
|
|
265
|
-
f"Muy pocas filas después de limpieza: {rows_after}. "
|
|
266
|
-
f"Se necesitan al menos {len(X) + 1} filas para regresión."
|
|
267
|
-
)
|
|
268
|
-
else:
|
|
269
|
-
regression_data_clean = regression_data
|
|
270
|
-
|
|
271
|
-
# Extraer datos limpios
|
|
272
|
-
X_data = regression_data_clean[X].values
|
|
273
|
-
y_data = regression_data_clean[y].values
|
|
274
|
-
|
|
275
|
-
# Validar que los datos son numéricos
|
|
276
|
-
if not np.issubdtype(X_data.dtype, np.number):
|
|
277
|
-
raise ValueError("Las variables independientes deben ser numéricas")
|
|
278
|
-
if not np.issubdtype(y_data.dtype, np.number):
|
|
279
|
-
raise ValueError("La variable dependiente debe ser numérica")
|
|
280
|
-
|
|
281
|
-
# Validar que no hay más missing values
|
|
282
|
-
if np.isnan(X_data).any() or np.isnan(y_data).any():
|
|
283
|
-
raise ValueError("Todavía hay valores NaN después de la limpieza")
|
|
284
|
-
|
|
285
|
-
# Validar que no hay valores infinitos
|
|
286
|
-
if np.isinf(X_data).any() or np.isinf(y_data).any():
|
|
287
|
-
raise ValueError("Todavía hay valores infinitos después de la limpieza")
|
|
288
|
-
|
|
289
|
-
# Crear y ajustar el modelo
|
|
290
|
-
result = LinearRegressionResult(
|
|
291
|
-
X_data, y_data, X, y,
|
|
292
|
-
engine=engine,
|
|
293
|
-
fit_intercept=fit_intercept
|
|
294
|
-
)
|
|
250
|
+
raise ValueError("Datos contienen valores faltantes")
|
|
251
|
+
regression_data = regression_data.dropna()
|
|
252
|
+
|
|
253
|
+
X_data = regression_data[X].values
|
|
254
|
+
y_data = regression_data[y].values
|
|
255
|
+
|
|
256
|
+
# Ajustar modelo
|
|
257
|
+
result = LinearRegressionResult(X_data, y_data, X, y, engine=engine, fit_intercept=fit_intercept)
|
|
295
258
|
result.fit()
|
|
296
259
|
result.show_plot = show_plot
|
|
297
260
|
result.plot_backend = plot_backend
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
261
|
+
|
|
262
|
+
figura = None
|
|
263
|
+
# Graficar si es regresión simple
|
|
264
|
+
if show_plot and len(X) == 1 and plot_backend.lower() == 'seaborn':
|
|
265
|
+
import matplotlib.pyplot as plt
|
|
266
|
+
g = sns.lmplot(x=X[0], y=y, data=regression_data, ci=None)
|
|
267
|
+
g.figure.suptitle(f"Regresión lineal: {y} ~ {X[0]}", y=1.02)
|
|
268
|
+
plt.tight_layout()
|
|
269
|
+
figura = g.figure
|
|
270
|
+
|
|
271
|
+
return result, figura
|
|
272
|
+
|
|
273
|
+
|
|
308
274
|
|
|
309
275
|
def help(self):
|
|
310
276
|
"""
|
|
@@ -366,6 +332,15 @@ class DescriptiveStats:
|
|
|
366
332
|
|
|
367
333
|
Incluye: conteo, media, mediana, moda, desv. est., varianza,
|
|
368
334
|
mínimo, Q1, Q3, máximo, IQR, asimetría, curtosis
|
|
335
|
+
• .summary().to_dataframe(format)
|
|
336
|
+
Format:
|
|
337
|
+
- Wide
|
|
338
|
+
- Long
|
|
339
|
+
- Compact
|
|
340
|
+
|
|
341
|
+
• .summary().to_categorical_summary()
|
|
342
|
+
• .summary().to_styled_df()
|
|
343
|
+
|
|
369
344
|
|
|
370
345
|
┌────────────────────────────────────────────────────────────────────────────┐
|
|
371
346
|
│ 4. 📈 REGRESIÓN LINEAL │
|
|
@@ -560,14 +535,151 @@ class DescriptiveSummary:
|
|
|
560
535
|
output.append("=" * 100)
|
|
561
536
|
return "\n".join(output)
|
|
562
537
|
|
|
538
|
+
def to_dataframe(self, format='wide'):
|
|
539
|
+
"""
|
|
540
|
+
Convierte los resultados a DataFrame.
|
|
541
|
+
|
|
542
|
+
Parameters:
|
|
543
|
+
-----------
|
|
544
|
+
format : str, default 'wide'
|
|
545
|
+
- 'wide': Variables en columnas, estadísticas en filas
|
|
546
|
+
- 'long': Formato largo (variable, estadística, valor)
|
|
547
|
+
- 'compact': Variables en filas, estadísticas en columnas
|
|
548
|
+
"""
|
|
549
|
+
if format == 'wide':
|
|
550
|
+
return self._to_wide_df()
|
|
551
|
+
elif format == 'long':
|
|
552
|
+
return self._to_long_df()
|
|
553
|
+
elif format == 'compact':
|
|
554
|
+
return self._to_compact_df()
|
|
555
|
+
else:
|
|
556
|
+
raise ValueError("format debe ser 'wide', 'long' o 'compact'")
|
|
557
|
+
|
|
558
|
+
def _to_wide_df(self):
|
|
559
|
+
"""
|
|
560
|
+
Formato ancho: Variables en columnas, estadísticas en filas.
|
|
561
|
+
|
|
562
|
+
Ejemplo:
|
|
563
|
+
Variable1 Variable2 Variable3
|
|
564
|
+
count 150.0 150.0 150.0
|
|
565
|
+
mean 5.8 3.1 3.8
|
|
566
|
+
median 5.8 3.0 4.0
|
|
567
|
+
...
|
|
568
|
+
"""
|
|
569
|
+
df = pd.DataFrame(self.results)
|
|
570
|
+
|
|
571
|
+
# Ordenar índice por categorías
|
|
572
|
+
order = [
|
|
573
|
+
'count', 'mean', 'median', 'mode', # Tendencia central
|
|
574
|
+
'std', 'variance', 'iqr', # Dispersión
|
|
575
|
+
'min', 'q1', 'q3', 'max', # Cuartiles
|
|
576
|
+
'skewness', 'kurtosis' # Forma
|
|
577
|
+
]
|
|
578
|
+
|
|
579
|
+
# Reordenar filas según el orden definido
|
|
580
|
+
df = df.reindex([stat for stat in order if stat in df.index])
|
|
581
|
+
|
|
582
|
+
return df
|
|
583
|
+
|
|
584
|
+
def _to_compact_df(self):
|
|
585
|
+
"""
|
|
586
|
+
Formato compacto: Variables en filas, estadísticas en columnas.
|
|
587
|
+
|
|
588
|
+
Ejemplo:
|
|
589
|
+
count mean median mode std variance ...
|
|
590
|
+
Var1 150.0 5.8 5.8 5.0 0.8 0.68 ...
|
|
591
|
+
Var2 150.0 3.1 3.0 3.0 0.4 0.19 ...
|
|
592
|
+
Var3 150.0 3.8 4.0 1.0 1.8 3.11 ...
|
|
593
|
+
"""
|
|
594
|
+
df_data = []
|
|
595
|
+
|
|
596
|
+
for var_name, stats in self.results.items():
|
|
597
|
+
row = {'Variable': var_name}
|
|
598
|
+
row.update(stats)
|
|
599
|
+
df_data.append(row)
|
|
600
|
+
|
|
601
|
+
df = pd.DataFrame(df_data)
|
|
602
|
+
df = df.set_index('Variable')
|
|
603
|
+
|
|
604
|
+
# Ordenar columnas por categorías
|
|
605
|
+
order = [
|
|
606
|
+
'count', 'mean', 'median', 'mode',
|
|
607
|
+
'std', 'variance', 'iqr',
|
|
608
|
+
'min', 'q1', 'q3', 'max',
|
|
609
|
+
'skewness', 'kurtosis'
|
|
610
|
+
]
|
|
611
|
+
|
|
612
|
+
df = df[[col for col in order if col in df.columns]]
|
|
613
|
+
|
|
614
|
+
return df
|
|
615
|
+
|
|
616
|
+
def _to_long_df(self):
|
|
617
|
+
"""
|
|
618
|
+
Formato largo: Una fila por cada combinación variable-estadística.
|
|
619
|
+
|
|
620
|
+
Ejemplo:
|
|
621
|
+
Variable Estadistica Valor
|
|
622
|
+
0 Var1 count 150.00
|
|
623
|
+
1 Var1 mean 5.84
|
|
624
|
+
2 Var1 median 5.80
|
|
625
|
+
...
|
|
626
|
+
"""
|
|
627
|
+
data = []
|
|
628
|
+
|
|
629
|
+
for var_name, stats in self.results.items():
|
|
630
|
+
for stat_name, value in stats.items():
|
|
631
|
+
data.append({
|
|
632
|
+
'Variable': var_name,
|
|
633
|
+
'Estadistica': stat_name,
|
|
634
|
+
'Valor': value
|
|
635
|
+
})
|
|
636
|
+
|
|
637
|
+
return pd.DataFrame(data)
|
|
638
|
+
|
|
639
|
+
def to_styled_df(self):
|
|
640
|
+
"""
|
|
641
|
+
Devuelve un DataFrame con formato wide y estilo aplicado.
|
|
642
|
+
Útil para notebooks de Jupyter.
|
|
643
|
+
"""
|
|
644
|
+
df = self._to_wide_df()
|
|
645
|
+
|
|
646
|
+
styled = df.style.format("{:.4f}") \
|
|
647
|
+
.background_gradient(cmap='YlOrRd', axis=1) \
|
|
648
|
+
.set_caption(f"Estadística Descriptiva - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
|
649
|
+
|
|
650
|
+
return styled
|
|
651
|
+
|
|
652
|
+
def to_categorical_summary(self):
|
|
653
|
+
"""
|
|
654
|
+
Crea un resumen organizado por categorías de estadísticas.
|
|
655
|
+
|
|
656
|
+
Returns:
|
|
657
|
+
--------
|
|
658
|
+
dict of DataFrames
|
|
659
|
+
"""
|
|
660
|
+
df_wide = self._to_wide_df()
|
|
661
|
+
|
|
662
|
+
return {
|
|
663
|
+
'Tendencia Central': df_wide.loc[['count', 'mean', 'median', 'mode']],
|
|
664
|
+
'Dispersión': df_wide.loc[['std', 'variance', 'iqr']],
|
|
665
|
+
'Cuartiles': df_wide.loc[['min', 'q1', 'q3', 'max']],
|
|
666
|
+
'Forma': df_wide.loc[['skewness', 'kurtosis']]
|
|
667
|
+
}
|
|
668
|
+
|
|
563
669
|
|
|
564
670
|
import numpy as np
|
|
565
671
|
from datetime import datetime
|
|
566
672
|
|
|
567
673
|
|
|
674
|
+
import numpy as np
|
|
675
|
+
import pandas as pd
|
|
676
|
+
from datetime import datetime
|
|
677
|
+
import matplotlib.pyplot as plt
|
|
678
|
+
import seaborn as sns
|
|
679
|
+
|
|
568
680
|
class LinearRegressionResult:
|
|
569
681
|
"""Clase para resultados de regresión lineal"""
|
|
570
|
-
|
|
682
|
+
|
|
571
683
|
def __init__(self, X, y, X_names, y_name, engine='statsmodels', fit_intercept=True):
|
|
572
684
|
self.X = X
|
|
573
685
|
self.y = y
|
|
@@ -579,7 +691,7 @@ class LinearRegressionResult:
|
|
|
579
691
|
self.results = None
|
|
580
692
|
self.show_plot = False
|
|
581
693
|
self.plot_backend = 'seaborn'
|
|
582
|
-
|
|
694
|
+
|
|
583
695
|
# Atributos que se llenarán después del fit
|
|
584
696
|
self.coef_ = None
|
|
585
697
|
self.intercept_ = None
|
|
@@ -594,7 +706,7 @@ class LinearRegressionResult:
|
|
|
594
706
|
self.std_errors = None
|
|
595
707
|
self.t_values = None
|
|
596
708
|
self.p_values = None
|
|
597
|
-
|
|
709
|
+
|
|
598
710
|
def fit(self):
|
|
599
711
|
"""Ajustar el modelo"""
|
|
600
712
|
if self.engine == 'statsmodels':
|
|
@@ -604,7 +716,7 @@ class LinearRegressionResult:
|
|
|
604
716
|
X = sm.add_constant(X)
|
|
605
717
|
self.model = sm.OLS(self.y, X)
|
|
606
718
|
self.results = self.model.fit()
|
|
607
|
-
|
|
719
|
+
|
|
608
720
|
# Extraer atributos
|
|
609
721
|
if self.fit_intercept:
|
|
610
722
|
self.intercept_ = self.results.params[0]
|
|
@@ -618,7 +730,7 @@ class LinearRegressionResult:
|
|
|
618
730
|
self.std_errors = self.results.bse
|
|
619
731
|
self.t_values = self.results.tvalues
|
|
620
732
|
self.p_values = self.results.pvalues
|
|
621
|
-
|
|
733
|
+
|
|
622
734
|
self.r_squared = self.results.rsquared
|
|
623
735
|
self.adj_r_squared = self.results.rsquared_adj
|
|
624
736
|
self.f_statistic = self.results.fvalue
|
|
@@ -627,24 +739,24 @@ class LinearRegressionResult:
|
|
|
627
739
|
self.bic = self.results.bic
|
|
628
740
|
self.residuals = self.results.resid
|
|
629
741
|
self.predictions = self.results.fittedvalues
|
|
630
|
-
|
|
742
|
+
|
|
631
743
|
else: # scikit-learn
|
|
632
744
|
from sklearn.linear_model import LinearRegression
|
|
633
745
|
self.model = LinearRegression(fit_intercept=self.fit_intercept)
|
|
634
746
|
self.model.fit(self.X, self.y)
|
|
635
|
-
|
|
747
|
+
|
|
636
748
|
self.coef_ = self.model.coef_
|
|
637
749
|
self.intercept_ = self.model.intercept_
|
|
638
|
-
self.r_squared = self.model.score(self.X, self.y)
|
|
639
750
|
self.predictions = self.model.predict(self.X)
|
|
640
751
|
self.residuals = self.y - self.predictions
|
|
641
|
-
|
|
642
|
-
|
|
752
|
+
self.r_squared = self.model.score(self.X, self.y)
|
|
753
|
+
|
|
754
|
+
# Calcular R^2 ajustado
|
|
643
755
|
n, k = self.X.shape
|
|
644
756
|
self.adj_r_squared = 1 - (1 - self.r_squared) * (n - 1) / (n - k - 1)
|
|
645
|
-
|
|
757
|
+
|
|
646
758
|
return self
|
|
647
|
-
|
|
759
|
+
|
|
648
760
|
def predict(self, X_new):
|
|
649
761
|
"""Hacer predicciones con nuevos datos"""
|
|
650
762
|
if self.engine == 'statsmodels':
|
|
@@ -654,16 +766,12 @@ class LinearRegressionResult:
|
|
|
654
766
|
return self.results.predict(X_new)
|
|
655
767
|
else:
|
|
656
768
|
return self.model.predict(X_new)
|
|
657
|
-
|
|
769
|
+
|
|
658
770
|
def summary(self):
|
|
659
771
|
"""Mostrar resumen estilo OLS"""
|
|
660
772
|
return self.__repr__()
|
|
661
|
-
|
|
773
|
+
|
|
662
774
|
def __repr__(self):
|
|
663
|
-
return self._format_output()
|
|
664
|
-
|
|
665
|
-
def _format_output(self):
|
|
666
|
-
"""Formato estilo OLS de statsmodels"""
|
|
667
775
|
output = []
|
|
668
776
|
output.append("=" * 100)
|
|
669
777
|
output.append("RESULTADOS DE REGRESIÓN LINEAL".center(100))
|
|
@@ -673,7 +781,7 @@ class LinearRegressionResult:
|
|
|
673
781
|
output.append(f"Motor: {self.engine}")
|
|
674
782
|
output.append(f"Fecha: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
|
675
783
|
output.append("-" * 100)
|
|
676
|
-
|
|
784
|
+
|
|
677
785
|
# Información del modelo
|
|
678
786
|
output.append("\nINFORMACIÓN DEL MODELO:")
|
|
679
787
|
output.append("-" * 100)
|
|
@@ -681,24 +789,22 @@ class LinearRegressionResult:
|
|
|
681
789
|
output.append("-" * 100)
|
|
682
790
|
output.append(f"{'R-cuadrado':<50} {self.r_squared:>20.6f}")
|
|
683
791
|
output.append(f"{'R-cuadrado Ajustado':<50} {self.adj_r_squared:>20.6f}")
|
|
684
|
-
|
|
792
|
+
|
|
685
793
|
if self.f_statistic is not None:
|
|
686
794
|
output.append(f"{'Estadístico F':<50} {self.f_statistic:>20.6f}")
|
|
687
795
|
output.append(f"{'Prob (F-estadístico)':<50} {self.f_pvalue:>20.6e}")
|
|
688
|
-
|
|
796
|
+
|
|
689
797
|
if self.aic is not None:
|
|
690
798
|
output.append(f"{'AIC':<50} {self.aic:>20.6f}")
|
|
691
799
|
output.append(f"{'BIC':<50} {self.bic:>20.6f}")
|
|
692
|
-
|
|
800
|
+
|
|
693
801
|
# Coeficientes
|
|
694
802
|
output.append("\nCOEFICIENTES:")
|
|
695
803
|
output.append("-" * 100)
|
|
696
|
-
|
|
697
804
|
if self.std_errors is not None:
|
|
698
805
|
output.append(f"{'Variable':<20} {'Coef.':>15} {'Std Err':>15} {'t':>15} {'P>|t|':>15}")
|
|
699
806
|
output.append("-" * 100)
|
|
700
807
|
output.append(f"{'const':<20} {self.intercept_:>15.6f} {'-':>15} {'-':>15} {'-':>15}")
|
|
701
|
-
|
|
702
808
|
for i, name in enumerate(self.X_names):
|
|
703
809
|
output.append(
|
|
704
810
|
f"{name:<20} {self.coef_[i]:>15.6f} {self.std_errors[i]:>15.6f} "
|
|
@@ -708,10 +814,9 @@ class LinearRegressionResult:
|
|
|
708
814
|
output.append(f"{'Variable':<20} {'Coeficiente':>20}")
|
|
709
815
|
output.append("-" * 100)
|
|
710
816
|
output.append(f"{'const':<20} {self.intercept_:>20.6f}")
|
|
711
|
-
|
|
712
817
|
for i, name in enumerate(self.X_names):
|
|
713
818
|
output.append(f"{name:<20} {self.coef_[i]:>20.6f}")
|
|
714
|
-
|
|
819
|
+
|
|
715
820
|
# Análisis de residuos
|
|
716
821
|
output.append("\nANÁLISIS DE RESIDUOS:")
|
|
717
822
|
output.append("-" * 100)
|
|
@@ -721,10 +826,31 @@ class LinearRegressionResult:
|
|
|
721
826
|
output.append(f"{'Desv. Std. de Residuos':<50} {np.std(self.residuals):>20.6f}")
|
|
722
827
|
output.append(f"{'Mínimo Residuo':<50} {np.min(self.residuals):>20.6f}")
|
|
723
828
|
output.append(f"{'Máximo Residuo':<50} {np.max(self.residuals):>20.6f}")
|
|
724
|
-
|
|
725
829
|
output.append("=" * 100)
|
|
726
|
-
|
|
830
|
+
|
|
727
831
|
if self.show_plot:
|
|
832
|
+
self.plot()
|
|
728
833
|
output.append("\n[Gráficos diagnósticos generados]")
|
|
729
|
-
|
|
730
|
-
return "\n".join(output)
|
|
834
|
+
|
|
835
|
+
return "\n".join(output)
|
|
836
|
+
|
|
837
|
+
def plot(self):
|
|
838
|
+
"""Generar gráficos de regresión y residuales"""
|
|
839
|
+
if len(self.X_names) == 1:
|
|
840
|
+
# Scatter + línea de regresión
|
|
841
|
+
df_plot = pd.DataFrame({
|
|
842
|
+
self.X_names[0]: self.X.flatten(),
|
|
843
|
+
self.y_name: self.y,
|
|
844
|
+
'Predicciones': self.predictions
|
|
845
|
+
})
|
|
846
|
+
sns.lmplot(x=self.X_names[0], y=self.y_name, data=df_plot, ci=None)
|
|
847
|
+
plt.title(f"Regresión lineal: {self.y_name} ~ {self.X_names[0]}")
|
|
848
|
+
plt.show()
|
|
849
|
+
else:
|
|
850
|
+
# Para regresión múltiple, solo gráfico residuos vs predicciones
|
|
851
|
+
plt.scatter(self.predictions, self.residuals)
|
|
852
|
+
plt.axhline(0, color='red', linestyle='--')
|
|
853
|
+
plt.xlabel("Predicciones")
|
|
854
|
+
plt.ylabel("Residuos")
|
|
855
|
+
plt.title("Residuos vs Predicciones")
|
|
856
|
+
plt.show()
|