statslibx 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {statslib → statslibx}/__init__.py +5 -4
- {statslib → statslibx}/descriptive.py +182 -31
- statslibx/inferential.py +974 -0
- statslibx/utils.py +1180 -0
- {statslibx-0.1.0.dist-info → statslibx-0.1.2.dist-info}/METADATA +34 -3
- statslibx-0.1.2.dist-info/RECORD +8 -0
- statslibx-0.1.2.dist-info/top_level.txt +1 -0
- statslib/inferential.py +0 -547
- statslib/utils.py +0 -889
- statslibx-0.1.0.dist-info/RECORD +0 -8
- statslibx-0.1.0.dist-info/top_level.txt +0 -1
- {statslibx-0.1.0.dist-info → statslibx-0.1.2.dist-info}/WHEEL +0 -0
statslibx/utils.py
ADDED
|
@@ -0,0 +1,1180 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import matplotlib.pyplot as plt
|
|
4
|
+
from typing import Union, List, Optional, Literal, Tuple
|
|
5
|
+
import warnings
|
|
6
|
+
import os
|
|
7
|
+
from scipy import stats
|
|
8
|
+
import seaborn as sns
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class UtilsStats:
|
|
12
|
+
"""
|
|
13
|
+
Clase utilitaria para operaciones estadísticas comunes y visualización
|
|
14
|
+
|
|
15
|
+
Esta clase proporciona métodos para validación de datos, análisis estadísticos
|
|
16
|
+
básicos y visualización de resultados.
|
|
17
|
+
|
|
18
|
+
Examples:
|
|
19
|
+
---------
|
|
20
|
+
>>> utils = UtilsStats()
|
|
21
|
+
>>> data = np.random.normal(0, 1, 100)
|
|
22
|
+
>>> utils.check_normality(data)
|
|
23
|
+
>>> utils.plot_distribution(data)
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
def __init__(self):
    """Create the helper with its default plotting and figure-saving settings."""
    # Plot rendering defaults.
    self._plot_backend, self._default_figsize = 'seaborn', (12, 5)

    # Figure persistence defaults (saving is off until explicitly enabled).
    self._save_fig = False
    self._fig_format, self._fig_dpi = 'png', 300
    self._figures_dir = 'figures'

    # Reset matplotlib to its stock style, then layer this class's rcParams on top.
    plt.style.use('default')
    self._setup_plotting_style()
|
|
38
|
+
|
|
39
|
+
def _setup_plotting_style(self):
    """Write this instance's default style into matplotlib's global rcParams."""
    style_settings = {
        'figure.figsize': [self._default_figsize[0], self._default_figsize[1]],
        'figure.dpi': self._fig_dpi,
        'savefig.dpi': self._fig_dpi,
        'font.size': 10,
        'axes.grid': True,
        'grid.alpha': 0.3,
        'lines.linewidth': 2,
    }
    # Assign key by key (in the listed order) so every value goes through
    # the normal rcParams item assignment.
    for param_name, param_value in style_settings.items():
        plt.rcParams[param_name] = param_value
|
|
48
|
+
|
|
49
|
+
def set_plot_backend(self, backend: Literal['matplotlib', 'seaborn', 'plotly']):
    """Store ``backend`` as the default plotting backend of this instance.

    The value is saved as given; no validation is performed here.
    """
    self._plot_backend = backend
|
|
54
|
+
|
|
55
|
+
def set_default_figsize(self, figsize: Tuple[int, int]):
    """Set the default figure size for this instance and for matplotlib.

    The size is stored on the instance and its first two entries are
    written to the global ``figure.figsize`` rcParam.
    """
    width, height = figsize[0], figsize[1]
    self._default_figsize = figsize
    plt.rcParams['figure.figsize'] = [width, height]
|
|
61
|
+
|
|
62
|
+
def set_save_fig_options(self, save_fig: Optional[bool] = False,
                         fig_format: str = 'png',
                         fig_dpi: int = 300,
                         figures_dir: str = 'figures'):
    """Configure whether and how figures are written to disk.

    All four values are stored on the instance as given; nothing is
    created on disk by this call.
    """
    self._save_fig, self._figures_dir = save_fig, figures_dir
    self._fig_format, self._fig_dpi = fig_format, fig_dpi
|
|
73
|
+
|
|
74
|
+
def _save_figure(self, fig, filename: str, **kwargs):
|
|
75
|
+
"""
|
|
76
|
+
Guardar figura si save_fig está activado
|
|
77
|
+
"""
|
|
78
|
+
if self._save_fig:
|
|
79
|
+
try:
|
|
80
|
+
os.makedirs(self._figures_dir, exist_ok=True)
|
|
81
|
+
filepath = os.path.join(self._figures_dir, f"{filename}.{self._fig_format}")
|
|
82
|
+
|
|
83
|
+
fig.savefig(
|
|
84
|
+
filepath,
|
|
85
|
+
format=self._fig_format,
|
|
86
|
+
dpi=self._fig_dpi,
|
|
87
|
+
bbox_inches='tight',
|
|
88
|
+
facecolor='white',
|
|
89
|
+
**kwargs
|
|
90
|
+
)
|
|
91
|
+
print(f"✓ Figura guardada: {filepath}")
|
|
92
|
+
|
|
93
|
+
except Exception as e:
|
|
94
|
+
print(f"✗ Error guardando figura: {e}")
|
|
95
|
+
|
|
96
|
+
# ============= MÉTODOS DE ANÁLISIS ESTADÍSTICO =============
|
|
97
|
+
|
|
98
|
+
def validate_dataframe(self, data: Union[pd.DataFrame, np.ndarray, list]) -> pd.DataFrame:
    """Return ``data`` as a DataFrame.

    DataFrames are returned unchanged, lists are handed to the DataFrame
    constructor, 1-D arrays become a single column named ``var`` and 2-D
    arrays get columns ``var_0``, ``var_1``, ...

    Raises:
        ValueError: for arrays with more than two dimensions (or zero).
        TypeError: for any other input type.
    """
    if isinstance(data, pd.DataFrame):
        return data
    if isinstance(data, list):
        return pd.DataFrame(data)
    if not isinstance(data, np.ndarray):
        raise TypeError(f"Tipo de dato no soportado: {type(data)}")

    # From here on we know we hold a numpy array.
    if data.ndim == 1:
        return pd.DataFrame({'var': data})
    if data.ndim == 2:
        column_names = [f'var_{i}' for i in range(data.shape[1])]
        return pd.DataFrame(data, columns=column_names)
    raise ValueError("Solo se soportan arrays 1D y 2D")
|
|
113
|
+
|
|
114
|
+
def format_number(self, num: float, decimals: int = 6, scientific: bool = False) -> str:
    """Format ``num`` with ``decimals`` digits after the decimal point.

    Exponent notation is used only when ``scientific`` is true and the
    magnitude of ``num`` is below 0.001; otherwise fixed-point is used.
    """
    use_exponent = scientific and abs(num) < 0.001
    presentation = 'e' if use_exponent else 'f'
    return format(num, f".{decimals}{presentation}")
|
|
119
|
+
|
|
120
|
+
def check_normality(self, data: Union[pd.Series, np.ndarray], alpha: float = 0.05) -> dict:
    """Test whether ``data`` looks normally distributed (Shapiro-Wilk).

    Missing values are removed before testing. Non-Series input is coerced
    to a float array first, so plain lists (including ones holding ``None``)
    and integer arrays are accepted.

    Parameters:
    -----------
    data : Series or array-like
        Sample to test.
    alpha : float
        Significance level; the sample is reported as normal when the
        Shapiro-Wilk p-value is greater than ``alpha``.

    Returns:
    --------
    dict
        Keys ``is_normal`` (bool), ``shapiro_statistic``, ``shapiro_pvalue``,
        ``alpha`` and ``interpretation`` ('Normal' / 'No Normal').
    """
    if isinstance(data, pd.Series):
        values = data.dropna().to_numpy(dtype=float)
    else:
        # dtype=float turns None into NaN and makes np.isnan safe for
        # integer/object input.
        values = np.asarray(data, dtype=float)
        values = values[~np.isnan(values)]

    shapiro_stat, shapiro_p = stats.shapiro(values)
    # Plain bool instead of np.bool_ so the result serialises and compares cleanly.
    is_normal = bool(shapiro_p > alpha)

    return {
        'is_normal': is_normal,
        'shapiro_statistic': shapiro_stat,
        'shapiro_pvalue': shapiro_p,
        'alpha': alpha,
        'interpretation': 'Normal' if is_normal else 'No Normal'
    }
|
|
137
|
+
|
|
138
|
+
def calculate_confidence_intervals(self, data: Union[pd.Series, np.ndarray],
                                   confidence_level: float = 0.95,
                                   method: str = 'parametric') -> dict:
    """Compute a confidence interval for the mean of ``data``.

    Missing values are dropped first. Non-Series input is coerced to a
    float array, so plain lists are accepted as well as ndarrays.

    Parameters:
    -----------
    data : Series or array-like
        Sample values.
    confidence_level : float
        Nominal coverage, e.g. 0.95.
    method : str
        'parametric' (Student t interval) or 'bootstrap' (percentile
        interval from 1000 resamples drawn with ``np.random.choice``).

    Returns:
    --------
    dict
        Keys ``mean``, ``std`` (sample std, ddof=1), ``n``,
        ``confidence_level``, ``ci_lower``, ``ci_upper``, ``margin_error``
        and ``method``.

    Raises:
    -------
    ValueError
        If ``method`` is neither 'parametric' nor 'bootstrap'.
    """
    if isinstance(data, pd.Series):
        data_clean = data.dropna().values
    else:
        # np.asarray makes boolean-mask indexing work for lists too.
        data_clean = np.asarray(data, dtype=float)
        data_clean = data_clean[~np.isnan(data_clean)]

    n = len(data_clean)
    mean = np.mean(data_clean)
    std = np.std(data_clean, ddof=1)

    if method == 'parametric':
        se = std / np.sqrt(n)
        # Critical value of Student's t with n - 1 degrees of freedom.
        t_critical = stats.t.ppf((1 + confidence_level) / 2, n - 1)
        margin_error = t_critical * se

        ci_lower = mean - margin_error
        ci_upper = mean + margin_error

    elif method == 'bootstrap':
        n_bootstraps = 1000
        # One resample per iteration keeps the random stream identical to
        # drawing the samples one call at a time.
        bootstrap_means = [
            np.mean(np.random.choice(data_clean, size=n, replace=True))
            for _ in range(n_bootstraps)
        ]

        alpha = 1 - confidence_level
        ci_lower = np.percentile(bootstrap_means, (alpha / 2) * 100)
        ci_upper = np.percentile(bootstrap_means, (1 - alpha / 2) * 100)
        # Half-width of the percentile interval (need not be symmetric about the mean).
        margin_error = (ci_upper - ci_lower) / 2

    else:
        raise ValueError("Método debe ser 'parametric' o 'bootstrap'")

    return {
        'mean': mean,
        'std': std,
        'n': n,
        'confidence_level': confidence_level,
        'ci_lower': ci_lower,
        'ci_upper': ci_upper,
        'margin_error': margin_error,
        'method': method
    }
|
|
187
|
+
|
|
188
|
+
def detect_outliers(self, data: Union[pd.Series, np.ndarray],
                    method: Literal['iqr', 'zscore', 'isolation_forest'] = 'iqr',
                    **kwargs) -> np.ndarray:
    """Flag outliers in ``data`` with the chosen method.

    Parameters:
    -----------
    data : array-like
        Values to analyse. NaN entries are removed before detection.
    method : str
        'iqr' (1.5 * IQR fences), 'zscore' (``threshold`` keyword,
        default 3) or 'isolation_forest' (``contamination`` keyword,
        default 0.1; requires scikit-learn).

    Returns:
    --------
    np.ndarray
        Boolean mask marking outliers. The mask is aligned with the
        NaN-filtered values, so it is shorter than ``data`` when ``data``
        contains NaN.

    Raises:
    -------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if isinstance(data, pd.Series):
        data = data.values

    # Coerce so that lists and integer arrays support NaN filtering.
    data_clean = np.asarray(data, dtype=float)
    data_clean = data_clean[~np.isnan(data_clean)]

    if method == 'iqr':
        q1, q3 = np.percentile(data_clean, [25, 75])
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        outliers = (data_clean < lower_bound) | (data_clean > upper_bound)

    elif method == 'zscore':
        threshold = kwargs.get('threshold', 3)
        spread = np.std(data_clean)
        if data_clean.size == 0 or spread == 0:
            # Constant (or empty) data has no z-scores; nothing can be an
            # outlier, and we avoid a divide-by-zero warning.
            outliers = np.zeros(data_clean.shape, dtype=bool)
        else:
            z_scores = np.abs((data_clean - np.mean(data_clean)) / spread)
            outliers = z_scores > threshold

    elif method == 'isolation_forest':
        # Imported lazily: scikit-learn is only needed for this method.
        from sklearn.ensemble import IsolationForest
        contamination = kwargs.get('contamination', 0.1)
        X = data_clean.reshape(-1, 1)
        clf = IsolationForest(contamination=contamination, random_state=42)
        outliers = clf.fit_predict(X) == -1

    else:
        raise ValueError("Método debe ser 'iqr', 'zscore', o 'isolation_forest'")

    return outliers
|
|
235
|
+
|
|
236
|
+
def calculate_effect_size(self, group1: np.ndarray, group2: np.ndarray,
                          method: Literal['cohen', 'hedges'] = 'cohen') -> dict:
    """Compute a standardized mean difference between two groups.

    Cohen's d is the difference of means divided by the pooled sample
    standard deviation. With ``method='hedges'`` it is multiplied by the
    small-sample factor ``1 - 3 / (4 * (n1 + n2) - 9)``.

    Returns a dict with ``effect_size``, ``method``, ``interpretation``
    (label chosen from the absolute effect size with cut-offs 0.2 / 0.5 /
    0.8), ``mean_diff`` and ``pooled_std``.
    """
    mean1, mean2 = np.mean(group1), np.mean(group2)
    std1, std2 = np.std(group1, ddof=1), np.std(group2, ddof=1)
    n1, n2 = len(group1), len(group2)

    pooled_std = np.sqrt(((n1 - 1) * std1**2 + (n2 - 1) * std2**2) / (n1 + n2 - 2))
    cohens_d = (mean1 - mean2) / pooled_std

    effect_size = cohens_d
    if method == 'hedges':
        correction = 1 - (3 / (4 * (n1 + n2) - 9))
        effect_size = cohens_d * correction

    # Walk the cut-offs from smallest to largest; the first one the
    # magnitude falls under decides the label.
    magnitude = abs(effect_size)
    labelled_cutoffs = ((0.2, "Muy pequeño"), (0.5, "Pequeño"), (0.8, "Mediano"))
    for cutoff, label in labelled_cutoffs:
        if magnitude < cutoff:
            interpretation = label
            break
    else:
        interpretation = "Grande"

    return {
        'effect_size': effect_size,
        'method': method,
        'interpretation': interpretation,
        'mean_diff': mean1 - mean2,
        'pooled_std': pooled_std
    }
|
|
271
|
+
|
|
272
|
+
# ============= MÉTODOS DE VISUALIZACIÓN COMPLETOS =============
|
|
273
|
+
|
|
274
|
+
def _plot_distribution_seaborn(self, data, plot_type, bins, figsize, title, **kwargs):
    """Draw a distribution plot of ``data`` with seaborn and return the figure.

    ``plot_type`` 'hist', 'kde', 'box' or 'violin' draws a single plot on a
    figure of size ``figsize`` (extra ``kwargs`` go to the seaborn call).
    'all' draws a fixed 15x12 grid with histogram+KDE, box plot, violin
    plot and a normal Q-Q plot; ``figsize`` and ``kwargs`` are not used
    there. Any other ``plot_type`` yields an empty titled axes.
    """
    if plot_type != 'all':
        # Single-panel figure.
        fig, ax = plt.subplots(figsize=figsize)

        if plot_type == 'hist':
            sns.histplot(data, bins=bins, kde=True, ax=ax, **kwargs)
        elif plot_type == 'kde':
            sns.kdeplot(data, ax=ax, **kwargs)
        elif plot_type == 'box':
            sns.boxplot(y=data, ax=ax, **kwargs)
        elif plot_type == 'violin':
            sns.violinplot(y=data, ax=ax, **kwargs)

        ax.set_title(title)
        plt.tight_layout()
        return fig

    # Four-panel overview.
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    hist_ax, box_ax = axes[0, 0], axes[0, 1]
    violin_ax, qq_ax = axes[1, 0], axes[1, 1]

    sns.histplot(data, bins=bins, kde=True, ax=hist_ax)
    hist_ax.set_title('Histograma con KDE')

    sns.boxplot(y=data, ax=box_ax)
    box_ax.set_title('Box Plot')

    sns.violinplot(y=data, ax=violin_ax)
    violin_ax.set_title('Violin Plot')

    # Quantiles of the sample against a normal distribution.
    stats.probplot(data, dist="norm", plot=qq_ax)
    qq_ax.set_title('Q-Q Plot')

    fig.suptitle(title, fontsize=16, y=1.00)
    plt.tight_layout()
    return fig
|
|
314
|
+
|
|
315
|
+
def _plot_distribution_matplotlib(self, data, plot_type, bins, figsize, title, **kwargs):
    """Draw a distribution plot of ``data`` with plain matplotlib.

    Parameters:
    -----------
    data : Series or ndarray
        Values to plot (must provide ``.min()`` / ``.max()``).
    plot_type : str
        'hist', 'kde', 'box', 'violin', or 'all' (fixed 15x12 grid with
        histogram, box plot, KDE and normal Q-Q plot).
    bins : int
        Number of histogram bins.
    figsize : tuple
        Figure size for the single-plot types.
    title : str
        Axes title (or figure title for 'all').
    **kwargs
        Forwarded to the 'hist', 'kde' (line) and 'violin' drawing calls.

    Returns:
    --------
    matplotlib Figure

    Raises:
    -------
    ValueError
        If ``plot_type`` is not supported (checked before any figure is
        created, so no empty figure is left behind).
    """
    supported_types = ('hist', 'kde', 'box', 'violin', 'all')
    if plot_type not in supported_types:
        raise ValueError(f"Tipo de gráfico '{plot_type}' no soportado")

    if plot_type == 'all':
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))

        # Histogram (normalised to a density).
        axes[0, 0].hist(data, bins=bins, alpha=0.7, edgecolor='black', density=True)
        axes[0, 0].set_title('Histograma')
        axes[0, 0].set_ylabel('Densidad')

        # Box plot.
        axes[0, 1].boxplot(data)
        axes[0, 1].set_title('Box Plot')

        # KDE: evaluate the estimate once and reuse it for line and fill.
        kde = stats.gaussian_kde(data)
        x_range = np.linspace(data.min(), data.max(), 100)
        density = kde(x_range)
        axes[1, 0].plot(x_range, density)
        axes[1, 0].fill_between(x_range, density, alpha=0.3)
        axes[1, 0].set_title('KDE')
        axes[1, 0].set_ylabel('Densidad')

        # Q-Q plot against the normal distribution.
        stats.probplot(data, dist="norm", plot=axes[1, 1])
        axes[1, 1].set_title('Q-Q Plot')

        fig.suptitle(title, fontsize=16)
        plt.tight_layout()

    else:
        fig, ax = plt.subplots(figsize=figsize)

        if plot_type == 'hist':
            ax.hist(data, bins=bins, edgecolor='black', alpha=0.7, **kwargs)
            ax.set_ylabel('Frecuencia')
        elif plot_type == 'box':
            ax.boxplot(data, vert=True)
        elif plot_type == 'kde':
            kde = stats.gaussian_kde(data)
            x_range = np.linspace(data.min(), data.max(), 100)
            density = kde(x_range)
            ax.plot(x_range, density, **kwargs)
            ax.fill_between(x_range, density, alpha=0.3)
            ax.set_ylabel('Densidad')
        elif plot_type == 'violin':
            # Previously unhandled: 'violin' produced an empty axes.
            ax.violinplot(data, **kwargs)

        ax.set_title(title)
        ax.grid(True, alpha=0.3)
        plt.tight_layout()

    return fig
|
|
366
|
+
|
|
367
|
+
def plot_distribution(self, data: Union[pd.DataFrame, pd.Series, np.ndarray],
                      column: Optional[str] = None,
                      plot_type: Literal['hist', 'kde', 'box', 'violin', 'all'] = 'hist',
                      backend: Optional[Literal['matplotlib', 'seaborn', 'plotly']] = None,
                      bins: int = 30,
                      figsize: Optional[Tuple[int, int]] = None,
                      save_fig: Optional[bool] = None,
                      filename: Optional[str] = None,
                      **kwargs):
    """
    Plot the distribution of one variable.

    Parameters:
    -----------
    data : DataFrame, Series or ndarray
        Data to plot; missing values are dropped.
    column : str, optional
        Column to plot (required when ``data`` is a DataFrame).
    plot_type : str
        'hist', 'kde', 'box', 'violin' or 'all'.
    backend : str, optional
        'matplotlib', 'seaborn' or 'plotly'. When omitted, the instance
        default set through ``set_plot_backend`` is used ('seaborn'
        unless changed).
    bins : int
        Number of histogram bins.
    figsize : tuple, optional
        Figure size; defaults to the instance default.
    save_fig : bool, optional
        Whether to save the figure; defaults to the instance setting.
        Plotly figures are never saved by this method.
    filename : str, optional
        File name without extension; a name derived from the variable is
        used when omitted.
    **kwargs
        Forwarded to the backend-specific drawing helper.

    Returns:
    --------
    The figure object created by the selected backend.

    Raises:
    -------
    ValueError
        If ``data`` is a DataFrame and ``column`` is missing, or if the
        backend is not supported.
    """
    # The default is None (not a concrete name) so that the configured
    # instance backend is actually honoured.
    backend = backend or self._plot_backend
    figsize = figsize or self._default_figsize
    save_fig = save_fig if save_fig is not None else self._save_fig

    # Extract the values plus a title and a default file name.
    if isinstance(data, pd.DataFrame):
        if column is None:
            raise ValueError("Debe especificar 'column' cuando data es DataFrame")
        plot_data = data[column].dropna()
        title = f"Distribución de {column}"
        default_filename = f"distribucion_{column}"
    elif isinstance(data, pd.Series):
        plot_data = data.dropna()
        title = f"Distribución de {data.name if data.name else 'Variable'}"
        default_filename = f"distribucion_{data.name if data.name else 'variable'}"
    else:
        plot_data = pd.Series(data).dropna()
        title = "Distribución"
        default_filename = "distribucion"

    filename = filename or default_filename

    try:
        if backend == 'seaborn':
            fig = self._plot_distribution_seaborn(plot_data, plot_type, bins, figsize, title, **kwargs)
        elif backend == 'matplotlib':
            fig = self._plot_distribution_matplotlib(plot_data, plot_type, bins, figsize, title, **kwargs)
        elif backend == 'plotly':
            fig = self._plot_distribution_plotly(plot_data, plot_type, bins, title, **kwargs)
        else:
            raise ValueError(f"Backend '{backend}' no soportado")

        # Only matplotlib-based figures go through _save_figure.
        if save_fig and backend != 'plotly':
            self._save_figure(fig, filename)

        return fig

    except Exception as e:
        # Report for interactive users, then let the caller handle it.
        print(f"Error en plot_distribution: {e}")
        raise
|
|
439
|
+
|
|
440
|
+
def _plot_distribution_plotly(self, data, plot_type, bins, title, **kwargs):
    """Draw a distribution plot of ``data`` with plotly and return the figure.

    ``plot_type`` 'hist', 'box' or 'violin' draws a single plotly-express
    figure; any other value except 'all' falls back to a histogram. 'all'
    builds a 2x2 grid with histogram, box plot, violin plot and an
    empirical cumulative distribution derived from the histogram bins.
    ``kwargs`` are accepted for signature compatibility and not used.

    Raises:
    -------
    ImportError
        If plotly is not installed.
    """
    try:
        import plotly.graph_objects as go
        import plotly.express as px
        from plotly.subplots import make_subplots
    except ImportError as err:
        raise ImportError("Plotly no está instalado. Instale con: pip install plotly") from err

    if plot_type == 'all':
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=('Histograma', 'Box Plot', 'Violin Plot', 'Distribución Acumulada')
        )

        # Histogram.
        fig.add_trace(go.Histogram(x=data, nbinsx=bins, name='Histograma'), row=1, col=1)

        # Box plot.
        fig.add_trace(go.Box(y=data, name='Box Plot'), row=1, col=2)

        # Violin plot.
        fig.add_trace(go.Violin(y=data, name='Violin Plot'), row=2, col=1)

        # Cumulative distribution: density * bin width is the probability
        # mass of each bin, accumulated and plotted at the right bin edges.
        hist, bin_edges = np.histogram(data, bins=bins, density=True)
        cdf = np.cumsum(hist * np.diff(bin_edges))
        fig.add_trace(go.Scatter(x=bin_edges[1:], y=cdf, name='CDF'), row=2, col=2)

        # The overall title was previously dropped for the grid figure.
        fig.update_layout(title_text=title)

    elif plot_type == 'box':
        fig = px.box(y=data, title=title)
    elif plot_type == 'violin':
        fig = px.violin(y=data, title=title, box=True)
    else:
        # 'hist' and every unrecognised type render as a histogram.
        fig = px.histogram(data, nbins=bins, title=title)

    return fig
|
|
480
|
+
|
|
481
|
+
def plot_correlation_matrix(self, data: pd.DataFrame,
                            method: str = 'pearson',
                            backend: Optional[Literal['seaborn', 'plotly']] = None,
                            figsize: Optional[Tuple[int, int]] = None,
                            save_fig: Optional[bool] = None,
                            filename: Optional[str] = None,
                            **kwargs):
    """
    Plot the correlation matrix of ``data`` as a heatmap.

    Parameters:
    -----------
    data : DataFrame
        Data used to compute the correlations (``data.corr``).
    method : str
        'pearson', 'spearman' or 'kendall'.
    backend : str, optional
        'seaborn' or 'plotly'; defaults to the instance backend.
    figsize : tuple, optional
        Figure size; defaults to the instance default. For plotly each
        unit is converted to 100 pixels.
    save_fig : bool, optional
        Whether to save the figure; defaults to the instance setting.
    filename : str, optional
        File name without extension (default "matriz_correlacion").
    **kwargs
        Forwarded to ``sns.heatmap`` or ``go.Heatmap``.

    Returns:
    --------
    The matplotlib or plotly figure.

    Raises:
    -------
    ValueError
        If the resolved backend is not 'seaborn' or 'plotly' (previously
        this surfaced as an UnboundLocalError at the return statement).
    """
    backend = backend or self._plot_backend
    if backend not in ('seaborn', 'plotly'):
        raise ValueError(f"Backend '{backend}' no soportado")

    figsize = figsize or self._default_figsize
    save_fig = save_fig if save_fig is not None else self._save_fig
    filename = filename or "matriz_correlacion"

    # Compute the correlation matrix.
    corr_matrix = data.corr(method=method)

    if backend == 'seaborn':
        fig, ax = plt.subplots(figsize=figsize)
        # Hide the upper triangle (diagonal included): it mirrors the lower one.
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

        sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f',
                    cmap='coolwarm', center=0, ax=ax,
                    square=True, linewidths=0.5, **kwargs)
        ax.set_title(f'Matriz de Correlación ({method})', fontsize=14, pad=20)
        plt.tight_layout()

    else:  # backend == 'plotly'
        import plotly.graph_objects as go

        fig = go.Figure(data=go.Heatmap(
            z=corr_matrix.values,
            x=corr_matrix.columns,
            y=corr_matrix.index,
            colorscale='RdBu',
            zmid=0,
            text=corr_matrix.values,
            texttemplate='%{text:.2f}',
            textfont={"size": 10},
            **kwargs
        ))

        fig.update_layout(
            title=f'Matriz de Correlación ({method})',
            xaxis_title='Variables',
            yaxis_title='Variables',
            width=figsize[0]*100,
            height=figsize[1]*100
        )

    # Save the figure when requested.
    if save_fig:
        if backend == 'seaborn':
            self._save_figure(fig, filename)
        else:
            # Best-effort export of the plotly figure; failures are reported, not raised.
            try:
                os.makedirs(self._figures_dir, exist_ok=True)
                filepath = os.path.join(self._figures_dir, f"{filename}.{self._fig_format}")
                fig.write_image(filepath)
                print(f"✓ Figura Plotly guardada: {filepath}")
            except Exception as e:
                print(f"✗ Error guardando figura Plotly: {e}")

    return fig
|
|
555
|
+
|
|
556
|
+
def plot_scatter_matrix(self, data: pd.DataFrame,
                        columns: Optional[List[str]] = None,
                        backend: Optional[Literal['seaborn', 'plotly', 'pandas']] = None,
                        figsize: Optional[Tuple[int, int]] = None,
                        save_fig: Optional[bool] = None,
                        filename: Optional[str] = None,
                        **kwargs):
    """
    Draw a scatter-plot matrix (pair plot) of ``data``.

    Parameters:
    -----------
    data : DataFrame
        Data to plot.
    columns : list of str, optional
        Subset of columns to include.
    backend : str, optional
        'seaborn', 'plotly' or 'pandas'; defaults to the instance backend.
    figsize : tuple, optional
        Figure size, used by the pandas backend (unless ``figsize`` is
        given in ``kwargs``); defaults to the instance default.
    save_fig : bool, optional
        Whether to save the figure; defaults to the instance setting.
    filename : str, optional
        File name without extension (default "scatter_matrix").
    **kwargs
        Forwarded to the backend plotting function.

    Returns:
    --------
    A seaborn PairGrid ('seaborn'), a plotly figure ('plotly') or a
    matplotlib Figure ('pandas').

    Raises:
    -------
    ValueError
        If the resolved backend is not supported (previously this surfaced
        as an UnboundLocalError at the return statement).
    """
    backend = backend or self._plot_backend
    if backend not in ('seaborn', 'plotly', 'pandas'):
        raise ValueError(f"Backend '{backend}' no soportado")

    figsize = figsize or self._default_figsize
    save_fig = save_fig if save_fig is not None else self._save_fig
    filename = filename or "scatter_matrix"

    if columns:
        data = data[columns]

    if backend == 'seaborn':
        fig = sns.pairplot(data, **kwargs)
        # Newer seaborn exposes the Figure as `.figure`; older releases only have `.fig`.
        grid_figure = fig.figure if hasattr(fig, 'figure') else fig.fig
        grid_figure.suptitle('Matriz de Dispersión', y=1.02)

    elif backend == 'plotly':
        import plotly.express as px
        fig = px.scatter_matrix(data, **kwargs)
        fig.update_layout(title='Matriz de Dispersión')

    else:  # backend == 'pandas'
        from pandas.plotting import scatter_matrix
        # Let pandas build its own grid of axes: handing it one pre-made
        # axes makes it clear the figure and emit a warning.
        kwargs.setdefault('figsize', figsize)
        axes_grid = scatter_matrix(data, **kwargs)
        fig = np.ravel(axes_grid)[0].get_figure()

    # Save the figure when requested.
    if save_fig:
        if backend in ['seaborn', 'pandas']:
            # A PairGrid is not a Figure; unwrap it when possible.
            inner_figure = getattr(fig, 'figure', None)
            self._save_figure(inner_figure if inner_figure is not None else fig, filename)
        else:
            # Best-effort export of the plotly figure; failures are reported, not raised.
            try:
                os.makedirs(self._figures_dir, exist_ok=True)
                filepath = os.path.join(self._figures_dir, f"{filename}.{self._fig_format}")
                fig.write_image(filepath)
                print(f"✓ Figura Plotly guardada: {filepath}")
            except Exception as e:
                print(f"✗ Error guardando figura Plotly: {e}")

    return fig
|
|
602
|
+
|
|
603
|
+
# ============= GRÁFICOS CON INTERVALOS DE CONFIANZA =============
|
|
604
|
+
|
|
605
|
+
def plot_distribution_with_ci(self,
                              data: Union[pd.DataFrame, pd.Series, np.ndarray],
                              column: Optional[str] = None,
                              confidence_level: float = 0.95,
                              ci_method: str = 'parametric',
                              bins: int = 30,
                              figsize: Optional[Tuple[int, int]] = None,
                              save_fig: Optional[bool] = None,
                              filename: Optional[str] = None,
                              **kwargs) -> plt.Figure:
    """
    Plot a distribution next to the confidence interval of its mean.

    The left panel shows a density histogram with a KDE curve and the
    mean; the right panel shows the KDE, the confidence interval as a
    shaded band, the mean, and (only when Shapiro-Wilk does not reject
    normality) the fitted normal density. A text box with summary
    statistics is placed at the bottom-left of the figure.

    Parameters:
    -----------
    data : DataFrame, Series or ndarray
        Data to plot; missing values are dropped.
    column : str, optional
        Column to plot (required when ``data`` is a DataFrame).
    confidence_level : float
        Nominal coverage of the interval.
    ci_method : str
        'parametric' or 'bootstrap' (see calculate_confidence_intervals).
    bins : int
        Number of histogram bins.
    figsize : tuple, optional
        Figure size; defaults to (14, 6).
    save_fig : bool, optional
        Whether to save the figure; defaults to the instance setting.
    filename : str, optional
        File name without extension; derived from the variable name when
        omitted.
    **kwargs
        Accepted for signature compatibility; currently unused.

    Returns:
    --------
    matplotlib Figure

    Raises:
    -------
    ValueError
        If ``data`` is a DataFrame and ``column`` is missing.
    """
    # ======= PREPARATION =======
    if isinstance(data, pd.DataFrame):
        if column is None:
            raise ValueError("Debe especificar 'column' cuando data es DataFrame")
        plot_data = data[column].dropna()
        data_name = column
    elif isinstance(data, pd.Series):
        plot_data = data.dropna()
        data_name = data.name if data.name else 'Variable'
    else:
        plot_data = pd.Series(data).dropna()
        data_name = 'Variable'

    data_array = plot_data.values
    # str(): column / Series names may be non-string (e.g. integers).
    filename = filename or f"distribucion_ci_{str(data_name).lower().replace(' ', '_')}"

    # Statistics.
    ci_result = self.calculate_confidence_intervals(data_array, confidence_level, ci_method)
    normality_result = self.check_normality(data_array)

    # KDE, evaluated once and reused by both panels.
    kde = stats.gaussian_kde(data_array)
    x_range = np.linspace(data_array.min(), data_array.max(), 300)
    kde_density = kde(x_range)

    # ======= FIGURE =======
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize or (14, 6))

    # ---- Panel 1: histogram + KDE ----
    ax1.hist(data_array, bins=bins, density=True,
             color='skyblue', edgecolor='black', alpha=0.7)

    ax1.plot(x_range, kde_density, 'r-', linewidth=2, label='KDE')

    ax1.axvline(ci_result['mean'], color='red', linestyle='--', linewidth=2,
                label=f"Media: {ci_result['mean']:.2f}")

    ax1.set_title(f"Distribución de {data_name}")
    ax1.set_xlabel("Valores")
    ax1.set_ylabel("Densidad")
    ax1.legend()
    ax1.grid(alpha=0.3)

    # ---- Panel 2: KDE + confidence interval ----
    ax2.plot(x_range, kde_density, 'r-', linewidth=2, label='KDE')

    # Confidence interval band.
    ax2.axvspan(ci_result["ci_lower"], ci_result["ci_upper"],
                color='orange', alpha=0.3,
                label=f"IC {confidence_level*100:.0f}%")

    # Mean.
    ax2.axvline(ci_result["mean"], color='red', linewidth=2)

    # Fitted normal density, drawn only when normality is not rejected.
    if normality_result["is_normal"]:
        normal_y = stats.norm.pdf(x_range, ci_result['mean'], ci_result['std'])
        ax2.plot(x_range, normal_y, 'g--', linewidth=2, alpha=0.7,
                 label="Normal Teórica")

    ax2.set_title(f"IC con método '{ci_method}'")
    ax2.set_xlabel("Valores")
    ax2.set_ylabel("Densidad")
    ax2.legend()
    ax2.grid(alpha=0.3)

    # ======= INFO BOX =======
    info = (
        f"Estadísticas de {data_name}:\n"
        f"• n = {ci_result['n']}\n"
        f"• Media = {ci_result['mean']:.3f}\n"
        f"• Desv. Est. = {ci_result['std']:.3f}\n"
        f"• IC {confidence_level*100:.0f}% = [{ci_result['ci_lower']:.3f}, {ci_result['ci_upper']:.3f}]\n"
        f"• Margen Error = ±{ci_result['margin_error']:.3f}\n"
        f"• Normalidad = {normality_result['interpretation']}\n"
        f"• p-value Shapiro = {normality_result['shapiro_pvalue']:.4f}"
    )

    fig.text(0.01, 0.01, info, fontsize=9,
             bbox=dict(facecolor='lightgray', alpha=0.6),
             va='bottom')

    plt.tight_layout()

    # Optional save.
    save_fig = save_fig if save_fig is not None else self._save_fig
    if save_fig:
        self._save_figure(fig, filename)

    return fig
|
|
711
|
+
|
|
712
|
+
|
|
713
|
+
def plot_multiple_distributions_with_ci(self,
                                        data_dict: dict,
                                        confidence_level: float = 0.95,
                                        figsize: Optional[Tuple[int, int]] = None,
                                        save_fig: Optional[bool] = None,
                                        filename: Optional[str] = None,
                                        **kwargs) -> plt.Figure:
    """
    Plot several distributions, each with the confidence interval of its mean.

    One row of two panels is drawn per entry of ``data_dict``: on the left
    a density histogram with KDE and the mean; on the right the same
    histogram and KDE with the parametric confidence interval shaded and
    its bounds marked.

    Parameters:
    -----------
    data_dict : dict
        Mapping of display name -> Series or array-like of values. NaN
        values are dropped; non-Series values are coerced to float arrays.
    confidence_level : float
        Nominal coverage of the intervals.
    figsize : tuple, optional
        Figure size; defaults to (14, 5 * number of distributions).
    save_fig : bool, optional
        Whether to save the figure; defaults to the instance setting.
    filename : str, optional
        File name without extension (default "multiples_distribuciones_ci").
    **kwargs
        Accepted for signature compatibility; currently unused.

    Returns:
    --------
    matplotlib Figure

    Raises:
    -------
    ValueError
        If ``data_dict`` is empty.
    """
    n_distributions = len(data_dict)
    if n_distributions == 0:
        # Fail with a clear message instead of matplotlib's error for a 0-row grid.
        raise ValueError("data_dict no puede estar vacío")

    # squeeze=False always yields a 2-D axes array, even for a single row.
    fig, axes = plt.subplots(n_distributions, 2,
                             figsize=figsize or (14, 5 * n_distributions),
                             squeeze=False)

    colors = plt.cm.Set3(np.linspace(0, 1, n_distributions))

    for idx, (name, data) in enumerate(data_dict.items()):
        ax1, ax2 = axes[idx]

        if isinstance(data, pd.Series):
            data_array = data.dropna().values
        else:
            # dtype=float makes the NaN filter safe for lists / integer arrays.
            data_array = np.asarray(data, dtype=float)
            data_array = data_array[~np.isnan(data_array)]

        # Statistics.
        ci_result = self.calculate_confidence_intervals(data_array, confidence_level)

        # KDE, evaluated once and reused by both panels.
        kde = stats.gaussian_kde(data_array)
        x_range = np.linspace(data_array.min(), data_array.max(), 200)
        kde_density = kde(x_range)

        # Left panel: basic distribution.
        ax1.hist(data_array, bins=30, alpha=0.7, color=colors[idx],
                 edgecolor='black', density=True)
        ax1.plot(x_range, kde_density, 'k-', linewidth=2)
        ax1.axvline(ci_result['mean'], color='red', linestyle='--', linewidth=2)

        ax1.set_title(f'{name}\nMedia: {ci_result["mean"]:.2f}')
        ax1.grid(True, alpha=0.3)

        # Right panel: with the confidence interval.
        ax2.hist(data_array, bins=30, alpha=0.7, color=colors[idx],
                 edgecolor='black', density=True)
        ax2.plot(x_range, kde_density, 'k-', linewidth=2)

        ax2.axvline(ci_result['mean'], color='red', linestyle='-', linewidth=3)
        ax2.axvspan(ci_result['ci_lower'], ci_result['ci_upper'],
                    alpha=0.3, color='orange')
        ax2.axvline(ci_result['ci_lower'], color='orange', linestyle='--', linewidth=2)
        ax2.axvline(ci_result['ci_upper'], color='orange', linestyle='--', linewidth=2)

        ax2.set_title(f'{name} con IC {confidence_level*100}%')
        ax2.grid(True, alpha=0.3)

    plt.tight_layout()

    # Save the figure when requested.
    save_fig = save_fig if save_fig is not None else self._save_fig
    if save_fig:
        filename = filename or "multiples_distribuciones_ci"
        self._save_figure(fig, filename)

    return fig
|
|
779
|
+
|
|
780
|
+
# ============= MÉTODOS UTILITARIOS ADICIONALES =============
|
|
781
|
+
|
|
782
|
+
def get_descriptive_stats(self, data: Union[pd.DataFrame, pd.Series, np.ndarray],
                          column: Optional[str] = None) -> dict:
    """
    Compute a dictionary of descriptive statistics for a single variable.

    Missing values are dropped (``Series.dropna``) before any statistic is
    computed.

    Parameters
    ----------
    data : pd.DataFrame, pd.Series or np.ndarray
        Input data. Anything that is neither a DataFrame nor a Series is
        wrapped in ``pd.Series``.
    column : str, optional
        Column to analyse. Required when ``data`` is a DataFrame; ignored
        otherwise.

    Returns
    -------
    dict
        Keys, in order: 'count', 'mean', 'median', 'mode', 'std',
        'variance', 'min', 'max', 'q1', 'q3', 'iqr', 'skewness',
        'kurtosis', 'range'. 'std' and 'variance' use ``ddof=1``. 'mode' is
        the smallest of the most frequent values. If no observations remain
        after dropping missing values, 'count' is 0 and every other entry
        is NaN.

    Raises
    ------
    ValueError
        If ``data`` is a DataFrame and ``column`` is None.
    """
    if isinstance(data, pd.DataFrame):
        if column is None:
            raise ValueError("Debe especificar 'column' cuando data es DataFrame")
        data_series = data[column]
    elif isinstance(data, pd.Series):
        data_series = data
    else:
        data_series = pd.Series(data)

    data_clean = data_series.dropna()
    n_obs = len(data_clean)

    if n_obs == 0:
        # np.min / np.max / np.percentile raise on empty input, so answer
        # explicitly instead of failing halfway through the dict literal.
        nan_keys = ('mean', 'median', 'mode', 'std', 'variance', 'min', 'max',
                    'q1', 'q3', 'iqr', 'skewness', 'kurtosis', 'range')
        return {'count': 0, **dict.fromkeys(nan_keys, np.nan)}

    # The mode is computed with np.unique rather than stats.mode(...)[0][0]:
    # the double indexing only works while SciPy returns an array-shaped
    # result and breaks once the result is a scalar. np.unique returns the
    # distinct values sorted, so argmax picks the smallest most-frequent one.
    unique_values, frequencies = np.unique(data_clean.to_numpy(), return_counts=True)
    mode_value = unique_values[np.argmax(frequencies)]

    # Quartiles are computed once and reused for the IQR.
    q1, q3 = np.percentile(data_clean, [25, 75])
    minimum = np.min(data_clean)
    maximum = np.max(data_clean)

    return {
        'count': n_obs,
        'mean': np.mean(data_clean),
        'median': np.median(data_clean),
        'mode': mode_value,
        'std': np.std(data_clean, ddof=1),
        'variance': np.var(data_clean, ddof=1),
        'min': minimum,
        'max': maximum,
        'q1': q1,
        'q3': q3,
        'iqr': q3 - q1,
        'skewness': stats.skew(data_clean),
        'kurtosis': stats.kurtosis(data_clean),
        'range': maximum - minimum
    }
|
|
814
|
+
|
|
815
|
+
def help(self):
    """
    Print the complete usage guide for the UtilsStats class.

    The guide is a fixed Spanish-language text that lists the class's
    methods (statistical analysis, distribution plots, multivariate plots,
    configuration and utilities), followed by usage examples and tips. It
    is written to standard output with ``print``; nothing is returned.
    """
    # Plain (non-f) string on purpose: the {...} placeholders inside the
    # usage examples must be printed literally, not interpolated.
    help_text = """
╔════════════════════════════════════════════════════════════════════════════╗
║ 📊 CLASE UtilsStats - AYUDA COMPLETA ║
╚════════════════════════════════════════════════════════════════════════════╝

📝 DESCRIPCIÓN:
Clase utilitaria para operaciones estadísticas comunes y visualización.
Proporciona métodos para validación de datos, análisis estadísticos
básicos y visualización de resultados.

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

📋 MÉTODOS PRINCIPALES:

┌────────────────────────────────────────────────────────────────────────────┐
│ 1. 📊 ANÁLISIS ESTADÍSTICO │
└────────────────────────────────────────────────────────────────────────────┘

• .check_normality(data, alpha=0.05)
Verifica normalidad usando test Shapiro-Wilk
Retorna: dict con estadístico, p-value e interpretación

• .calculate_confidence_intervals(data, confidence_level=0.95,
method='parametric')
Calcula intervalos de confianza para la media
Métodos: 'parametric' o 'bootstrap'

• .detect_outliers(data, method='iqr', **kwargs)
Detecta valores atípicos
Métodos: 'iqr', 'zscore', 'isolation_forest'

• .calculate_effect_size(group1, group2, method='cohen')
Calcula tamaño del efecto entre grupos
Métodos: 'cohen' (Cohen's d) o 'hedges' (Hedges' g)

• .get_descriptive_stats(data, column=None)
Estadísticas descriptivas completas en un dict

┌────────────────────────────────────────────────────────────────────────────┐
│ 2. 🎨 VISUALIZACIÓN DE DISTRIBUCIONES │
└────────────────────────────────────────────────────────────────────────────┘

• .plot_distribution(data, column=None, plot_type='hist',
backend='seaborn', bins=30, figsize=None,
save_fig=None, filename=None)

Grafica distribución de una variable

plot_type: 'hist', 'kde', 'box', 'violin', 'all'
backend: 'matplotlib', 'seaborn', 'plotly'

• .plot_distribution_with_ci(data, column=None, confidence_level=0.95,
ci_method='parametric', bins=30, figsize=None,
save_fig=None, filename=None)

Distribución con intervalos de confianza visualizados

• .plot_multiple_distributions_with_ci(data_dict, confidence_level=0.95)

Compara múltiples distribuciones con sus IC

┌────────────────────────────────────────────────────────────────────────────┐
│ 3. 🎨 VISUALIZACIÓN MULTIVARIADA │
└────────────────────────────────────────────────────────────────────────────┘

• .plot_correlation_matrix(data, method='pearson', backend='seaborn',
figsize=None, save_fig=None)

Matriz de correlación con heatmap
Métodos: 'pearson', 'spearman', 'kendall'

• .plot_scatter_matrix(data, columns=None, backend='seaborn',
figsize=None, save_fig=None)

Matriz de gráficos de dispersión (pairplot)
Backends: 'seaborn', 'plotly', 'pandas'

┌────────────────────────────────────────────────────────────────────────────┐
│ 4. ⚙️ CONFIGURACIÓN │
└────────────────────────────────────────────────────────────────────────────┘

• .set_plot_backend(backend)
Establece backend por defecto: 'matplotlib', 'seaborn', 'plotly'

• .set_default_figsize(figsize)
Establece tamaño de figura por defecto: (ancho, alto)

• .set_save_fig_options(save_fig=False, fig_format='png',
fig_dpi=300, figures_dir='figures')

Configura guardado automático de figuras

┌────────────────────────────────────────────────────────────────────────────┐
│ 5. 🛠️ UTILIDADES │
└────────────────────────────────────────────────────────────────────────────┘

• .validate_dataframe(data)
Valida y convierte datos a DataFrame

• .format_number(num, decimals=6, scientific=False)
Formatea números con precisión específica

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

💡 EJEMPLOS DE USO:

┌─ Ejemplo 1: Configuración Inicial ──────────────────────────────────────┐
│ from statslibx.utils import UtilsStats │
│ import pandas as pd │
│ import numpy as np │
│ │
│ # Inicializar │
│ utils = UtilsStats() │
│ │
│ # Configurar visualización │
│ utils.set_plot_backend('seaborn') │
│ utils.set_default_figsize((12, 6)) │
│ │
│ # Configurar guardado automático │
│ utils.set_save_fig_options( │
│ save_fig=True, │
│ fig_format='png', │
│ fig_dpi=300, │
│ figures_dir='mis_graficos' │
│ ) │
└──────────────────────────────────────────────────────────────────────────┘

┌─ Ejemplo 2: Análisis de Normalidad ─────────────────────────────────────┐
│ # Generar datos │
│ datos_normales = np.random.normal(0, 1, 1000) │
│ datos_no_normales = np.random.exponential(2, 1000) │
│ │
│ # Test de normalidad │
│ resultado1 = utils.check_normality(datos_normales) │
│ print(f"Normales: {resultado1['interpretation']}") │
│ print(f"p-value: {resultado1['shapiro_pvalue']:.4f}") │
│ │
│ resultado2 = utils.check_normality(datos_no_normales) │
│ print(f"No normales: {resultado2['interpretation']}") │
└──────────────────────────────────────────────────────────────────────────┘

┌─ Ejemplo 3: Intervalos de Confianza ────────────────────────────────────┐
│ # Método paramétrico │
│ ci_param = utils.calculate_confidence_intervals( │
│ datos_normales, │
│ confidence_level=0.95, │
│ method='parametric' │
│ ) │
│ │
│ print(f"Media: {ci_param['mean']:.3f}") │
│ print(f"IC 95%: [{ci_param['ci_lower']:.3f}, " │
│ f"{ci_param['ci_upper']:.3f}]") │
│ │
│ # Método bootstrap (para datos no normales) │
│ ci_boot = utils.calculate_confidence_intervals( │
│ datos_no_normales, │
│ confidence_level=0.95, │
│ method='bootstrap' │
│ ) │
└──────────────────────────────────────────────────────────────────────────┘

┌─ Ejemplo 4: Detección de Outliers ──────────────────────────────────────┐
│ # Método IQR (rango intercuartílico) │
│ datos = np.random.normal(100, 15, 1000) │
│ datos = np.append(datos, [200, 210, -50]) # Agregar outliers │
│ │
│ outliers_iqr = utils.detect_outliers(datos, method='iqr') │
│ print(f"Outliers IQR: {outliers_iqr.sum()}") │
│ │
│ # Método Z-score │
│ outliers_z = utils.detect_outliers( │
│ datos, │
│ method='zscore', │
│ threshold=3 │
│ ) │
│ print(f"Outliers Z-score: {outliers_z.sum()}") │
│ │
│ # Isolation Forest (machine learning) │
│ outliers_if = utils.detect_outliers( │
│ datos, │
│ method='isolation_forest', │
│ contamination=0.05 │
│ ) │
└──────────────────────────────────────────────────────────────────────────┘

┌─ Ejemplo 5: Tamaño del Efecto ──────────────────────────────────────────┐
│ # Comparar dos grupos │
│ grupo_control = np.random.normal(100, 15, 100) │
│ grupo_tratamiento = np.random.normal(110, 15, 100) │
│ │
│ efecto = utils.calculate_effect_size( │
│ grupo_control, │
│ grupo_tratamiento, │
│ method='cohen' │
│ ) │
│ │
│ print(f"Cohen's d: {efecto['effect_size']:.3f}") │
│ print(f"Interpretación: {efecto['interpretation']}") │
│ print(f"Diferencia de medias: {efecto['mean_diff']:.2f}") │
└──────────────────────────────────────────────────────────────────────────┘

┌─ Ejemplo 6: Gráficos de Distribución ───────────────────────────────────┐
│ df = pd.DataFrame({ │
│ 'edad': np.random.normal(35, 10, 500), │
│ 'salario': np.random.lognormal(10.5, 0.5, 500) │
│ }) │
│ │
│ # Histograma simple │
│ fig1 = utils.plot_distribution( │
│ df, │
│ column='edad', │
│ plot_type='hist', │
│ bins=30 │
│ ) │
│ │
│ # Panel completo (histograma, box, violin, Q-Q) │
│ fig2 = utils.plot_distribution( │
│ df, │
│ column='salario', │
│ plot_type='all', │
│ backend='seaborn' │
│ ) │
│ │
│ # Con Plotly (interactivo) │
│ fig3 = utils.plot_distribution( │
│ df, │
│ column='edad', │
│ plot_type='violin', │
│ backend='plotly' │
│ ) │
└──────────────────────────────────────────────────────────────────────────┘

┌─ Ejemplo 7: Distribución con Intervalos de Confianza ───────────────────┐
│ # Visualizar distribución con IC │
│ fig = utils.plot_distribution_with_ci( │
│ df, │
│ column='edad', │
│ confidence_level=0.95, │
│ ci_method='parametric', │
│ bins=30, │
│ save_fig=True, │
│ filename='edad_con_ic' │
│ ) │
│ │
│ # Comparar múltiples distribuciones │
│ data_dict = { │
│ 'Grupo A': df['edad'][:200], │
│ 'Grupo B': df['edad'][200:400], │
│ 'Grupo C': df['edad'][400:] │
│ } │
│ │
│ fig = utils.plot_multiple_distributions_with_ci( │
│ data_dict, │
│ confidence_level=0.95 │
│ ) │
└──────────────────────────────────────────────────────────────────────────┘

┌─ Ejemplo 8: Matriz de Correlación ──────────────────────────────────────┐
│ # Crear datos correlacionados │
│ df = pd.DataFrame({ │
│ 'A': np.random.normal(0, 1, 100), │
│ 'B': np.random.normal(0, 1, 100), │
│ 'C': np.random.normal(0, 1, 100) │
│ }) │
│ df['D'] = df['A'] * 0.8 + np.random.normal(0, 0.2, 100) │
│ │
│ # Matriz de correlación con seaborn │
│ fig = utils.plot_correlation_matrix( │
│ df, │
│ method='pearson', │
│ backend='seaborn', │
│ figsize=(10, 8) │
│ ) │
│ │
│ # Con Plotly (interactiva) │
│ fig = utils.plot_correlation_matrix( │
│ df, │
│ method='spearman', │
│ backend='plotly' │
│ ) │
└──────────────────────────────────────────────────────────────────────────┘

┌─ Ejemplo 9: Matriz de Dispersión ───────────────────────────────────────┐
│ # Pairplot completo │
│ fig = utils.plot_scatter_matrix( │
│ df, │
│ columns=['A', 'B', 'C', 'D'], │
│ backend='seaborn' │
│ ) │
│ │
│ # Con Plotly │
│ fig = utils.plot_scatter_matrix( │
│ df, │
│ backend='plotly' │
│ ) │
└──────────────────────────────────────────────────────────────────────────┘

┌─ Ejemplo 10: Estadísticas Descriptivas Completas ───────────────────────┐
│ # Obtener todas las estadísticas │
│ stats = utils.get_descriptive_stats(df, column='edad') │
│ │
│ print(f"Media: {stats['mean']:.2f}") │
│ print(f"Mediana: {stats['median']:.2f}") │
│ print(f"Desv. Est.: {stats['std']:.2f}") │
│ print(f"IQR: {stats['iqr']:.2f}") │
│ print(f"Asimetría: {stats['skewness']:.3f}") │
│ print(f"Curtosis: {stats['kurtosis']:.3f}") │
└──────────────────────────────────────────────────────────────────────────┘

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

🎯 CARACTERÍSTICAS CLAVE:

✓ Múltiples backends de visualización (matplotlib, seaborn, plotly)
✓ Guardado automático de figuras en alta resolución
✓ Análisis estadísticos robustos
✓ Detección de outliers con 3 métodos
✓ Intervalos de confianza paramétricos y bootstrap
✓ Visualizaciones profesionales listas para publicación
✓ Manejo automático de valores faltantes
✓ Integración perfecta con pandas y numpy
✓ Gráficos interactivos con Plotly

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

📊 BACKENDS DE VISUALIZACIÓN:

🔹 Matplotlib:
• Rápido y ligero
• Ideal para gráficos simples
• Mejor para exportar a archivos

🔹 Seaborn:
• Gráficos estadísticos elegantes
• Temas predefinidos atractivos
• Mejor para análisis exploratorio

🔹 Plotly:
• Gráficos interactivos
• Zoom, pan, hover tooltips
• Ideal para presentaciones y dashboards

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

💡 CONSEJOS Y MEJORES PRÁCTICAS:

1. Siempre verificar normalidad antes de usar métodos paramétricos
2. Usar bootstrap para IC cuando los datos no son normales
3. Detectar outliers antes de calcular estadísticas
4. Guardar figuras en alta resolución (300 DPI) para publicaciones
5. Usar Plotly para presentaciones interactivas
6. Usar seaborn para análisis exploratorio rápido

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

📚 DOCUMENTACIÓN ADICIONAL:
Para más información sobre métodos específicos, use:
help(UtilsStats.nombre_metodo)

╚════════════════════════════════════════════════════════════════════════════╝
"""
    print(help_text)
|