statslibx 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {statslib → statslibx}/__init__.py +5 -4
- {statslib → statslibx}/descriptive.py +182 -31
- statslibx/inferential.py +974 -0
- statslibx/utils.py +1180 -0
- {statslibx-0.1.0.dist-info → statslibx-0.1.2.dist-info}/METADATA +34 -3
- statslibx-0.1.2.dist-info/RECORD +8 -0
- statslibx-0.1.2.dist-info/top_level.txt +1 -0
- statslib/inferential.py +0 -547
- statslib/utils.py +0 -889
- statslibx-0.1.0.dist-info/RECORD +0 -8
- statslibx-0.1.0.dist-info/top_level.txt +0 -1
- {statslibx-0.1.0.dist-info → statslibx-0.1.2.dist-info}/WHEEL +0 -0
statslib/utils.py
DELETED
|
@@ -1,889 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
import pandas as pd
|
|
3
|
-
import matplotlib.pyplot as plt
|
|
4
|
-
from typing import Union, List, Optional, Literal, Tuple
|
|
5
|
-
import warnings
|
|
6
|
-
import os
|
|
7
|
-
from scipy import stats
|
|
8
|
-
import seaborn as sns
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class UtilsStats:
|
|
12
|
-
"""
|
|
13
|
-
Clase utilitaria para operaciones estadísticas comunes y visualización
|
|
14
|
-
|
|
15
|
-
Esta clase proporciona métodos para validación de datos, análisis estadísticos
|
|
16
|
-
básicos y visualización de resultados.
|
|
17
|
-
|
|
18
|
-
Examples:
|
|
19
|
-
---------
|
|
20
|
-
>>> utils = UtilsStats()
|
|
21
|
-
>>> data = np.random.normal(0, 1, 100)
|
|
22
|
-
>>> utils.check_normality(data)
|
|
23
|
-
>>> utils.plot_distribution(data)
|
|
24
|
-
"""
|
|
25
|
-
|
|
26
|
-
def __init__(self):
    """Initialise the helper with its default plotting and saving options."""
    # Plot appearance defaults.
    self._plot_backend, self._default_figsize = 'seaborn', (12, 5)

    # Figure-export defaults (export is off until explicitly enabled).
    self._save_fig = False
    self._fig_format, self._fig_dpi = 'png', 300
    self._figures_dir = 'figures'

    # Reset matplotlib to its stock style, then layer this class's rcParams on top.
    plt.style.use('default')
    self._setup_plotting_style()
|
|
38
|
-
|
|
39
|
-
def _setup_plotting_style(self):
    """Push the instance defaults (figure size, dpi) and fixed style values into rcParams."""
    width, height = self._default_figsize[0], self._default_figsize[1]
    settings = (
        ('figure.figsize', [width, height]),
        ('figure.dpi', self._fig_dpi),
        ('savefig.dpi', self._fig_dpi),
        ('font.size', 10),
        ('axes.grid', True),
        ('grid.alpha', 0.3),
        ('lines.linewidth', 2),
    )
    # Assign one key at a time, in the same order as before.
    for key, value in settings:
        plt.rcParams[key] = value
|
|
48
|
-
|
|
49
|
-
def set_plot_backend(self, backend: Literal['matplotlib', 'seaborn', 'plotly']):
    """Store ``backend`` as the instance-wide default plotting backend.

    The value is stored as given; no validation happens here.
    """
    setattr(self, '_plot_backend', backend)
|
|
54
|
-
|
|
55
|
-
def set_default_figsize(self, figsize: Tuple[int, int]):
    """Remember ``figsize`` as the default and mirror it into matplotlib's rcParams."""
    self._default_figsize = figsize
    width, height = figsize[0], figsize[1]
    plt.rcParams['figure.figsize'] = [width, height]
|
|
61
|
-
|
|
62
|
-
def set_save_fig_options(self, save_fig: Optional[bool] = False,
                         fig_format: str = 'png',
                         fig_dpi: int = 300,
                         figures_dir: str = 'figures'):
    """Configure whether and how figures are written to disk.

    Every argument is stored as given on the instance. Calling this with
    no arguments resets all four options to their defaults.
    """
    options = (
        ('_save_fig', save_fig),
        ('_fig_format', fig_format),
        ('_fig_dpi', fig_dpi),
        ('_figures_dir', figures_dir),
    )
    for attribute, value in options:
        setattr(self, attribute, value)
|
|
73
|
-
|
|
74
|
-
def _save_figure(self, fig, filename: str, **kwargs):
|
|
75
|
-
"""
|
|
76
|
-
Guardar figura si save_fig está activado
|
|
77
|
-
"""
|
|
78
|
-
if self._save_fig:
|
|
79
|
-
try:
|
|
80
|
-
os.makedirs(self._figures_dir, exist_ok=True)
|
|
81
|
-
filepath = os.path.join(self._figures_dir, f"{filename}.{self._fig_format}")
|
|
82
|
-
|
|
83
|
-
fig.savefig(
|
|
84
|
-
filepath,
|
|
85
|
-
format=self._fig_format,
|
|
86
|
-
dpi=self._fig_dpi,
|
|
87
|
-
bbox_inches='tight',
|
|
88
|
-
facecolor='white',
|
|
89
|
-
**kwargs
|
|
90
|
-
)
|
|
91
|
-
print(f"✓ Figura guardada: {filepath}")
|
|
92
|
-
|
|
93
|
-
except Exception as e:
|
|
94
|
-
print(f"✗ Error guardando figura: {e}")
|
|
95
|
-
|
|
96
|
-
# ============= MÉTODOS DE ANÁLISIS ESTADÍSTICO =============
|
|
97
|
-
|
|
98
|
-
def validate_dataframe(self, data: Union[pd.DataFrame, np.ndarray, list]) -> pd.DataFrame:
    """Return ``data`` as a DataFrame.

    A DataFrame is returned unchanged (same object). A 1-D array becomes a
    single column named ``var``; a 2-D array gets columns ``var_0``,
    ``var_1``, ...; a list is handed to ``pd.DataFrame`` as is.

    Raises:
        ValueError: for arrays that are not 1-D or 2-D.
        TypeError: for any other input type.
    """
    if isinstance(data, pd.DataFrame):
        return data
    if isinstance(data, list):
        return pd.DataFrame(data)
    if not isinstance(data, np.ndarray):
        raise TypeError(f"Tipo de dato no soportado: {type(data)}")

    # From here on ``data`` is an ndarray; dispatch on dimensionality.
    dimensions = data.ndim
    if dimensions == 1:
        return pd.DataFrame({'var': data})
    if dimensions == 2:
        column_names = [f'var_{i}' for i in range(data.shape[1])]
        return pd.DataFrame(data, columns=column_names)
    raise ValueError("Solo se soportan arrays 1D y 2D")
|
|
113
|
-
|
|
114
|
-
def format_number(self, num: float, decimals: int = 6, scientific: bool = False) -> str:
    """Format ``num`` with ``decimals`` digits after the decimal point.

    Scientific notation is used only when ``scientific`` is true and
    ``abs(num) < 0.001``; otherwise fixed-point notation is used.
    """
    presentation = 'e' if scientific and abs(num) < 0.001 else 'f'
    return f"{num:.{decimals}{presentation}}"
|
|
119
|
-
|
|
120
|
-
def check_normality(self, data: Union[pd.Series, np.ndarray], alpha: float = 0.05) -> dict:
    """Run a Shapiro-Wilk test on ``data`` after dropping NaN values.

    Returns a dict with keys ``is_normal`` (p-value > ``alpha``),
    ``shapiro_statistic``, ``shapiro_pvalue``, ``alpha`` and
    ``interpretation`` (``'Normal'`` or ``'No Normal'``).
    """
    if isinstance(data, pd.Series):
        sample = data.dropna().values
    else:
        sample = np.array(data)
        sample = sample[~np.isnan(sample)]

    statistic, p_value = stats.shapiro(sample)
    looks_normal = p_value > alpha

    result = {}
    result['is_normal'] = looks_normal
    result['shapiro_statistic'] = statistic
    result['shapiro_pvalue'] = p_value
    result['alpha'] = alpha
    result['interpretation'] = 'Normal' if looks_normal else 'No Normal'
    return result
|
|
137
|
-
|
|
138
|
-
def calculate_confidence_intervals(self, data: Union[pd.Series, np.ndarray],
                                   confidence_level: float = 0.95,
                                   method: str = 'parametric') -> dict:
    """Compute a confidence interval for the mean of ``data``.

    NaN values are dropped first. Besides Series and ndarrays, plain
    sequences such as lists are accepted (they are converted with
    ``np.asarray``).

    Parameters:
    -----------
    data : Series, ndarray or sequence of numbers
        Sample to analyse.
    confidence_level : float
        Confidence level, e.g. ``0.95``.
    method : str
        ``'parametric'`` uses the Student t distribution with ``n - 1``
        degrees of freedom; ``'bootstrap'`` uses the percentile interval of
        1000 resampled means drawn with ``np.random.choice`` (so results
        depend on NumPy's global random state).

    Returns:
    --------
    dict
        Keys ``mean``, ``std`` (sample standard deviation, ``ddof=1``),
        ``n``, ``confidence_level``, ``ci_lower``, ``ci_upper``,
        ``margin_error`` and ``method``.

    Raises:
    -------
    ValueError
        If ``method`` is neither ``'parametric'`` nor ``'bootstrap'``.
    """
    if isinstance(data, pd.Series):
        data_clean = data.dropna().values
    else:
        # Convert first: boolean-mask indexing only works on ndarrays, so a
        # plain list used to fail here with a TypeError.
        values = np.asarray(data, dtype=float)
        data_clean = values[~np.isnan(values)]

    n = len(data_clean)
    mean = np.mean(data_clean)
    std = np.std(data_clean, ddof=1)

    if method == 'parametric':
        standard_error = std / np.sqrt(n)
        # Two-sided critical value of the t distribution (not a z value).
        t_critical = stats.t.ppf((1 + confidence_level) / 2, n - 1)
        margin_error = t_critical * standard_error

        ci_lower = mean - margin_error
        ci_upper = mean + margin_error

    elif method == 'bootstrap':
        n_bootstraps = 1000
        # One np.random.choice call per resample, exactly as before.
        bootstrap_means = [
            np.mean(np.random.choice(data_clean, size=n, replace=True))
            for _ in range(n_bootstraps)
        ]

        alpha = 1 - confidence_level
        ci_lower = np.percentile(bootstrap_means, (alpha / 2) * 100)
        ci_upper = np.percentile(bootstrap_means, (1 - alpha / 2) * 100)
        # The percentile interval need not be symmetric about the mean, so
        # the "margin" is reported as half the interval width.
        margin_error = (ci_upper - ci_lower) / 2

    else:
        raise ValueError("Método debe ser 'parametric' o 'bootstrap'")

    return {
        'mean': mean,
        'std': std,
        'n': n,
        'confidence_level': confidence_level,
        'ci_lower': ci_lower,
        'ci_upper': ci_upper,
        'margin_error': margin_error,
        'method': method
    }
|
|
187
|
-
|
|
188
|
-
def detect_outliers(self, data: Union[pd.Series, np.ndarray],
                    method: Literal['iqr', 'zscore', 'isolation_forest'] = 'iqr',
                    **kwargs) -> np.ndarray:
    """Flag outliers in ``data`` using one of several methods.

    NaN values are removed before analysis, so the returned mask lines up
    with the NaN-free values (in their original order), not with the raw
    input when it contains NaN.

    Parameters:
    -----------
    data : Series, ndarray or sequence of numbers
        Data to analyse.
    method : str
        ``'iqr'`` flags values outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``;
        ``'zscore'`` flags values whose absolute z-score exceeds
        ``threshold`` (keyword argument, default 3);
        ``'isolation_forest'`` uses scikit-learn's ``IsolationForest`` with
        ``contamination`` (keyword argument, default 0.1).

    Returns:
    --------
    np.ndarray
        Boolean array, ``True`` where a value is considered an outlier.

    Raises:
    -------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if isinstance(data, pd.Series):
        data = data.values

    # Convert first so lists/tuples work; mask indexing needs an ndarray.
    values = np.asarray(data, dtype=float)
    data_clean = values[~np.isnan(values)]

    if method == 'iqr':
        q1 = np.percentile(data_clean, 25)
        q3 = np.percentile(data_clean, 75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        outliers = (data_clean < lower_bound) | (data_clean > upper_bound)

    elif method == 'zscore':
        threshold = kwargs.get('threshold', 3)
        spread = np.std(data_clean)
        if spread == 0:
            # Constant data: dividing by zero used to give NaN z-scores (and
            # a RuntimeWarning), which never exceed the threshold. Return the
            # same all-False mask without the invalid division.
            outliers = np.zeros(data_clean.shape, dtype=bool)
        else:
            z_scores = np.abs((data_clean - np.mean(data_clean)) / spread)
            outliers = z_scores > threshold

    elif method == 'isolation_forest':
        # Imported lazily so scikit-learn is only needed for this method.
        from sklearn.ensemble import IsolationForest
        contamination = kwargs.get('contamination', 0.1)
        X = data_clean.reshape(-1, 1)
        clf = IsolationForest(contamination=contamination, random_state=42)
        # fit_predict labels the points it considers outliers with -1.
        outliers = clf.fit_predict(X) == -1

    else:
        raise ValueError("Método debe ser 'iqr', 'zscore', o 'isolation_forest'")

    return outliers
|
|
235
|
-
|
|
236
|
-
def calculate_effect_size(self, group1: np.ndarray, group2: np.ndarray,
                          method: Literal['cohen', 'hedges'] = 'cohen') -> dict:
    """Compute a standardised mean difference between two groups.

    Cohen's d is the difference of means divided by the pooled sample
    standard deviation. With ``method='hedges'`` it is multiplied by
    ``1 - 3 / (4 * (n1 + n2) - 9)``. Any other ``method`` value is treated
    like ``'cohen'``.

    Returns a dict with ``effect_size``, ``method``, ``interpretation``
    (a Spanish size label), ``mean_diff`` and ``pooled_std``.
    """
    size_a, size_b = len(group1), len(group2)
    mean_diff = np.mean(group1) - np.mean(group2)

    spread_a = np.std(group1, ddof=1)
    spread_b = np.std(group2, ddof=1)
    weighted_variance = (size_a - 1) * spread_a**2 + (size_b - 1) * spread_b**2
    pooled_std = np.sqrt(weighted_variance / (size_a + size_b - 2))

    effect_size = mean_diff / pooled_std
    if method == 'hedges':
        effect_size = effect_size * (1 - (3 / (4 * (size_a + size_b) - 9)))

    # Label by magnitude; the first upper bound the value falls under wins.
    magnitude = abs(effect_size)
    labelled_bounds = ((0.2, "Muy pequeño"), (0.5, "Pequeño"), (0.8, "Mediano"))
    for upper_bound, label in labelled_bounds:
        if magnitude < upper_bound:
            interpretation = label
            break
    else:
        interpretation = "Grande"

    return dict(
        effect_size=effect_size,
        method=method,
        interpretation=interpretation,
        mean_diff=mean_diff,
        pooled_std=pooled_std,
    )
|
|
271
|
-
|
|
272
|
-
# ============= MÉTODOS DE VISUALIZACIÓN COMPLETOS =============
|
|
273
|
-
|
|
274
|
-
def _plot_distribution_seaborn(self, data, plot_type, bins, figsize, title, **kwargs):
    """Draw the distribution of ``data`` with seaborn and return the figure.

    ``plot_type='all'`` builds a fixed 15x12 grid (histogram with KDE, box
    plot, violin plot, Q-Q plot) and ignores ``figsize`` and ``kwargs``.
    Any other value draws a single plot on a ``figsize`` figure, forwarding
    ``kwargs`` to the seaborn call; an unrecognised ``plot_type`` leaves the
    axes empty apart from the title.
    """
    if plot_type != 'all':
        fig, ax = plt.subplots(figsize=figsize)

        # One drawing routine per supported single-plot type.
        painters = {
            'hist': lambda: sns.histplot(data, bins=bins, kde=True, ax=ax, **kwargs),
            'kde': lambda: sns.kdeplot(data, ax=ax, **kwargs),
            'box': lambda: sns.boxplot(y=data, ax=ax, **kwargs),
            'violin': lambda: sns.violinplot(y=data, ax=ax, **kwargs),
        }
        paint = painters.get(plot_type)
        if paint is not None:
            paint()

        ax.set_title(title)
        plt.tight_layout()
        return fig

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    hist_ax, box_ax = axes[0, 0], axes[0, 1]
    violin_ax, qq_ax = axes[1, 0], axes[1, 1]

    # Top row: histogram with KDE overlay, then box plot.
    sns.histplot(data, bins=bins, kde=True, ax=hist_ax)
    hist_ax.set_title('Histograma con KDE')

    sns.boxplot(y=data, ax=box_ax)
    box_ax.set_title('Box Plot')

    # Bottom row: violin plot, then Q-Q plot against a normal distribution.
    sns.violinplot(y=data, ax=violin_ax)
    violin_ax.set_title('Violin Plot')

    stats.probplot(data, dist="norm", plot=qq_ax)
    qq_ax.set_title('Q-Q Plot')

    fig.suptitle(title, fontsize=16, y=1.00)
    plt.tight_layout()
    return fig
|
|
314
|
-
|
|
315
|
-
def _plot_distribution_matplotlib(self, data, plot_type, bins, figsize, title, **kwargs):
    """Draw the distribution of ``data`` with plain matplotlib and return the figure.

    ``plot_type='all'`` builds a fixed 15x12 grid (density histogram, box
    plot, KDE curve, Q-Q plot) and ignores ``figsize`` and ``kwargs``.
    Otherwise a single plot is drawn on a ``figsize`` figure:

    * ``'hist'``   - frequency histogram; ``kwargs`` go to ``Axes.hist``.
    * ``'box'``    - vertical box plot.
    * ``'kde'``    - Gaussian KDE curve; ``kwargs`` go to ``Axes.plot``.
    * ``'violin'`` - violin plot. This type was not handled before and
      produced an empty axes.

    ``data`` must provide ``.min()`` and ``.max()`` for the KDE plots.
    An unrecognised ``plot_type`` leaves the axes empty apart from the
    title and grid.
    """
    # Local import kept from the original implementation.
    from scipy.stats import gaussian_kde

    if plot_type == 'all':
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))

        # Histogram (normalised to a density).
        axes[0, 0].hist(data, bins=bins, alpha=0.7, edgecolor='black', density=True)
        axes[0, 0].set_title('Histograma')
        axes[0, 0].set_ylabel('Densidad')

        # Box plot
        axes[0, 1].boxplot(data)
        axes[0, 1].set_title('Box Plot')

        # KDE: evaluate the density once and reuse it for line and fill.
        kde = gaussian_kde(data)
        x_range = np.linspace(data.min(), data.max(), 100)
        density = kde(x_range)
        axes[1, 0].plot(x_range, density)
        axes[1, 0].fill_between(x_range, density, alpha=0.3)
        axes[1, 0].set_title('KDE')
        axes[1, 0].set_ylabel('Densidad')

        # Q-Q plot against a normal distribution.
        stats.probplot(data, dist="norm", plot=axes[1, 1])
        axes[1, 1].set_title('Q-Q Plot')

        fig.suptitle(title, fontsize=16)
        plt.tight_layout()

    else:
        fig, ax = plt.subplots(figsize=figsize)

        if plot_type == 'hist':
            ax.hist(data, bins=bins, edgecolor='black', alpha=0.7, **kwargs)
            ax.set_ylabel('Frecuencia')
        elif plot_type == 'box':
            ax.boxplot(data, vert=True)
        elif plot_type == 'kde':
            kde = gaussian_kde(data)
            x_range = np.linspace(data.min(), data.max(), 100)
            density = kde(x_range)
            ax.plot(x_range, density, **kwargs)
            ax.fill_between(x_range, density, alpha=0.3)
            ax.set_ylabel('Densidad')
        elif plot_type == 'violin':
            ax.violinplot(data)

        ax.set_title(title)
        ax.grid(True, alpha=0.3)
        plt.tight_layout()

    return fig
|
|
366
|
-
|
|
367
|
-
def plot_distribution(self, data: Union[pd.DataFrame, pd.Series, np.ndarray],
                      column: Optional[str] = None,
                      plot_type: Literal['hist', 'kde', 'box', 'violin', 'all'] = 'hist',
                      backend: Optional[Literal['matplotlib', 'seaborn', 'plotly']] = None,
                      bins: int = 30,
                      figsize: Optional[Tuple[int, int]] = None,
                      save_fig: Optional[bool] = None,
                      filename: Optional[str] = None,
                      **kwargs):
    """
    Plot the distribution of a single variable.

    Parameters:
    -----------
    data : DataFrame, Series or ndarray
        Data to plot; NaN values are dropped.
    column : str, optional
        Column to plot (required when ``data`` is a DataFrame).
    plot_type : str
        Kind of plot: 'hist', 'kde', 'box', 'violin' or 'all'.
    backend : str, optional
        Plotting backend. ``None`` (the default) uses the instance backend
        set with ``set_plot_backend``. The default used to be the literal
        "seaborn", which meant the instance setting was never consulted;
        since the instance backend itself starts as 'seaborn', callers that
        never changed it see no difference.
    bins : int
        Number of histogram bins.
    figsize : tuple, optional
        Figure size; defaults to the instance default.
    save_fig : bool, optional
        Whether to save the figure; defaults to the instance setting.
        Plotly figures are not saved by this method.
    filename : str, optional
        File name without extension; a name derived from the column or
        Series name is used when omitted.

    Returns the figure object produced by the selected backend.

    Raises:
    -------
    ValueError
        If ``data`` is a DataFrame and ``column`` is None, or if the
        backend is not supported.
    """
    backend = backend or self._plot_backend
    figsize = figsize or self._default_figsize
    save_fig = save_fig if save_fig is not None else self._save_fig

    # Extract the values to plot plus a title and default file name.
    if isinstance(data, pd.DataFrame):
        if column is None:
            raise ValueError("Debe especificar 'column' cuando data es DataFrame")
        plot_data = data[column].dropna()
        title = f"Distribución de {column}"
        default_filename = f"distribucion_{column}"
    elif isinstance(data, pd.Series):
        plot_data = data.dropna()
        title = f"Distribución de {data.name if data.name else 'Variable'}"
        default_filename = f"distribucion_{data.name if data.name else 'variable'}"
    else:
        plot_data = pd.Series(data).dropna()
        title = "Distribución"
        default_filename = "distribucion"

    filename = filename or default_filename

    try:
        if backend == 'seaborn':
            fig = self._plot_distribution_seaborn(plot_data, plot_type, bins, figsize, title, **kwargs)
        elif backend == 'matplotlib':
            fig = self._plot_distribution_matplotlib(plot_data, plot_type, bins, figsize, title, **kwargs)
        elif backend == 'plotly':
            # The plotly helper takes no figsize argument.
            fig = self._plot_distribution_plotly(plot_data, plot_type, bins, title, **kwargs)
        else:
            raise ValueError(f"Backend '{backend}' no soportado")

        # _save_figure relies on fig.savefig, so plotly figures are skipped.
        if save_fig and backend != 'plotly':
            self._save_figure(fig, filename)

        return fig

    except Exception as e:
        # Report and re-raise so callers still see the original exception.
        print(f"Error en plot_distribution: {e}")
        raise
|
|
439
|
-
|
|
440
|
-
def _plot_distribution_plotly(self, data, plot_type, bins, title, **kwargs):
    """Draw the distribution of ``data`` with plotly and return the figure.

    ``plot_type='all'`` builds a 2x2 grid (histogram, box plot, violin
    plot, cumulative distribution); ``title`` is not applied in that
    layout. ``'box'`` and ``'violin'`` produce the matching single plot;
    every other value (including ``'hist'`` and ``'kde'``) produces a
    histogram. ``kwargs`` is accepted but not used.

    Raises ImportError when plotly is not installed.
    """
    try:
        import plotly.graph_objects as go
        import plotly.express as px
        from plotly.subplots import make_subplots
    except ImportError:
        raise ImportError("Plotly no está instalado. Instale con: pip install plotly")

    # Single-plot variants first; anything unrecognised becomes a histogram.
    if plot_type == 'box':
        return px.box(y=data, title=title)
    if plot_type == 'violin':
        return px.violin(y=data, title=title, box=True)
    if plot_type != 'all':
        return px.histogram(data, nbins=bins, title=title)

    panel_titles = ('Histograma', 'Box Plot', 'Violin Plot', 'Distribución Acumulada')
    fig = make_subplots(rows=2, cols=2, subplot_titles=panel_titles)

    fig.add_trace(go.Histogram(x=data, nbinsx=bins, name='Histograma'), row=1, col=1)
    fig.add_trace(go.Box(y=data, name='Box Plot'), row=1, col=2)
    fig.add_trace(go.Violin(y=data, name='Violin Plot'), row=2, col=1)

    # Empirical CDF from the density histogram: cumulative sum of bin
    # density times bin width, plotted at each bin's right edge.
    density, edges = np.histogram(data, bins=bins, density=True)
    cumulative = np.cumsum(density * np.diff(edges))
    fig.add_trace(go.Scatter(x=edges[1:], y=cumulative, name='CDF'), row=2, col=2)

    return fig
|
|
480
|
-
|
|
481
|
-
def plot_correlation_matrix(self, data: pd.DataFrame,
                            method: str = 'pearson',
                            backend: Optional[Literal['seaborn', 'plotly']] = None,
                            figsize: Optional[Tuple[int, int]] = None,
                            save_fig: Optional[bool] = None,
                            filename: Optional[str] = None,
                            **kwargs):
    """
    Plot the correlation matrix of ``data`` as a heatmap.

    Parameters:
    -----------
    data : DataFrame
        Data used to compute the correlations (``data.corr``).
    method : str
        'pearson', 'spearman' or 'kendall'.
    backend : str, optional
        'seaborn' or 'plotly'; defaults to the instance backend.
    figsize : tuple, optional
        Figure size; for plotly it is multiplied by 100 to get pixels.
    save_fig : bool, optional
        Whether to save the figure; defaults to the instance setting.
    filename : str, optional
        File name without extension (default "matriz_correlacion").
    **kwargs
        Forwarded to ``sns.heatmap`` or ``go.Heatmap``.

    Returns the matplotlib or plotly figure.

    Raises:
    -------
    ValueError
        If the resolved backend is neither 'seaborn' nor 'plotly'. Before,
        an instance backend such as 'matplotlib' fell through both branches
        and the method failed with UnboundLocalError at ``return fig``.
    """
    backend = backend or self._plot_backend
    figsize = figsize or self._default_figsize
    save_fig = save_fig if save_fig is not None else self._save_fig
    filename = filename or "matriz_correlacion"

    # Fail early with a clear message instead of reaching an unbound ``fig``.
    if backend not in ('seaborn', 'plotly'):
        raise ValueError(f"Backend '{backend}' no soportado")

    corr_matrix = data.corr(method=method)

    if backend == 'seaborn':
        fig, ax = plt.subplots(figsize=figsize)
        # Hide the upper triangle (diagonal included); the matrix is symmetric.
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

        sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f',
                    cmap='coolwarm', center=0, ax=ax,
                    square=True, linewidths=0.5, **kwargs)
        ax.set_title(f'Matriz de Correlación ({method})', fontsize=14, pad=20)
        plt.tight_layout()

    else:  # backend == 'plotly'
        import plotly.graph_objects as go

        fig = go.Figure(data=go.Heatmap(
            z=corr_matrix.values,
            x=corr_matrix.columns,
            y=corr_matrix.index,
            colorscale='RdBu',
            zmid=0,
            text=corr_matrix.values,
            texttemplate='%{text:.2f}',
            textfont={"size": 10},
            **kwargs
        ))

        fig.update_layout(
            title=f'Matriz de Correlación ({method})',
            xaxis_title='Variables',
            yaxis_title='Variables',
            width=figsize[0]*100,
            height=figsize[1]*100
        )

    if save_fig:
        if backend == 'seaborn':
            self._save_figure(fig, filename)
        else:
            # Plotly figures have no savefig; write_image is used instead.
            # Saving stays best-effort: failures are printed, not raised.
            try:
                os.makedirs(self._figures_dir, exist_ok=True)
                filepath = os.path.join(self._figures_dir, f"{filename}.{self._fig_format}")
                fig.write_image(filepath)
                print(f"✓ Figura Plotly guardada: {filepath}")
            except Exception as e:
                print(f"✗ Error guardando figura Plotly: {e}")

    return fig
|
|
555
|
-
|
|
556
|
-
def plot_scatter_matrix(self, data: pd.DataFrame,
                        columns: Optional[List[str]] = None,
                        backend: Optional[Literal['seaborn', 'plotly', 'pandas']] = None,
                        figsize: Optional[Tuple[int, int]] = None,
                        save_fig: Optional[bool] = None,
                        filename: Optional[str] = None,
                        **kwargs):
    """
    Plot a scatter-plot matrix (pair plot) of ``data``.

    Parameters:
    -----------
    data : DataFrame
        Data to plot.
    columns : list of str, optional
        Subset of columns to include; all columns when omitted or empty.
    backend : str, optional
        'seaborn', 'plotly' or 'pandas'; defaults to the instance backend.
    figsize : tuple, optional
        Figure size; only used by the 'pandas' backend.
    save_fig : bool, optional
        Whether to save the figure; defaults to the instance setting.
    filename : str, optional
        File name without extension (default "scatter_matrix").
    **kwargs
        Forwarded to the backend's plotting function.

    Returns the seaborn grid, the plotly figure, or the matplotlib figure,
    depending on the backend.

    Raises:
    -------
    ValueError
        If the resolved backend is not supported. Before, an instance
        backend such as 'matplotlib' matched no branch and the method
        failed with UnboundLocalError at ``return fig``.
    """
    backend = backend or self._plot_backend
    figsize = figsize or self._default_figsize
    save_fig = save_fig if save_fig is not None else self._save_fig
    filename = filename or "scatter_matrix"

    if backend not in ('seaborn', 'plotly', 'pandas'):
        raise ValueError(f"Backend '{backend}' no soportado")

    # len() instead of truthiness so array-like selections (whose truth
    # value is ambiguous) also work.
    if columns is not None and len(columns) > 0:
        data = data[columns]

    if backend == 'seaborn':
        fig = sns.pairplot(data, **kwargs)
        # Use the grid's ``figure`` attribute when it exists and fall back
        # to the older ``fig`` attribute otherwise.
        grid_figure = fig.figure if hasattr(fig, 'figure') else fig.fig
        grid_figure.suptitle('Matriz de Dispersión', y=1.02)

    elif backend == 'plotly':
        import plotly.express as px
        fig = px.scatter_matrix(data, **kwargs)
        fig.update_layout(title='Matriz de Dispersión')

    else:  # backend == 'pandas'
        from pandas.plotting import scatter_matrix
        fig, ax = plt.subplots(figsize=figsize)
        scatter_matrix(data, ax=ax, **kwargs)

    if save_fig:
        if backend in ['seaborn', 'pandas']:
            # For seaborn ``fig`` is a grid object; save its underlying figure.
            self._save_figure(fig.figure if hasattr(fig, 'figure') else fig, filename)
        else:
            # Plotly: best-effort export via write_image; failures are printed.
            try:
                os.makedirs(self._figures_dir, exist_ok=True)
                filepath = os.path.join(self._figures_dir, f"{filename}.{self._fig_format}")
                fig.write_image(filepath)
                print(f"✓ Figura Plotly guardada: {filepath}")
            except Exception as e:
                print(f"✗ Error guardando figura Plotly: {e}")

    return fig
|
|
602
|
-
|
|
603
|
-
# ============= GRÁFICOS CON INTERVALOS DE CONFIANZA =============
|
|
604
|
-
|
|
605
|
-
def plot_distribution_with_ci(self,
                              data: Union[pd.DataFrame, pd.Series, np.ndarray],
                              column: Optional[str] = None,
                              confidence_level: float = 0.95,
                              ci_method: str = 'parametric',
                              bins: int = 30,
                              figsize: Optional[Tuple[int, int]] = None,
                              save_fig: Optional[bool] = None,
                              filename: Optional[str] = None,
                              **kwargs) -> plt.Figure:
    """
    Plot a distribution next to the same distribution annotated with the
    confidence interval of its mean.

    The left panel shows a density histogram, a Gaussian KDE and the mean.
    The right panel adds the confidence interval (shaded band plus dashed
    limits) and, when the Shapiro-Wilk test does not reject normality, the
    fitted normal density. A statistics summary is written in the figure's
    lower-left corner.

    Parameters:
    -----------
    data : DataFrame, Series or ndarray
        Data to plot; NaN values are dropped.
    column : str, optional
        Column to plot (required when ``data`` is a DataFrame).
    confidence_level : float
        Confidence level for the interval.
    ci_method : str
        Passed to ``calculate_confidence_intervals`` ('parametric' or
        'bootstrap').
    bins : int
        Number of histogram bins.
    figsize : tuple, optional
        Figure size (default ``(14, 6)``).
    save_fig : bool, optional
        Whether to save the figure; defaults to the instance setting.
    filename : str, optional
        File name without extension; derived from the variable name when
        omitted.
    **kwargs
        Accepted but not used.

    Raises:
    -------
    ValueError
        If ``data`` is a DataFrame and ``column`` is None.
    """
    # Extract and clean the data.
    if isinstance(data, pd.DataFrame):
        if column is None:
            raise ValueError("Debe especificar 'column' cuando data es DataFrame")
        plot_data = data[column].dropna()
        data_name = column
    elif isinstance(data, pd.Series):
        plot_data = data.dropna()
        data_name = data.name if data.name else 'Variable'
    else:
        plot_data = pd.Series(data).dropna()
        data_name = 'Variable'

    data_array = plot_data.values
    # str(): column and Series names need not be strings (e.g. integer
    # labels), and .lower() on those raised AttributeError.
    default_filename = f"distribucion_ci_{str(data_name).lower().replace(' ', '_')}"
    filename = filename or default_filename

    # Statistics shared by both panels.
    ci_result = self.calculate_confidence_intervals(data_array, confidence_level, ci_method)
    normality_result = self.check_normality(data_array)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize or (14, 6))

    # The KDE curve is identical in both panels, so evaluate it once.
    kde = stats.gaussian_kde(data_array)
    x_range = np.linspace(data_array.min(), data_array.max(), 200)
    kde_curve = kde(x_range)

    # ===== FIRST PANEL: plain distribution =====
    # Keep the bin edges (without overwriting the ``bins`` argument) so the
    # second histogram uses exactly the same binning.
    _, bin_edges, _ = ax1.hist(data_array, bins=bins, alpha=0.7,
                               color='skyblue', edgecolor='black',
                               density=True, label='Histograma')

    ax1.plot(x_range, kde_curve, 'r-', linewidth=2, label='KDE')

    ax1.axvline(ci_result['mean'], color='red', linestyle='--',
                linewidth=2, label=f'Media: {ci_result["mean"]:.2f}')

    ax1.set_xlabel('Valores')
    ax1.set_ylabel('Densidad')
    ax1.set_title(f'Distribución de {data_name}\n'
                  f'Media: {ci_result["mean"]:.2f}, '
                  f'Desv. Est.: {ci_result["std"]:.2f}')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # ===== SECOND PANEL: distribution with confidence interval =====
    ax2.hist(data_array, bins=bin_edges, alpha=0.7,
             color='lightgreen', edgecolor='black',
             density=True, label='Histograma')

    ax2.plot(x_range, kde_curve, 'r-', linewidth=2, label='KDE')

    ax2.axvline(ci_result['mean'], color='red', linestyle='-',
                linewidth=3, label=f'Media: {ci_result["mean"]:.2f}')

    # Shaded band for the interval, dashed lines at its limits.
    ax2.axvspan(ci_result['ci_lower'], ci_result['ci_upper'],
                alpha=0.3, color='orange',
                label=f'IC {confidence_level*100}%: [{ci_result["ci_lower"]:.2f}, {ci_result["ci_upper"]:.2f}]')

    ax2.axvline(ci_result['ci_lower'], color='orange', linestyle='--', linewidth=2)
    ax2.axvline(ci_result['ci_upper'], color='orange', linestyle='--', linewidth=2)

    # Theoretical normal density, only when normality was not rejected.
    if normality_result['is_normal']:
        normal_y = stats.norm.pdf(x_range, ci_result['mean'], ci_result['std'])
        ax2.plot(x_range, normal_y, 'g--', linewidth=2, alpha=0.7,
                 label='Distribución Normal Teórica')

    ax2.set_xlabel('Valores')
    ax2.set_ylabel('Densidad')
    ax2.set_title(f'Distribución con Intervalos de Confianza\n'
                  f'Método: {ci_method}, n={ci_result["n"]}')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # Summary box in the figure's lower-left corner.
    info_text = (f'Estadísticas:\n'
                 f'• Media: {ci_result["mean"]:.3f}\n'
                 f'• Desv. Est.: {ci_result["std"]:.3f}\n'
                 f'• n: {ci_result["n"]}\n'
                 f'• IC {confidence_level*100}%: [{ci_result["ci_lower"]:.3f}, {ci_result["ci_upper"]:.3f}]\n'
                 f'• Margen Error: ±{ci_result["margin_error"]:.3f}\n'
                 f'• Normalidad: {normality_result["interpretation"]}\n'
                 f'• p-value: {normality_result["shapiro_pvalue"]:.4f}')

    fig.text(0.02, 0.02, info_text, fontsize=9,
             bbox=dict(boxstyle="round,pad=0.5", facecolor="lightgray", alpha=0.7),
             verticalalignment='bottom')

    plt.tight_layout()

    # Save the figure when enabled (explicit argument wins over the instance setting).
    save_fig = save_fig if save_fig is not None else self._save_fig
    if save_fig:
        self._save_figure(fig, filename)

    return fig
|
|
721
|
-
|
|
722
|
-
def plot_multiple_distributions_with_ci(self,
                                        data_dict: dict,
                                        confidence_level: float = 0.95,
                                        figsize: Optional[Tuple[int, int]] = None,
                                        save_fig: Optional[bool] = None,
                                        filename: Optional[str] = None,
                                        **kwargs) -> plt.Figure:
    """
    Plot several distributions, one row each, with the confidence interval
    of their means.

    ``data_dict`` maps a display name to a Series or array-like of values;
    NaN values are dropped. Each row shows the plain distribution on the
    left and the same distribution with the interval on the right. The
    interval is computed with the default method of
    ``calculate_confidence_intervals``. ``kwargs`` is accepted but not used.
    """
    row_count = len(data_dict)
    default_size = (14, 5 * row_count)
    fig, axes = plt.subplots(row_count, 2, figsize=figsize or default_size)

    # With a single row ``subplots`` returns a 1-D array; normalise to 2-D.
    if row_count == 1:
        axes = axes.reshape(1, -1)

    palette = plt.cm.Set3(np.linspace(0, 1, row_count))

    for row, (name, data) in enumerate(data_dict.items()):
        plain_ax, ci_ax = axes[row]
        row_color = palette[row]

        if isinstance(data, pd.Series):
            sample = data.dropna().values
        else:
            sample = np.array(data)
            sample = sample[~np.isnan(sample)]

        ci_result = self.calculate_confidence_intervals(sample, confidence_level)
        mean = ci_result['mean']
        lower, upper = ci_result['ci_lower'], ci_result['ci_upper']

        kde = stats.gaussian_kde(sample)
        x_range = np.linspace(sample.min(), sample.max(), 200)

        # Left panel: histogram, KDE and mean.
        plain_ax.hist(sample, bins=30, alpha=0.7, color=row_color,
                      edgecolor='black', density=True)
        plain_ax.plot(x_range, kde(x_range), 'k-', linewidth=2)
        plain_ax.axvline(mean, color='red', linestyle='--', linewidth=2)
        plain_ax.set_title(f'{name}\nMedia: {mean:.2f}')
        plain_ax.grid(True, alpha=0.3)

        # Right panel: same plot plus the confidence interval.
        ci_ax.hist(sample, bins=30, alpha=0.7, color=row_color,
                   edgecolor='black', density=True)
        ci_ax.plot(x_range, kde(x_range), 'k-', linewidth=2)
        ci_ax.axvline(mean, color='red', linestyle='-', linewidth=3)
        ci_ax.axvspan(lower, upper, alpha=0.3, color='orange')
        for limit in (lower, upper):
            ci_ax.axvline(limit, color='orange', linestyle='--', linewidth=2)
        ci_ax.set_title(f'{name} con IC {confidence_level*100}%')
        ci_ax.grid(True, alpha=0.3)

    plt.tight_layout()

    # Explicit argument wins over the instance-wide setting.
    should_save = self._save_fig if save_fig is None else save_fig
    if should_save:
        filename = filename or "multiples_distribuciones_ci"
        self._save_figure(fig, filename)

    return fig
|
|
788
|
-
|
|
789
|
-
# ============= MÉTODOS UTILITARIOS ADICIONALES =============
|
|
790
|
-
|
|
791
|
-
def get_descriptive_stats(self, data: Union[pd.DataFrame, pd.Series, np.ndarray],
                          column: Optional[str] = None) -> dict:
    """
    Compute descriptive statistics for a single variable.

    Missing values are dropped (``Series.dropna``) before anything is
    computed.

    Parameters
    ----------
    data : DataFrame, Series or array-like
        Input observations. Anything that is not a DataFrame or a Series
        is wrapped in ``pd.Series``.
    column : str, optional
        Column to summarise; required only when ``data`` is a DataFrame.

    Returns
    -------
    dict
        Keys, in order: ``count``, ``mean``, ``median``, ``mode``, ``std``,
        ``variance``, ``min``, ``max``, ``q1``, ``q3``, ``iqr``,
        ``skewness``, ``kurtosis``, ``range``.
        ``std`` and ``variance`` use ``ddof=1``; ``skewness`` and
        ``kurtosis`` come from ``scipy.stats.skew`` / ``scipy.stats.kurtosis``
        with their default arguments. When ties exist, ``mode`` is the
        smallest of the most frequent values.
        If no non-missing observations remain, ``count`` is 0 and every
        other entry is ``np.nan``.

    Raises
    ------
    ValueError
        If ``data`` is a DataFrame and ``column`` is not given.
    """
    if isinstance(data, pd.DataFrame):
        if column is None:
            raise ValueError("Debe especificar 'column' cuando data es DataFrame")
        data_series = data[column]
    elif isinstance(data, pd.Series):
        data_series = data
    else:
        data_series = pd.Series(data)

    values = data_series.dropna().to_numpy()

    stat_keys = ('count', 'mean', 'median', 'mode', 'std', 'variance',
                 'min', 'max', 'q1', 'q3', 'iqr', 'skewness', 'kurtosis',
                 'range')

    # Reductions such as min/max/percentile are undefined on empty input,
    # so answer explicitly instead of letting one of them raise.
    if values.size == 0:
        summary = dict.fromkeys(stat_keys, np.nan)
        summary['count'] = 0
        return summary

    # The mode is derived from np.unique rather than by indexing the result
    # of stats.mode, whose return shape (array vs. scalar) differs between
    # SciPy releases. np.unique returns sorted values and argmax picks the
    # first maximum, so ties resolve to the smallest value.
    uniques, counts = np.unique(values, return_counts=True)
    mode_value = uniques[counts.argmax()]

    # Compute quartiles and extremes once and reuse them below.
    q1, q3 = np.percentile(values, [25, 75])
    lowest = np.min(values)
    highest = np.max(values)

    return {
        'count': len(values),
        'mean': np.mean(values),
        'median': np.median(values),
        'mode': mode_value,
        'std': np.std(values, ddof=1),
        'variance': np.var(values, ddof=1),
        'min': lowest,
        'max': highest,
        'q1': q1,
        'q3': q3,
        'iqr': q3 - q1,
        'skewness': stats.skew(values),
        'kurtosis': stats.kurtosis(values),
        'range': highest - lowest,
    }
|
|
823
|
-
|
|
824
|
-
def help(self) -> None:
    """
    Print the complete help text for the UtilsStats class.

    The text is a fixed, Spanish-language overview (method catalogue,
    usage examples and feature list) written to standard output via
    ``print``; nothing is returned.
    """
    # NOTE(review): the literal below is user-facing runtime text and is
    # reproduced exactly as it appears in the reviewed rendering, which did
    # not preserve leading whitespace or column alignment. Confirm against
    # the original file before relying on the exact output layout.
    help_text = """
📊 CLASE UtilsStats - AYUDA COMPLETA

Clase utilitaria para análisis estadísticos y visualización de datos.

🔧 MÉTODOS PRINCIPALES:

1. 📈 ANÁLISIS ESTADÍSTICO:
• check_normality() # Test de normalidad
• calculate_confidence_intervals() # Intervalos de confianza
• detect_outliers() # Detección de outliers
• calculate_effect_size() # Tamaño del efecto
• get_descriptive_stats() # Estadísticas descriptivas

2. 🎨 VISUALIZACIÓN:
• plot_distribution() # Gráficos de distribución
• plot_distribution_with_ci() # Distribución con IC
• plot_multiple_distributions_with_ci() # Múltiples distribuciones
• plot_correlation_matrix() # Matriz de correlación
• plot_scatter_matrix() # Matriz de dispersión

3. ⚙️ CONFIGURACIÓN:
• set_plot_backend() # Backend de visualización
• set_default_figsize() # Tamaño de figura
• set_save_fig_options() # Opciones para guardar

4. 🛠️ UTILIDADES:
• validate_dataframe() # Validación de datos
• format_number() # Formateo de números

💡 EJEMPLOS DE USO:

# Inicializar
utils = UtilsStats()

# Análisis de normalidad
normalidad = utils.check_normality(mis_datos)

# Gráfico con intervalos de confianza
fig = utils.plot_distribution_with_ci(
data=mis_datos,
confidence_level=0.95,
bins=20
)

# Matriz de correlación
fig_corr = utils.plot_correlation_matrix(
data=mi_dataframe,
method='pearson'
)

# Estadísticas descriptivas
stats = utils.get_descriptive_stats(mis_datos)

🎯 CARACTERÍSTICAS:
• Múltiples backends: matplotlib, seaborn, plotly
• Guardado automático de figuras
• Manejo robusto de datos faltantes
• Visualizaciones profesionales listas para publicación
• Integración perfecta con Jupyter notebooks
"""
    print(help_text)
|