statslibx 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
statslib/utils.py ADDED
@@ -0,0 +1,889 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ from typing import Union, List, Optional, Literal, Tuple
5
+ import warnings
6
+ import os
7
+ from scipy import stats
8
+ import seaborn as sns
9
+
10
+
11
class UtilsStats:
    """
    Utility class for common statistical operations and visualization.

    This class provides methods for data validation, basic statistical
    analysis and visualization of results.

    Examples:
    ---------
    >>> utils = UtilsStats()
    >>> data = np.random.normal(0, 1, 100)
    >>> utils.check_normality(data)
    >>> utils.plot_distribution(data)
    """
25
+
26
def __init__(self):
    """Create the helper with its default plotting and figure-saving settings."""
    initial_state = {
        '_plot_backend': 'seaborn',
        '_default_figsize': (12, 5),
        '_save_fig': False,
        '_fig_format': 'png',
        '_fig_dpi': 300,
        '_figures_dir': 'figures',
    }
    for attribute, value in initial_state.items():
        setattr(self, attribute, value)

    # Reset matplotlib to its stock style, then layer this class's rcParams on top.
    plt.style.use('default')
    self._setup_plotting_style()
38
+
39
def _setup_plotting_style(self):
    """Write this instance's default plotting settings into matplotlib's global rcParams."""
    width, height = self._default_figsize[0], self._default_figsize[1]
    settings = {
        'figure.figsize': [width, height],
        'figure.dpi': self._fig_dpi,
        'savefig.dpi': self._fig_dpi,
        'font.size': 10,
        'axes.grid': True,
        'grid.alpha': 0.3,
        'lines.linewidth': 2,
    }
    # Item assignment (rather than a bulk update) keeps per-key handling identical.
    for key, value in settings.items():
        plt.rcParams[key] = value
48
+
49
def set_plot_backend(self, backend: Literal['matplotlib', 'seaborn', 'plotly']):
    """
    Set the default visualization backend.

    Parameters:
    -----------
    backend : str
        Backend name stored on the instance. The value is not validated here.
    """
    setattr(self, '_plot_backend', backend)
54
+
55
def set_default_figsize(self, figsize: Tuple[int, int]):
    """
    Set the default figure size.

    Parameters:
    -----------
    figsize : tuple
        (width, height); stored on the instance and written to matplotlib's
        global 'figure.figsize' rcParam.
    """
    self._default_figsize = figsize
    width, height = figsize[0], figsize[1]
    plt.rcParams['figure.figsize'] = [width, height]
61
+
62
def set_save_fig_options(self, save_fig: Optional[bool] = False,
                         fig_format: str = 'png',
                         fig_dpi: int = 300,
                         figures_dir: str = 'figures'):
    """
    Configure how figures are saved.

    Parameters:
    -----------
    save_fig : bool, optional
        Whether figures are saved by default.
    fig_format : str
        File format, also used as the file extension.
    fig_dpi : int
        Resolution used when saving.
    figures_dir : str
        Directory where figures are written.
    """
    new_options = (
        ('_save_fig', save_fig),
        ('_fig_format', fig_format),
        ('_fig_dpi', fig_dpi),
        ('_figures_dir', figures_dir),
    )
    for attribute, value in new_options:
        setattr(self, attribute, value)
73
+
74
def _save_figure(self, fig, filename: str, **kwargs):
    """
    Save a figure when saving is enabled on the instance.

    Parameters:
    -----------
    fig : object with a ``savefig`` method
        Figure to write.
    filename : str
        File name without extension; the configured format is appended.
    **kwargs
        Extra keyword arguments forwarded to ``fig.savefig``.

    Any exception raised while saving is caught and reported with ``print``.
    """
    if not self._save_fig:
        return

    try:
        os.makedirs(self._figures_dir, exist_ok=True)
        target = os.path.join(self._figures_dir, f"{filename}.{self._fig_format}")
        fixed_options = dict(
            format=self._fig_format,
            dpi=self._fig_dpi,
            bbox_inches='tight',
            facecolor='white',
        )
        fig.savefig(target, **fixed_options, **kwargs)
        print(f"✓ Figura guardada: {target}")
    except Exception as e:
        print(f"✗ Error guardando figura: {e}")
95
+
96
+ # ============= MÉTODOS DE ANÁLISIS ESTADÍSTICO =============
97
+
98
def validate_dataframe(self, data: Union[pd.DataFrame, np.ndarray, list]) -> pd.DataFrame:
    """
    Validate the input and convert it to a DataFrame.

    Parameters:
    -----------
    data : DataFrame, ndarray or list
        A DataFrame is returned unchanged. A 1D array becomes a single column
        named 'var'; a 2D array gets columns 'var_0', 'var_1', ...

    Returns:
    --------
    pd.DataFrame

    Raises:
    -------
    ValueError
        If an array has more than two dimensions (or none).
    TypeError
        If the input is not a DataFrame, ndarray or list.
    """
    if isinstance(data, pd.DataFrame):
        return data
    if isinstance(data, list):
        return pd.DataFrame(data)
    if not isinstance(data, np.ndarray):
        raise TypeError(f"Tipo de dato no soportado: {type(data)}")

    if data.ndim == 1:
        return pd.DataFrame({'var': data})
    if data.ndim == 2:
        column_names = [f'var_{i}' for i in range(data.shape[1])]
        return pd.DataFrame(data, columns=column_names)
    raise ValueError("Solo se soportan arrays 1D y 2D")
113
+
114
def format_number(self, num: float, decimals: int = 6, scientific: bool = False) -> str:
    """
    Format a number with the requested number of decimals.

    Parameters:
    -----------
    num : float
        Value to format.
    decimals : int
        Digits after the decimal point.
    scientific : bool
        When True, values with absolute value below 0.001 use exponent notation.

    Returns:
    --------
    str
    """
    use_exponent = scientific and abs(num) < 0.001
    presentation = 'e' if use_exponent else 'f'
    return format(num, f".{decimals}{presentation}")
119
+
120
def check_normality(self, data: Union[pd.Series, np.ndarray], alpha: float = 0.05) -> dict:
    """
    Check whether the data look normally distributed using the Shapiro-Wilk test.

    Parameters:
    -----------
    data : Series or array-like
        Sample to test; NaN values are removed first.
    alpha : float
        Significance level compared against the p-value.

    Returns:
    --------
    dict
        Keys 'is_normal', 'shapiro_statistic', 'shapiro_pvalue', 'alpha' and
        'interpretation' ('Normal' or 'No Normal').
    """
    if isinstance(data, pd.Series):
        sample = data.dropna().values
    else:
        sample = np.array(data)
        sample = sample[~np.isnan(sample)]

    statistic, p_value = stats.shapiro(sample)
    # The sample is treated as normal when the test does not reject at level alpha.
    looks_normal = p_value > alpha

    return {
        'is_normal': looks_normal,
        'shapiro_statistic': statistic,
        'shapiro_pvalue': p_value,
        'alpha': alpha,
        'interpretation': 'Normal' if looks_normal else 'No Normal',
    }
137
+
138
def calculate_confidence_intervals(self, data: Union[pd.Series, np.ndarray],
                                   confidence_level: float = 0.95,
                                   method: str = 'parametric') -> dict:
    """
    Compute a confidence interval for the mean.

    Parameters:
    -----------
    data : Series or array-like
        Sample values; NaN values are removed first. Plain lists and tuples
        are accepted as well as arrays.
    confidence_level : float
        Confidence level of the interval (e.g. 0.95).
    method : str
        'parametric' (Student t interval) or 'bootstrap' (percentile interval
        from 1000 resamples drawn with ``np.random.choice``).

    Returns:
    --------
    dict
        Keys 'mean', 'std' (ddof=1), 'n', 'confidence_level', 'ci_lower',
        'ci_upper', 'margin_error' and 'method'.

    Raises:
    -------
    ValueError
        If ``method`` is neither 'parametric' nor 'bootstrap'.
    """
    if isinstance(data, pd.Series):
        data_clean = data.dropna().values
    else:
        # Convert first so that lists/tuples can be boolean-indexed, the same
        # way check_normality handles its input.
        data_clean = np.asarray(data)
        data_clean = data_clean[~np.isnan(data_clean)]

    n = len(data_clean)
    mean = np.mean(data_clean)
    std = np.std(data_clean, ddof=1)

    if method == 'parametric':
        standard_error = std / np.sqrt(n)
        # Two-sided critical value of Student's t with n - 1 degrees of freedom.
        t_critical = stats.t.ppf((1 + confidence_level) / 2, n - 1)
        margin_error = t_critical * standard_error

        ci_lower = mean - margin_error
        ci_upper = mean + margin_error

    elif method == 'bootstrap':
        n_bootstraps = 1000
        # One np.random.choice call per resample, same draw order as a plain loop.
        bootstrap_means = [
            np.mean(np.random.choice(data_clean, size=n, replace=True))
            for _ in range(n_bootstraps)
        ]

        alpha = 1 - confidence_level
        ci_lower = np.percentile(bootstrap_means, (alpha / 2) * 100)
        ci_upper = np.percentile(bootstrap_means, (1 - alpha / 2) * 100)
        margin_error = (ci_upper - ci_lower) / 2

    else:
        raise ValueError("Método debe ser 'parametric' o 'bootstrap'")

    return {
        'mean': mean,
        'std': std,
        'n': n,
        'confidence_level': confidence_level,
        'ci_lower': ci_lower,
        'ci_upper': ci_upper,
        'margin_error': margin_error,
        'method': method
    }
187
+
188
def detect_outliers(self, data: Union[pd.Series, np.ndarray],
                    method: Literal['iqr', 'zscore', 'isolation_forest'] = 'iqr',
                    **kwargs) -> np.ndarray:
    """
    Detect outliers using one of several methods.

    Parameters:
    -----------
    data : array-like
        Data to analyze. Plain lists are accepted as well as Series/arrays.
    method : str
        'iqr', 'zscore', or 'isolation_forest'
    **kwargs
        'threshold' (default 3) for 'zscore';
        'contamination' (default 0.1) for 'isolation_forest'.

    Returns:
    --------
    np.ndarray
        Boolean array flagging outliers. NaN values are dropped before the
        analysis, so the mask lines up with the NaN-free data and is shorter
        than the input when the input contains NaN.

    Raises:
    -------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if isinstance(data, pd.Series):
        data = data.values

    # Convert first so that lists/tuples can be boolean-indexed below.
    data = np.asarray(data)
    data_clean = data[~np.isnan(data)]

    if method == 'iqr':
        q1 = np.percentile(data_clean, 25)
        q3 = np.percentile(data_clean, 75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        outliers = (data_clean < lower_bound) | (data_clean > upper_bound)

    elif method == 'zscore':
        threshold = kwargs.get('threshold', 3)
        spread = np.std(data_clean)
        if spread == 0:
            # Constant data: no point deviates from the mean. Skipping the
            # division avoids 0/0 warnings; the resulting mask is all False.
            outliers = np.zeros(data_clean.shape, dtype=bool)
        else:
            z_scores = np.abs((data_clean - np.mean(data_clean)) / spread)
            outliers = z_scores > threshold

    elif method == 'isolation_forest':
        # Imported lazily so scikit-learn is only required for this method.
        from sklearn.ensemble import IsolationForest
        contamination = kwargs.get('contamination', 0.1)
        X = data_clean.reshape(-1, 1)
        clf = IsolationForest(contamination=contamination, random_state=42)
        # fit_predict labels outliers as -1.
        outliers = clf.fit_predict(X) == -1

    else:
        raise ValueError("Método debe ser 'iqr', 'zscore', o 'isolation_forest'")

    return outliers
235
+
236
def calculate_effect_size(self, group1: np.ndarray, group2: np.ndarray,
                          method: Literal['cohen', 'hedges'] = 'cohen') -> dict:
    """
    Compute the effect size between two groups.

    Parameters:
    -----------
    group1, group2 : array-like
        Samples to compare.
    method : str
        'hedges' applies the small-sample correction to Cohen's d; any other
        value returns Cohen's d unchanged.

    Returns:
    --------
    dict
        Keys 'effect_size', 'method', 'interpretation', 'mean_diff' and
        'pooled_std'. The interpretation is 'Muy pequeño' (< 0.2),
        'Pequeño' (< 0.5), 'Mediano' (< 0.8) or 'Grande' by absolute value,
        and 'Indefinido' when the effect size is NaN (for example both
        groups constant with equal means).
    """
    mean1, mean2 = np.mean(group1), np.mean(group2)
    std1, std2 = np.std(group1, ddof=1), np.std(group2, ddof=1)
    n1, n2 = len(group1), len(group2)

    pooled_std = np.sqrt(((n1 - 1) * std1**2 + (n2 - 1) * std2**2) / (n1 + n2 - 2))
    # A zero pooled deviation yields NaN/inf; that case is labelled below
    # instead of being reported through a floating-point warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        cohens_d = (mean1 - mean2) / pooled_std

    if method == 'hedges':
        correction = 1 - (3 / (4 * (n1 + n2) - 9))
        effect_size = cohens_d * correction
    else:
        effect_size = cohens_d

    abs_effect = abs(effect_size)
    if np.isnan(abs_effect):
        # NaN fails every '<' comparison and would otherwise be reported as 'Grande'.
        interpretation = "Indefinido"
    elif abs_effect < 0.2:
        interpretation = "Muy pequeño"
    elif abs_effect < 0.5:
        interpretation = "Pequeño"
    elif abs_effect < 0.8:
        interpretation = "Mediano"
    else:
        interpretation = "Grande"

    return {
        'effect_size': effect_size,
        'method': method,
        'interpretation': interpretation,
        'mean_diff': mean1 - mean2,
        'pooled_std': pooled_std
    }
271
+
272
+ # ============= MÉTODOS DE VISUALIZACIÓN COMPLETOS =============
273
+
274
def _plot_distribution_seaborn(self, data, plot_type, bins, figsize, title, **kwargs):
    """
    Draw a distribution figure with seaborn.

    Parameters:
    -----------
    data : array-like
        Values to plot.
    plot_type : str
        'hist', 'kde', 'box', 'violin' or 'all'. Any other value leaves the
        single axes empty apart from its title.
    bins : int
        Number of histogram bins.
    figsize : tuple
        Figure size for single plots; the 'all' panel always uses (15, 12).
    title : str
        Axes title (single plot) or figure title ('all').
    **kwargs
        Forwarded to the seaborn call for single plots; not used by 'all'.

    Returns:
    --------
    The matplotlib figure created by ``plt.subplots``.
    """
    if plot_type != 'all':
        fig, ax = plt.subplots(figsize=figsize)

        drawers = {
            'hist': lambda: sns.histplot(data, bins=bins, kde=True, ax=ax, **kwargs),
            'kde': lambda: sns.kdeplot(data, ax=ax, **kwargs),
            'box': lambda: sns.boxplot(y=data, ax=ax, **kwargs),
            'violin': lambda: sns.violinplot(y=data, ax=ax, **kwargs),
        }
        draw = drawers.get(plot_type)
        if draw is not None:
            draw()

        ax.set_title(title)
        plt.tight_layout()
        return fig

    # 2x2 overview panel: histogram, box plot, violin plot and Q-Q plot.
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    hist_ax, box_ax = axes[0, 0], axes[0, 1]
    violin_ax, qq_ax = axes[1, 0], axes[1, 1]

    sns.histplot(data, bins=bins, kde=True, ax=hist_ax)
    hist_ax.set_title('Histograma con KDE')

    sns.boxplot(y=data, ax=box_ax)
    box_ax.set_title('Box Plot')

    sns.violinplot(y=data, ax=violin_ax)
    violin_ax.set_title('Violin Plot')

    # The title is set after probplot has drawn on the axes.
    stats.probplot(data, dist="norm", plot=qq_ax)
    qq_ax.set_title('Q-Q Plot')

    fig.suptitle(title, fontsize=16, y=1.00)
    plt.tight_layout()
    return fig
314
+
315
def _plot_distribution_matplotlib(self, data, plot_type, bins, figsize, title, **kwargs):
    """
    Draw a distribution figure with plain matplotlib.

    Parameters:
    -----------
    data : Series or ndarray
        Values to plot (``.min()`` / ``.max()`` are used for the KDE range).
    plot_type : str
        'hist', 'box', 'violin', 'kde' or 'all'. Any other value leaves the
        single axes empty apart from its title and grid.
    bins : int
        Number of histogram bins.
    figsize : tuple
        Figure size for single plots; the 'all' panel always uses (15, 12).
    title : str
        Axes title (single plot) or figure title ('all').
    **kwargs
        Forwarded to ``hist``, ``violinplot`` or the KDE ``plot`` call for
        single plots; not used by 'box' or 'all'.

    Returns:
    --------
    The matplotlib figure created by ``plt.subplots``.
    """
    if plot_type == 'all':
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))

        # Histogram
        axes[0, 0].hist(data, bins=bins, alpha=0.7, edgecolor='black', density=True)
        axes[0, 0].set_title('Histograma')
        axes[0, 0].set_ylabel('Densidad')

        # Box plot
        axes[0, 1].boxplot(data)
        axes[0, 1].set_title('Box Plot')

        # KDE, evaluated once on a 100-point grid
        kde = stats.gaussian_kde(data)
        x_range = np.linspace(data.min(), data.max(), 100)
        density = kde(x_range)
        axes[1, 0].plot(x_range, density)
        axes[1, 0].fill_between(x_range, density, alpha=0.3)
        axes[1, 0].set_title('KDE')
        axes[1, 0].set_ylabel('Densidad')

        # Q-Q plot (title set after probplot has drawn on the axes)
        stats.probplot(data, dist="norm", plot=axes[1, 1])
        axes[1, 1].set_title('Q-Q Plot')

        fig.suptitle(title, fontsize=16)
        plt.tight_layout()

    else:
        fig, ax = plt.subplots(figsize=figsize)

        if plot_type == 'hist':
            ax.hist(data, bins=bins, edgecolor='black', alpha=0.7, **kwargs)
            ax.set_ylabel('Frecuencia')
        elif plot_type == 'box':
            ax.boxplot(data, vert=True)
        elif plot_type == 'violin':
            # Previously unhandled here, which produced an empty axes even
            # though 'violin' is an accepted plot type.
            ax.violinplot(np.asarray(data), **kwargs)
        elif plot_type == 'kde':
            kde = stats.gaussian_kde(data)
            x_range = np.linspace(data.min(), data.max(), 100)
            density = kde(x_range)
            ax.plot(x_range, density, **kwargs)
            ax.fill_between(x_range, density, alpha=0.3)
            ax.set_ylabel('Densidad')

        ax.set_title(title)
        ax.grid(True, alpha=0.3)
        plt.tight_layout()

    return fig
366
+
367
def plot_distribution(self, data: Union[pd.DataFrame, pd.Series, np.ndarray],
                      column: Optional[str] = None,
                      plot_type: Literal['hist', 'kde', 'box', 'violin', 'all'] = 'hist',
                      backend: Optional[Literal['matplotlib', 'seaborn', 'plotly']] = "seaborn",
                      bins: int = 30,
                      figsize: Optional[Tuple[int, int]] = None,
                      save_fig: Optional[bool] = None,
                      filename: Optional[str] = None,
                      **kwargs):
    """
    Plot the distribution of a single variable.

    Parameters:
    -----------
    data : DataFrame, Series or ndarray
        Data to plot; missing values are dropped.
    column : str, optional
        Column to plot (required when data is a DataFrame)
    plot_type : str
        Kind of plot
    backend : str, optional
        Visualization backend. The default is "seaborn"; the instance-level
        backend is only used when a falsy value such as None is passed.
    bins : int
        Number of histogram bins
    figsize : tuple, optional
        Figure size; defaults to the instance setting
    save_fig : bool, optional
        Whether to save the figure; defaults to the instance setting.
        Plotly figures are never saved by this method.
    filename : str, optional
        File name (without extension) used when saving

    Returns:
    --------
    The figure object produced by the selected backend.

    Raises:
    -------
    ValueError
        If data is a DataFrame and ``column`` is missing, or the backend is
        not supported.
    """
    backend = backend or self._plot_backend
    figsize = figsize or self._default_figsize
    if save_fig is None:
        save_fig = self._save_fig

    # Pick the values to plot plus the title and default file name.
    if isinstance(data, pd.DataFrame):
        if column is None:
            raise ValueError("Debe especificar 'column' cuando data es DataFrame")
        plot_data = data[column].dropna()
        title = f"Distribución de {column}"
        default_filename = f"distribucion_{column}"
    elif isinstance(data, pd.Series):
        plot_data = data.dropna()
        title = f"Distribución de {data.name if data.name else 'Variable'}"
        default_filename = f"distribucion_{data.name if data.name else 'variable'}"
    else:
        plot_data = pd.Series(data).dropna()
        title = "Distribución"
        default_filename = "distribucion"

    filename = filename or default_filename

    try:
        if backend == 'plotly':
            # Plotly figures are returned as-is and never written to disk here.
            return self._plot_distribution_plotly(plot_data, plot_type, bins, title, **kwargs)

        if backend == 'seaborn':
            fig = self._plot_distribution_seaborn(plot_data, plot_type, bins, figsize, title, **kwargs)
        elif backend == 'matplotlib':
            fig = self._plot_distribution_matplotlib(plot_data, plot_type, bins, figsize, title, **kwargs)
        else:
            raise ValueError(f"Backend '{backend}' no soportado")

        if save_fig:
            self._save_figure(fig, filename)

        return fig

    except Exception as e:
        # Report the failure, then let the caller see the original exception.
        print(f"Error en plot_distribution: {e}")
        raise
439
+
440
def _plot_distribution_plotly(self, data, plot_type, bins, title, **kwargs):
    """
    Draw a distribution figure with plotly.

    Parameters:
    -----------
    data : array-like
        Values to plot.
    plot_type : str
        'box', 'violin' or 'all'; every other value (including 'hist' and
        'kde') produces a histogram.
    bins : int
        Number of histogram bins.
    title : str
        Figure title.
    **kwargs
        Accepted for signature compatibility with the other backends; not used.

    Returns:
    --------
    The plotly figure.

    Raises:
    -------
    ImportError
        If plotly is not installed.
    """
    try:
        import plotly.graph_objects as go
        import plotly.express as px
        from plotly.subplots import make_subplots
    except ImportError as err:
        # Chain the original error so the failing import stays visible.
        raise ImportError("Plotly no está instalado. Instale con: pip install plotly") from err

    if plot_type == 'all':
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=('Histograma', 'Box Plot', 'Violin Plot', 'Distribución Acumulada')
        )

        # Histogram
        fig.add_trace(go.Histogram(x=data, nbinsx=bins, name='Histograma'), row=1, col=1)

        # Box plot
        fig.add_trace(go.Box(y=data, name='Box Plot'), row=1, col=2)

        # Violin plot
        fig.add_trace(go.Violin(y=data, name='Violin Plot'), row=2, col=1)

        # Cumulative distribution: running sum of density times bin width
        hist, bin_edges = np.histogram(data, bins=bins, density=True)
        cdf = np.cumsum(hist * np.diff(bin_edges))
        fig.add_trace(go.Scatter(x=bin_edges[1:], y=cdf, name='CDF'), row=2, col=2)

        # The panel figure carries the title too, like the seaborn and
        # matplotlib 'all' panels do.
        fig.update_layout(title_text=title)

    elif plot_type == 'box':
        fig = px.box(y=data, title=title)
    elif plot_type == 'violin':
        fig = px.violin(y=data, title=title, box=True)
    else:
        # 'hist' and any unrecognised type fall back to a histogram.
        fig = px.histogram(data, nbins=bins, title=title)

    return fig
480
+
481
def plot_correlation_matrix(self, data: pd.DataFrame,
                            method: str = 'pearson',
                            backend: Optional[Literal['seaborn', 'plotly']] = None,
                            figsize: Optional[Tuple[int, int]] = None,
                            save_fig: Optional[bool] = None,
                            filename: Optional[str] = None,
                            **kwargs):
    """
    Plot a correlation matrix.

    Parameters:
    -----------
    data : DataFrame
        Data used to compute the correlations
    method : str
        'pearson', 'spearman' or 'kendall'
    backend : str, optional
        Visualization backend ('seaborn' or 'plotly'); defaults to the
        instance setting
    figsize : tuple, optional
        Figure size; defaults to the instance setting
    save_fig : bool, optional
        Whether to save the figure; defaults to the instance setting
    filename : str, optional
        File name without extension (default "matriz_correlacion")
    **kwargs
        Forwarded to ``sns.heatmap`` or ``go.Heatmap``

    Returns:
    --------
    The matplotlib or plotly figure.

    Raises:
    -------
    ValueError
        If the resolved backend is neither 'seaborn' nor 'plotly'.
    """
    backend = backend or self._plot_backend
    figsize = figsize or self._default_figsize
    save_fig = save_fig if save_fig is not None else self._save_fig
    filename = filename or "matriz_correlacion"

    # Fail early with a clear message: without this check an unsupported
    # backend (e.g. an instance default of 'matplotlib') left `fig` unbound.
    if backend not in ('seaborn', 'plotly'):
        raise ValueError(f"Backend '{backend}' no soportado")

    # Compute the correlation matrix
    corr_matrix = data.corr(method=method)

    if backend == 'seaborn':
        fig, ax = plt.subplots(figsize=figsize)
        # Mask the upper triangle (diagonal included) so only the lower half is drawn.
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

        sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f',
                    cmap='coolwarm', center=0, ax=ax,
                    square=True, linewidths=0.5, **kwargs)
        ax.set_title(f'Matriz de Correlación ({method})', fontsize=14, pad=20)
        plt.tight_layout()

    else:
        import plotly.graph_objects as go

        fig = go.Figure(data=go.Heatmap(
            z=corr_matrix.values,
            x=corr_matrix.columns,
            y=corr_matrix.index,
            colorscale='RdBu',
            zmid=0,
            text=corr_matrix.values,
            texttemplate='%{text:.2f}',
            textfont={"size": 10},
            **kwargs
        ))

        fig.update_layout(
            title=f'Matriz de Correlación ({method})',
            xaxis_title='Variables',
            yaxis_title='Variables',
            # figsize is scaled by 100 to get the plotly width/height.
            width=figsize[0]*100,
            height=figsize[1]*100
        )

    # Save the figure
    if save_fig:
        if backend == 'seaborn':
            self._save_figure(fig, filename)
        else:
            try:
                os.makedirs(self._figures_dir, exist_ok=True)
                filepath = os.path.join(self._figures_dir, f"{filename}.{self._fig_format}")
                fig.write_image(filepath)
                print(f"✓ Figura Plotly guardada: {filepath}")
            except Exception as e:
                # Best effort: a failed export is reported, not raised.
                print(f"✗ Error guardando figura Plotly: {e}")

    return fig
555
+
556
def plot_scatter_matrix(self, data: pd.DataFrame,
                        columns: Optional[List[str]] = None,
                        backend: Optional[Literal['seaborn', 'plotly', 'pandas']] = None,
                        figsize: Optional[Tuple[int, int]] = None,
                        save_fig: Optional[bool] = None,
                        filename: Optional[str] = None,
                        **kwargs):
    """
    Scatter-plot matrix (pairplot).

    Parameters:
    -----------
    data : DataFrame
        Data to plot
    columns : list of str, optional
        Subset of columns to include
    backend : str, optional
        'seaborn', 'plotly' or 'pandas'; defaults to the instance setting
    figsize : tuple, optional
        Figure size (used by the 'pandas' backend); defaults to the instance
        setting
    save_fig : bool, optional
        Whether to save the figure; defaults to the instance setting
    filename : str, optional
        File name without extension (default "scatter_matrix")
    **kwargs
        Forwarded to the backend's plotting function

    Returns:
    --------
    The object produced by the backend: the ``sns.pairplot`` result for
    'seaborn', a plotly figure for 'plotly', a matplotlib figure for 'pandas'.

    Raises:
    -------
    ValueError
        If the resolved backend is not one of the supported names.
    """
    backend = backend or self._plot_backend
    figsize = figsize or self._default_figsize
    save_fig = save_fig if save_fig is not None else self._save_fig
    filename = filename or "scatter_matrix"

    # Fail early with a clear message: without this check an unsupported
    # backend (e.g. an instance default of 'matplotlib') left `fig` unbound.
    if backend not in ('seaborn', 'plotly', 'pandas'):
        raise ValueError(f"Backend '{backend}' no soportado")

    if columns:
        data = data[columns]

    if backend == 'seaborn':
        fig = sns.pairplot(data, **kwargs)
        fig.fig.suptitle('Matriz de Dispersión', y=1.02)

    elif backend == 'plotly':
        import plotly.express as px
        fig = px.scatter_matrix(data, **kwargs)
        fig.update_layout(title='Matriz de Dispersión')

    else:
        from pandas.plotting import scatter_matrix
        fig, ax = plt.subplots(figsize=figsize)
        scatter_matrix(data, ax=ax, **kwargs)

    # Save the figure
    if save_fig:
        if backend in ['seaborn', 'pandas']:
            # Use the object's `.figure` attribute when present, otherwise the object itself.
            self._save_figure(fig.figure if hasattr(fig, 'figure') else fig, filename)
        else:
            try:
                os.makedirs(self._figures_dir, exist_ok=True)
                filepath = os.path.join(self._figures_dir, f"{filename}.{self._fig_format}")
                fig.write_image(filepath)
                print(f"✓ Figura Plotly guardada: {filepath}")
            except Exception as e:
                # Best effort: a failed export is reported, not raised.
                print(f"✗ Error guardando figura Plotly: {e}")

    return fig
602
+
603
+ # ============= GRÁFICOS CON INTERVALOS DE CONFIANZA =============
604
+
605
def plot_distribution_with_ci(self,
                              data: Union[pd.DataFrame, pd.Series, np.ndarray],
                              column: Optional[str] = None,
                              confidence_level: float = 0.95,
                              ci_method: str = 'parametric',
                              bins: int = 30,
                              figsize: Optional[Tuple[int, int]] = None,
                              save_fig: Optional[bool] = None,
                              filename: Optional[str] = None,
                              **kwargs) -> plt.Figure:
    """
    Plot a distribution together with the confidence interval of its mean.

    The figure has two panels: histogram + KDE + mean on the left, and the
    same plot with the confidence interval shaded on the right. When the
    Shapiro-Wilk check does not reject normality, the right panel also shows
    the normal density with the sample mean and standard deviation.

    Parameters:
    -----------
    data : DataFrame, Series or ndarray
        Data to plot; missing values are dropped.
    column : str, optional
        Column to plot (required when data is a DataFrame)
    confidence_level : float
        Confidence level of the interval
    ci_method : str
        Method passed to ``calculate_confidence_intervals``
    bins : int
        Number of histogram bins
    figsize : tuple, optional
        Figure size (default (14, 6))
    save_fig : bool, optional
        Whether to save the figure; defaults to the instance setting
    filename : str, optional
        File name without extension; defaults to a name derived from the
        variable name
    **kwargs
        Accepted but not used.

    Returns:
    --------
    plt.Figure

    Raises:
    -------
    ValueError
        If data is a DataFrame and ``column`` is missing.
    """
    # Extract and clean the data
    if isinstance(data, pd.DataFrame):
        if column is None:
            raise ValueError("Debe especificar 'column' cuando data es DataFrame")
        plot_data = data[column].dropna()
        data_name = column
    elif isinstance(data, pd.Series):
        plot_data = data.dropna()
        data_name = data.name if data.name else 'Variable'
    else:
        plot_data = pd.Series(data).dropna()
        data_name = 'Variable'

    data_array = plot_data.values
    # str() keeps non-string labels (e.g. integer column names) from breaking .lower()
    default_filename = f"distribucion_ci_{str(data_name).lower().replace(' ', '_')}"
    filename = filename or default_filename

    # Statistics and confidence interval
    ci_result = self.calculate_confidence_intervals(data_array, confidence_level, ci_method)
    normality_result = self.check_normality(data_array)
    mean, std = ci_result['mean'], ci_result['std']
    ci_lower, ci_upper = ci_result['ci_lower'], ci_result['ci_upper']

    # Figure with two side-by-side panels
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize or (14, 6))

    # ===== FIRST PANEL: basic distribution =====
    _, bin_edges, _ = ax1.hist(data_array, bins=bins, alpha=0.7,
                               color='skyblue', edgecolor='black',
                               density=True, label='Histograma')

    # KDE, evaluated once and reused by both panels
    kde = stats.gaussian_kde(data_array)
    x_range = np.linspace(data_array.min(), data_array.max(), 200)
    density = kde(x_range)
    ax1.plot(x_range, density, 'r-', linewidth=2, label='KDE')

    # Vertical line at the mean
    ax1.axvline(mean, color='red', linestyle='--',
                linewidth=2, label=f'Media: {mean:.2f}')

    ax1.set_xlabel('Valores')
    ax1.set_ylabel('Densidad')
    ax1.set_title(f'Distribución de {data_name}\n'
                  f'Media: {mean:.2f}, '
                  f'Desv. Est.: {std:.2f}')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # ===== SECOND PANEL: distribution with confidence interval =====
    # Reuse the first panel's bin edges so both histograms share the same bins.
    ax2.hist(data_array, bins=bin_edges, alpha=0.7,
             color='lightgreen', edgecolor='black',
             density=True, label='Histograma')

    ax2.plot(x_range, density, 'r-', linewidth=2, label='KDE')

    # Mean
    ax2.axvline(mean, color='red', linestyle='-',
                linewidth=3, label=f'Media: {mean:.2f}')

    # Shaded confidence interval and its limits
    ax2.axvspan(ci_lower, ci_upper,
                alpha=0.3, color='orange',
                label=f'IC {confidence_level*100}%: [{ci_lower:.2f}, {ci_upper:.2f}]')
    ax2.axvline(ci_lower, color='orange', linestyle='--', linewidth=2)
    ax2.axvline(ci_upper, color='orange', linestyle='--', linewidth=2)

    # Theoretical normal curve, only when the normality check passes
    if normality_result['is_normal']:
        normal_y = stats.norm.pdf(x_range, mean, std)
        ax2.plot(x_range, normal_y, 'g--', linewidth=2, alpha=0.7,
                 label='Distribución Normal Teórica')

    ax2.set_xlabel('Valores')
    ax2.set_ylabel('Densidad')
    ax2.set_title(f'Distribución con Intervalos de Confianza\n'
                  f'Método: {ci_method}, n={ci_result["n"]}')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # Summary box in the bottom-left corner of the figure
    info_text = (f'Estadísticas:\n'
                 f'• Media: {mean:.3f}\n'
                 f'• Desv. Est.: {std:.3f}\n'
                 f'• n: {ci_result["n"]}\n'
                 f'• IC {confidence_level*100}%: [{ci_lower:.3f}, {ci_upper:.3f}]\n'
                 f'• Margen Error: ±{ci_result["margin_error"]:.3f}\n'
                 f'• Normalidad: {normality_result["interpretation"]}\n'
                 f'• p-value: {normality_result["shapiro_pvalue"]:.4f}')

    fig.text(0.02, 0.02, info_text, fontsize=9,
             bbox=dict(boxstyle="round,pad=0.5", facecolor="lightgray", alpha=0.7),
             verticalalignment='bottom')

    plt.tight_layout()

    # Save the figure when enabled
    save_fig = save_fig if save_fig is not None else self._save_fig
    if save_fig:
        self._save_figure(fig, filename)

    return fig
721
+
722
def plot_multiple_distributions_with_ci(self,
                                        data_dict: dict,
                                        confidence_level: float = 0.95,
                                        figsize: Optional[Tuple[int, int]] = None,
                                        save_fig: Optional[bool] = None,
                                        filename: Optional[str] = None,
                                        **kwargs) -> plt.Figure:
    """
    Plot several distributions, each with the confidence interval of its mean.

    One row is drawn per entry of ``data_dict``: histogram + KDE + mean on the
    left, and the same plot with the confidence interval shaded on the right.

    Parameters:
    -----------
    data_dict : dict
        Maps a display name to a Series or array-like sample; NaN values are
        removed from each sample.
    confidence_level : float
        Confidence level of the intervals
    figsize : tuple, optional
        Figure size (default: 14 wide, 5 per row)
    save_fig : bool, optional
        Whether to save the figure; defaults to the instance setting
    filename : str, optional
        File name without extension (default "multiples_distribuciones_ci")
    **kwargs
        Accepted but not used.

    Returns:
    --------
    plt.Figure
    """
    n_distributions = len(data_dict)
    # squeeze=False always yields a 2D axes grid, even for a single row.
    fig, axes = plt.subplots(n_distributions, 2, squeeze=False,
                             figsize=figsize or (14, 5 * n_distributions))

    palette = plt.cm.Set3(np.linspace(0, 1, n_distributions))

    for (name, values), (left_ax, right_ax), color in zip(data_dict.items(), axes, palette):
        if isinstance(values, pd.Series):
            sample = values.dropna().values
        else:
            sample = np.array(values)
            sample = sample[~np.isnan(sample)]

        ci = self.calculate_confidence_intervals(sample, confidence_level)
        mean, lower, upper = ci['mean'], ci['ci_lower'], ci['ci_upper']

        kde = stats.gaussian_kde(sample)
        grid = np.linspace(sample.min(), sample.max(), 200)
        density = kde(grid)

        # Left panel: basic distribution
        left_ax.hist(sample, bins=30, alpha=0.7, color=color,
                     edgecolor='black', density=True)
        left_ax.plot(grid, density, 'k-', linewidth=2)
        left_ax.axvline(mean, color='red', linestyle='--', linewidth=2)
        left_ax.set_title(f'{name}\nMedia: {mean:.2f}')
        left_ax.grid(True, alpha=0.3)

        # Right panel: same distribution with the confidence interval
        right_ax.hist(sample, bins=30, alpha=0.7, color=color,
                      edgecolor='black', density=True)
        right_ax.plot(grid, density, 'k-', linewidth=2)
        right_ax.axvline(mean, color='red', linestyle='-', linewidth=3)
        right_ax.axvspan(lower, upper, alpha=0.3, color='orange')
        for limit in (lower, upper):
            right_ax.axvline(limit, color='orange', linestyle='--', linewidth=2)
        right_ax.set_title(f'{name} con IC {confidence_level*100}%')
        right_ax.grid(True, alpha=0.3)

    plt.tight_layout()

    should_save = self._save_fig if save_fig is None else save_fig
    if should_save:
        self._save_figure(fig, filename or "multiples_distribuciones_ci")

    return fig
788
+
789
+ # ============= MÉTODOS UTILITARIOS ADICIONALES =============
790
+
791
def get_descriptive_stats(self, data: Union[pd.DataFrame, pd.Series, np.ndarray],
                          column: Optional[str] = None) -> dict:
    """
    Compute a full set of descriptive statistics.

    Parameters:
    -----------
    data : DataFrame, Series or array-like
        Input data; missing values are dropped.
    column : str, optional
        Column to describe (required when data is a DataFrame)

    Returns:
    --------
    dict
        Keys 'count', 'mean', 'median', 'mode', 'std' (ddof=1),
        'variance' (ddof=1), 'min', 'max', 'q1', 'q3', 'iqr', 'skewness',
        'kurtosis' and 'range'.

    Raises:
    -------
    ValueError
        If data is a DataFrame and ``column`` is missing.
    """
    if isinstance(data, pd.DataFrame):
        if column is None:
            raise ValueError("Debe especificar 'column' cuando data es DataFrame")
        data_series = data[column]
    elif isinstance(data, pd.Series):
        data_series = data
    else:
        data_series = pd.Series(data)

    data_clean = data_series.dropna()

    if len(data_clean) > 0:
        # Depending on the SciPy version, stats.mode returns either a
        # one-element array or a scalar for 1D input; np.ravel handles both,
        # whereas indexing with [0][0] fails on the scalar form.
        mode_value = np.ravel(stats.mode(data_clean.to_numpy())[0])[0]
    else:
        mode_value = np.nan

    # Quartiles are computed once and reused for the IQR.
    q1 = np.percentile(data_clean, 25)
    q3 = np.percentile(data_clean, 75)
    minimum = np.min(data_clean)
    maximum = np.max(data_clean)

    return {
        'count': len(data_clean),
        'mean': np.mean(data_clean),
        'median': np.median(data_clean),
        'mode': mode_value,
        'std': np.std(data_clean, ddof=1),
        'variance': np.var(data_clean, ddof=1),
        'min': minimum,
        'max': maximum,
        'q1': q1,
        'q3': q3,
        'iqr': q3 - q1,
        'skewness': stats.skew(data_clean),
        'kurtosis': stats.kurtosis(data_clean),
        'range': maximum - minimum
    }
823
+
824
def help(self):
    """
    Print the full help text for the UtilsStats class.
    """
    # The text below is user-facing output and is printed exactly as written.
    help_text = """
📊 CLASE UtilsStats - AYUDA COMPLETA

Clase utilitaria para análisis estadísticos y visualización de datos.

🔧 MÉTODOS PRINCIPALES:

1. 📈 ANÁLISIS ESTADÍSTICO:
• check_normality() # Test de normalidad
• calculate_confidence_intervals() # Intervalos de confianza
• detect_outliers() # Detección de outliers
• calculate_effect_size() # Tamaño del efecto
• get_descriptive_stats() # Estadísticas descriptivas

2. 🎨 VISUALIZACIÓN:
• plot_distribution() # Gráficos de distribución
• plot_distribution_with_ci() # Distribución con IC
• plot_multiple_distributions_with_ci() # Múltiples distribuciones
• plot_correlation_matrix() # Matriz de correlación
• plot_scatter_matrix() # Matriz de dispersión

3. ⚙️ CONFIGURACIÓN:
• set_plot_backend() # Backend de visualización
• set_default_figsize() # Tamaño de figura
• set_save_fig_options() # Opciones para guardar

4. 🛠️ UTILIDADES:
• validate_dataframe() # Validación de datos
• format_number() # Formateo de números

💡 EJEMPLOS DE USO:

# Inicializar
utils = UtilsStats()

# Análisis de normalidad
normalidad = utils.check_normality(mis_datos)

# Gráfico con intervalos de confianza
fig = utils.plot_distribution_with_ci(
data=mis_datos,
confidence_level=0.95,
bins=20
)

# Matriz de correlación
fig_corr = utils.plot_correlation_matrix(
data=mi_dataframe,
method='pearson'
)

# Estadísticas descriptivas
stats = utils.get_descriptive_stats(mis_datos)

🎯 CARACTERÍSTICAS:
• Múltiples backends: matplotlib, seaborn, plotly
• Guardado automático de figuras
• Manejo robusto de datos faltantes
• Visualizaciones profesionales listas para publicación
• Integración perfecta con Jupyter notebooks
"""
    print(help_text)