statslibx 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- statslibx/__init__.py +2 -2
- statslibx/datasets/__init__.py +1 -0
- statslibx/datasets/course_completion.csv +100001 -0
- statslibx/descriptive.py +274 -148
- statslibx/inferential.py +139 -72
- statslibx/utils.py +288 -82
- {statslibx-0.1.4.dist-info → statslibx-0.1.6.dist-info}/METADATA +1 -1
- statslibx-0.1.6.dist-info/RECORD +14 -0
- statslibx-0.1.4.dist-info/RECORD +0 -13
- {statslibx-0.1.4.dist-info → statslibx-0.1.6.dist-info}/WHEEL +0 -0
- {statslibx-0.1.4.dist-info → statslibx-0.1.6.dist-info}/top_level.txt +0 -0
statslibx/utils.py
CHANGED
|
@@ -6,6 +6,7 @@ import warnings
|
|
|
6
6
|
import os
|
|
7
7
|
from scipy import stats
|
|
8
8
|
import seaborn as sns
|
|
9
|
+
from pathlib import Path
|
|
9
10
|
|
|
10
11
|
|
|
11
12
|
class UtilsStats:
|
|
@@ -13,11 +14,15 @@ class UtilsStats:
|
|
|
13
14
|
Clase utilitaria para operaciones estadísticas comunes y visualización
|
|
14
15
|
|
|
15
16
|
Esta clase proporciona métodos para validación de datos, análisis estadísticos
|
|
16
|
-
básicos y visualización de resultados.
|
|
17
|
+
básicos y visualización de resultados. Ahora con soporte para leer archivos directamente.
|
|
17
18
|
|
|
18
19
|
Examples:
|
|
19
20
|
---------
|
|
20
21
|
>>> utils = UtilsStats()
|
|
22
|
+
>>> # Desde archivo
|
|
23
|
+
>>> data = utils.load_data("datos.csv")
|
|
24
|
+
>>> utils.check_normality(data, column='edad')
|
|
25
|
+
>>> # Desde array
|
|
21
26
|
>>> data = np.random.normal(0, 1, 100)
|
|
22
27
|
>>> utils.check_normality(data)
|
|
23
28
|
>>> utils.plot_distribution(data)
|
|
@@ -47,34 +52,26 @@ class UtilsStats:
|
|
|
47
52
|
plt.rcParams['lines.linewidth'] = 2
|
|
48
53
|
|
|
49
54
|
def set_plot_backend(self, backend: Literal['matplotlib', 'seaborn', 'plotly']):
|
|
50
|
-
"""
|
|
51
|
-
Establecer el backend de visualización por defecto
|
|
52
|
-
"""
|
|
55
|
+
"""Establecer el backend de visualización por defecto"""
|
|
53
56
|
self._plot_backend = backend
|
|
54
57
|
|
|
55
58
|
def set_default_figsize(self, figsize: Tuple[int, int]):
|
|
56
|
-
"""
|
|
57
|
-
Establecer el tamaño de figura por defecto
|
|
58
|
-
"""
|
|
59
|
+
"""Establecer el tamaño de figura por defecto"""
|
|
59
60
|
self._default_figsize = figsize
|
|
60
61
|
plt.rcParams['figure.figsize'] = [figsize[0], figsize[1]]
|
|
61
62
|
|
|
62
63
|
def set_save_fig_options(self, save_fig: Optional[bool] = False,
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
"""
|
|
67
|
-
Configurar opciones para guardar figuras
|
|
68
|
-
"""
|
|
64
|
+
fig_format: str = 'png',
|
|
65
|
+
fig_dpi: int = 300,
|
|
66
|
+
figures_dir: str = 'figures'):
|
|
67
|
+
"""Configurar opciones para guardar figuras"""
|
|
69
68
|
self._save_fig = save_fig
|
|
70
69
|
self._fig_format = fig_format
|
|
71
70
|
self._fig_dpi = fig_dpi
|
|
72
71
|
self._figures_dir = figures_dir
|
|
73
72
|
|
|
74
73
|
def _save_figure(self, fig, filename: str, **kwargs):
|
|
75
|
-
"""
|
|
76
|
-
Guardar figura si save_fig está activado
|
|
77
|
-
"""
|
|
74
|
+
"""Guardar figura si save_fig está activado"""
|
|
78
75
|
if self._save_fig:
|
|
79
76
|
try:
|
|
80
77
|
os.makedirs(self._figures_dir, exist_ok=True)
|
|
@@ -93,10 +90,114 @@ class UtilsStats:
|
|
|
93
90
|
except Exception as e:
|
|
94
91
|
print(f"✗ Error guardando figura: {e}")
|
|
95
92
|
|
|
96
|
-
# ============= MÉTODOS DE
|
|
93
|
+
# ============= NUEVO: MÉTODOS DE CARGA DE DATOS =============
|
|
94
|
+
|
|
95
|
+
def load_data(self, path: Union[str, Path], **kwargs) -> pd.DataFrame:
|
|
96
|
+
"""
|
|
97
|
+
Carga datos desde archivo en múltiples formatos
|
|
98
|
+
|
|
99
|
+
Parameters:
|
|
100
|
+
-----------
|
|
101
|
+
path : str o Path
|
|
102
|
+
Ruta al archivo de datos
|
|
103
|
+
**kwargs : dict
|
|
104
|
+
Argumentos adicionales para la función de lectura de pandas
|
|
105
|
+
|
|
106
|
+
Returns:
|
|
107
|
+
--------
|
|
108
|
+
pd.DataFrame
|
|
109
|
+
DataFrame con los datos cargados
|
|
110
|
+
|
|
111
|
+
Supported formats:
|
|
112
|
+
------------------
|
|
113
|
+
- CSV (.csv)
|
|
114
|
+
- Excel (.xlsx, .xls)
|
|
115
|
+
- Text/TSV (.txt, .tsv)
|
|
116
|
+
- JSON (.json)
|
|
117
|
+
- Parquet (.parquet)
|
|
118
|
+
- Feather (.feather)
|
|
119
|
+
|
|
120
|
+
Examples:
|
|
121
|
+
---------
|
|
122
|
+
>>> utils = UtilsStats()
|
|
123
|
+
>>> df = utils.load_data("datos.csv")
|
|
124
|
+
>>> df = utils.load_data("datos.xlsx", sheet_name="Hoja1")
|
|
125
|
+
>>> df = utils.load_data("datos.json")
|
|
126
|
+
"""
|
|
127
|
+
path = Path(path)
|
|
128
|
+
|
|
129
|
+
if not path.exists():
|
|
130
|
+
raise FileNotFoundError(f"El archivo no existe: {path}")
|
|
131
|
+
|
|
132
|
+
ext = path.suffix.lower()
|
|
133
|
+
|
|
134
|
+
try:
|
|
135
|
+
if ext == ".csv":
|
|
136
|
+
df = pd.read_csv(path, **kwargs)
|
|
137
|
+
|
|
138
|
+
elif ext in [".xlsx", ".xls"]:
|
|
139
|
+
df = pd.read_excel(path, **kwargs)
|
|
140
|
+
|
|
141
|
+
elif ext in [".txt", ".tsv"]:
|
|
142
|
+
df = pd.read_table(path, **kwargs)
|
|
143
|
+
|
|
144
|
+
elif ext == ".json":
|
|
145
|
+
df = pd.read_json(path, **kwargs)
|
|
146
|
+
|
|
147
|
+
elif ext == ".parquet":
|
|
148
|
+
df = pd.read_parquet(path, **kwargs)
|
|
149
|
+
|
|
150
|
+
elif ext == ".feather":
|
|
151
|
+
df = pd.read_feather(path, **kwargs)
|
|
152
|
+
|
|
153
|
+
else:
|
|
154
|
+
raise ValueError(f"Formato de archivo no soportado: {ext}")
|
|
155
|
+
|
|
156
|
+
print(f"✓ Datos cargados exitosamente desde: {path}")
|
|
157
|
+
print(f" Shape: {df.shape}")
|
|
158
|
+
print(f" Columnas: {list(df.columns)}")
|
|
159
|
+
|
|
160
|
+
return df
|
|
161
|
+
|
|
162
|
+
except Exception as e:
|
|
163
|
+
raise Exception(f"Error al cargar el archivo {path}: {str(e)}")
|
|
164
|
+
|
|
165
|
+
def _resolve_data(self, data: Union[pd.DataFrame, pd.Series, np.ndarray, list, str, Path],
|
|
166
|
+
column: Optional[str] = None) -> Tuple[Union[pd.DataFrame, pd.Series, np.ndarray], str]:
|
|
167
|
+
"""
|
|
168
|
+
Resuelve el input de datos: si es una ruta, carga el archivo; si no, usa los datos directamente
|
|
169
|
+
|
|
170
|
+
Returns:
|
|
171
|
+
--------
|
|
172
|
+
Tuple[data, data_source]
|
|
173
|
+
- data: Los datos procesados
|
|
174
|
+
- data_source: String indicando la fuente ('file' o 'memory')
|
|
175
|
+
"""
|
|
176
|
+
# Si es string o Path, intentar cargar como archivo
|
|
177
|
+
if isinstance(data, (str, Path)):
|
|
178
|
+
path = Path(data)
|
|
179
|
+
if path.exists():
|
|
180
|
+
df = self.load_data(path)
|
|
181
|
+
if column is not None and column in df.columns:
|
|
182
|
+
return df[column], 'file'
|
|
183
|
+
return df, 'file'
|
|
184
|
+
else:
|
|
185
|
+
raise FileNotFoundError(f"El archivo no existe: {path}")
|
|
186
|
+
|
|
187
|
+
# Si ya son datos en memoria, devolverlos tal cual
|
|
188
|
+
return data, 'memory'
|
|
189
|
+
|
|
190
|
+
# ============= MÉTODOS DE ANÁLISIS ESTADÍSTICO (ACTUALIZADOS) =============
|
|
97
191
|
|
|
98
|
-
def validate_dataframe(self, data: Union[pd.DataFrame, np.ndarray, list]) -> pd.DataFrame:
|
|
99
|
-
"""
|
|
192
|
+
def validate_dataframe(self, data: Union[pd.DataFrame, np.ndarray, list, str, Path]) -> pd.DataFrame:
|
|
193
|
+
"""
|
|
194
|
+
Valida y convierte datos a DataFrame
|
|
195
|
+
|
|
196
|
+
Ahora acepta también rutas de archivos
|
|
197
|
+
"""
|
|
198
|
+
# Intentar resolver si es un archivo
|
|
199
|
+
data, source = self._resolve_data(data)
|
|
200
|
+
|
|
100
201
|
if isinstance(data, pd.DataFrame):
|
|
101
202
|
return data
|
|
102
203
|
elif isinstance(data, np.ndarray):
|
|
@@ -117,8 +218,36 @@ class UtilsStats:
|
|
|
117
218
|
return f"{num:.{decimals}e}"
|
|
118
219
|
return f"{num:.{decimals}f}"
|
|
119
220
|
|
|
120
|
-
def check_normality(self,
|
|
121
|
-
|
|
221
|
+
def check_normality(self,
|
|
222
|
+
data: Union[pd.Series, np.ndarray, pd.DataFrame, str, Path],
|
|
223
|
+
column: Optional[str] = None,
|
|
224
|
+
alpha: float = 0.05) -> dict:
|
|
225
|
+
"""
|
|
226
|
+
Verifica si los datos siguen distribución normal usando Shapiro-Wilk
|
|
227
|
+
|
|
228
|
+
Parameters:
|
|
229
|
+
-----------
|
|
230
|
+
data : Series, ndarray, DataFrame, str o Path
|
|
231
|
+
Datos a analizar o ruta al archivo
|
|
232
|
+
column : str, optional
|
|
233
|
+
Columna a analizar (si data es DataFrame o archivo)
|
|
234
|
+
alpha : float
|
|
235
|
+
Nivel de significancia
|
|
236
|
+
|
|
237
|
+
Examples:
|
|
238
|
+
---------
|
|
239
|
+
>>> utils.check_normality("datos.csv", column="edad")
|
|
240
|
+
>>> utils.check_normality(np.random.normal(0, 1, 100))
|
|
241
|
+
"""
|
|
242
|
+
# Resolver datos
|
|
243
|
+
data, source = self._resolve_data(data, column)
|
|
244
|
+
|
|
245
|
+
# Extraer array
|
|
246
|
+
if isinstance(data, pd.DataFrame):
|
|
247
|
+
if column is None:
|
|
248
|
+
raise ValueError("Debe especificar 'column' cuando data es DataFrame")
|
|
249
|
+
data = data[column]
|
|
250
|
+
|
|
122
251
|
if isinstance(data, pd.Series):
|
|
123
252
|
data = data.dropna().values
|
|
124
253
|
else:
|
|
@@ -135,16 +264,39 @@ class UtilsStats:
|
|
|
135
264
|
'interpretation': 'Normal' if shapiro_p > alpha else 'No Normal'
|
|
136
265
|
}
|
|
137
266
|
|
|
138
|
-
def calculate_confidence_intervals(self,
|
|
139
|
-
|
|
140
|
-
|
|
267
|
+
def calculate_confidence_intervals(self,
|
|
268
|
+
data: Union[pd.Series, np.ndarray, pd.DataFrame, str, Path],
|
|
269
|
+
column: Optional[str] = None,
|
|
270
|
+
confidence_level: float = 0.95,
|
|
271
|
+
method: str = 'parametric') -> dict:
|
|
141
272
|
"""
|
|
142
273
|
Calcula intervalos de confianza para la media
|
|
274
|
+
|
|
275
|
+
Parameters:
|
|
276
|
+
-----------
|
|
277
|
+
data : Series, ndarray, DataFrame, str o Path
|
|
278
|
+
Datos a analizar o ruta al archivo
|
|
279
|
+
column : str, optional
|
|
280
|
+
Columna a analizar
|
|
281
|
+
confidence_level : float
|
|
282
|
+
Nivel de confianza (default: 0.95)
|
|
283
|
+
method : str
|
|
284
|
+
'parametric' o 'bootstrap'
|
|
143
285
|
"""
|
|
286
|
+
# Resolver datos
|
|
287
|
+
data, source = self._resolve_data(data, column)
|
|
288
|
+
|
|
289
|
+
# Extraer array
|
|
290
|
+
if isinstance(data, pd.DataFrame):
|
|
291
|
+
if column is None:
|
|
292
|
+
raise ValueError("Debe especificar 'column' cuando data es DataFrame")
|
|
293
|
+
data = data[column]
|
|
294
|
+
|
|
144
295
|
if isinstance(data, pd.Series):
|
|
145
296
|
data_clean = data.dropna().values
|
|
146
297
|
else:
|
|
147
|
-
data_clean =
|
|
298
|
+
data_clean = np.array(data)
|
|
299
|
+
data_clean = data_clean[~np.isnan(data_clean)]
|
|
148
300
|
|
|
149
301
|
n = len(data_clean)
|
|
150
302
|
mean = np.mean(data_clean)
|
|
@@ -185,16 +337,20 @@ class UtilsStats:
|
|
|
185
337
|
'method': method
|
|
186
338
|
}
|
|
187
339
|
|
|
188
|
-
def detect_outliers(self,
|
|
189
|
-
|
|
190
|
-
|
|
340
|
+
def detect_outliers(self,
|
|
341
|
+
data: Union[pd.Series, np.ndarray, pd.DataFrame, str, Path],
|
|
342
|
+
column: Optional[str] = None,
|
|
343
|
+
method: Literal['iqr', 'zscore', 'isolation_forest'] = 'iqr',
|
|
344
|
+
**kwargs) -> np.ndarray:
|
|
191
345
|
"""
|
|
192
346
|
Detecta outliers usando diferentes métodos
|
|
193
347
|
|
|
194
348
|
Parameters:
|
|
195
349
|
-----------
|
|
196
|
-
data :
|
|
197
|
-
Datos a analizar
|
|
350
|
+
data : Series, ndarray, DataFrame, str o Path
|
|
351
|
+
Datos a analizar o ruta al archivo
|
|
352
|
+
column : str, optional
|
|
353
|
+
Columna a analizar
|
|
198
354
|
method : str
|
|
199
355
|
'iqr', 'zscore', o 'isolation_forest'
|
|
200
356
|
|
|
@@ -203,6 +359,15 @@ class UtilsStats:
|
|
|
203
359
|
np.ndarray
|
|
204
360
|
Array booleano indicando outliers
|
|
205
361
|
"""
|
|
362
|
+
# Resolver datos
|
|
363
|
+
data, source = self._resolve_data(data, column)
|
|
364
|
+
|
|
365
|
+
# Extraer array
|
|
366
|
+
if isinstance(data, pd.DataFrame):
|
|
367
|
+
if column is None:
|
|
368
|
+
raise ValueError("Debe especificar 'column' cuando data es DataFrame")
|
|
369
|
+
data = data[column]
|
|
370
|
+
|
|
206
371
|
if isinstance(data, pd.Series):
|
|
207
372
|
data = data.values
|
|
208
373
|
|
|
@@ -234,7 +399,7 @@ class UtilsStats:
|
|
|
234
399
|
return outliers
|
|
235
400
|
|
|
236
401
|
def calculate_effect_size(self, group1: np.ndarray, group2: np.ndarray,
|
|
237
|
-
|
|
402
|
+
method: Literal['cohen', 'hedges'] = 'cohen') -> dict:
|
|
238
403
|
"""
|
|
239
404
|
Calcula el tamaño del efecto entre dos grupos
|
|
240
405
|
"""
|
|
@@ -364,24 +529,25 @@ class UtilsStats:
|
|
|
364
529
|
|
|
365
530
|
return fig
|
|
366
531
|
|
|
367
|
-
def plot_distribution(self,
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
532
|
+
def plot_distribution(self,
|
|
533
|
+
data: Union[pd.DataFrame, pd.Series, np.ndarray, str, Path],
|
|
534
|
+
column: Optional[str] = None,
|
|
535
|
+
plot_type: Literal['hist', 'kde', 'box', 'violin', 'all'] = 'hist',
|
|
536
|
+
backend: Optional[Literal['matplotlib', 'seaborn', 'plotly']] = "seaborn",
|
|
537
|
+
bins: int = 30,
|
|
538
|
+
figsize: Optional[Tuple[int, int]] = None,
|
|
539
|
+
save_fig: Optional[bool] = None,
|
|
540
|
+
filename: Optional[str] = None,
|
|
541
|
+
**kwargs):
|
|
376
542
|
"""
|
|
377
543
|
Graficar distribución de una variable
|
|
378
544
|
|
|
379
545
|
Parameters:
|
|
380
546
|
-----------
|
|
381
|
-
data : DataFrame, Series o
|
|
382
|
-
Datos a graficar
|
|
547
|
+
data : DataFrame, Series, ndarray, str o Path
|
|
548
|
+
Datos a graficar o ruta al archivo
|
|
383
549
|
column : str, optional
|
|
384
|
-
Columna a graficar (si data es DataFrame)
|
|
550
|
+
Columna a graficar (si data es DataFrame o archivo)
|
|
385
551
|
plot_type : str
|
|
386
552
|
Tipo de gráfico
|
|
387
553
|
backend : str, optional
|
|
@@ -394,11 +560,19 @@ class UtilsStats:
|
|
|
394
560
|
Si guardar la figura
|
|
395
561
|
filename : str, optional
|
|
396
562
|
Nombre del archivo
|
|
563
|
+
|
|
564
|
+
Examples:
|
|
565
|
+
---------
|
|
566
|
+
>>> utils.plot_distribution("datos.csv", column="edad")
|
|
567
|
+
>>> utils.plot_distribution(df, column="salario", plot_type="all")
|
|
397
568
|
"""
|
|
398
569
|
backend = backend or self._plot_backend
|
|
399
570
|
figsize = figsize or self._default_figsize
|
|
400
571
|
save_fig = save_fig if save_fig is not None else self._save_fig
|
|
401
572
|
|
|
573
|
+
# Resolver datos
|
|
574
|
+
data, source = self._resolve_data(data, column)
|
|
575
|
+
|
|
402
576
|
# Extraer datos
|
|
403
577
|
if isinstance(data, pd.DataFrame):
|
|
404
578
|
if column is None:
|
|
@@ -478,20 +652,21 @@ class UtilsStats:
|
|
|
478
652
|
|
|
479
653
|
return fig
|
|
480
654
|
|
|
481
|
-
def plot_correlation_matrix(self,
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
655
|
+
def plot_correlation_matrix(self,
|
|
656
|
+
data: Union[pd.DataFrame, str, Path],
|
|
657
|
+
method: str = 'pearson',
|
|
658
|
+
backend: Optional[Literal['seaborn', 'plotly']] = None,
|
|
659
|
+
figsize: Optional[Tuple[int, int]] = None,
|
|
660
|
+
save_fig: Optional[bool] = None,
|
|
661
|
+
filename: Optional[str] = None,
|
|
662
|
+
**kwargs):
|
|
488
663
|
"""
|
|
489
664
|
Visualizar matriz de correlación
|
|
490
665
|
|
|
491
666
|
Parameters:
|
|
492
667
|
-----------
|
|
493
|
-
data : DataFrame
|
|
494
|
-
Datos para calcular correlación
|
|
668
|
+
data : DataFrame, str o Path
|
|
669
|
+
Datos para calcular correlación o ruta al archivo
|
|
495
670
|
method : str
|
|
496
671
|
'pearson', 'spearman' o 'kendall'
|
|
497
672
|
backend : str, optional
|
|
@@ -502,6 +677,12 @@ class UtilsStats:
|
|
|
502
677
|
save_fig = save_fig if save_fig is not None else self._save_fig
|
|
503
678
|
filename = filename or "matriz_correlacion"
|
|
504
679
|
|
|
680
|
+
# Resolver datos
|
|
681
|
+
data, source = self._resolve_data(data)
|
|
682
|
+
|
|
683
|
+
if not isinstance(data, pd.DataFrame):
|
|
684
|
+
raise ValueError("Se requiere un DataFrame para calcular matriz de correlación")
|
|
685
|
+
|
|
505
686
|
# Calcular matriz de correlación
|
|
506
687
|
corr_matrix = data.corr(method=method)
|
|
507
688
|
|
|
@@ -510,8 +691,8 @@ class UtilsStats:
|
|
|
510
691
|
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
|
|
511
692
|
|
|
512
693
|
sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f',
|
|
513
|
-
|
|
514
|
-
|
|
694
|
+
cmap='coolwarm', center=0, ax=ax,
|
|
695
|
+
square=True, linewidths=0.5, **kwargs)
|
|
515
696
|
ax.set_title(f'Matriz de Correlación ({method})', fontsize=14, pad=20)
|
|
516
697
|
plt.tight_layout()
|
|
517
698
|
|
|
@@ -553,21 +734,33 @@ class UtilsStats:
|
|
|
553
734
|
|
|
554
735
|
return fig
|
|
555
736
|
|
|
556
|
-
def plot_scatter_matrix(self,
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
737
|
+
def plot_scatter_matrix(self,
|
|
738
|
+
data: Union[pd.DataFrame, str, Path],
|
|
739
|
+
columns: Optional[List[str]] = None,
|
|
740
|
+
backend: Optional[Literal['seaborn', 'plotly', 'pandas']] = None,
|
|
741
|
+
figsize: Optional[Tuple[int, int]] = None,
|
|
742
|
+
save_fig: Optional[bool] = None,
|
|
743
|
+
filename: Optional[str] = None,
|
|
744
|
+
**kwargs):
|
|
563
745
|
"""
|
|
564
746
|
Matriz de gráficos de dispersión (pairplot)
|
|
747
|
+
|
|
748
|
+
Parameters:
|
|
749
|
+
-----------
|
|
750
|
+
data : DataFrame, str o Path
|
|
751
|
+
Datos o ruta al archivo
|
|
565
752
|
"""
|
|
566
753
|
backend = backend or self._plot_backend
|
|
567
754
|
figsize = figsize or self._default_figsize
|
|
568
755
|
save_fig = save_fig if save_fig is not None else self._save_fig
|
|
569
756
|
filename = filename or "scatter_matrix"
|
|
570
757
|
|
|
758
|
+
# Resolver datos
|
|
759
|
+
data, source = self._resolve_data(data)
|
|
760
|
+
|
|
761
|
+
if not isinstance(data, pd.DataFrame):
|
|
762
|
+
raise ValueError("Se requiere un DataFrame para matriz de dispersión")
|
|
763
|
+
|
|
571
764
|
if columns:
|
|
572
765
|
data = data[columns]
|
|
573
766
|
|
|
@@ -603,16 +796,23 @@ class UtilsStats:
|
|
|
603
796
|
# ============= GRÁFICOS CON INTERVALOS DE CONFIANZA =============
|
|
604
797
|
|
|
605
798
|
def plot_distribution_with_ci(self,
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
799
|
+
data: Union[pd.DataFrame, pd.Series, np.ndarray, str, Path],
|
|
800
|
+
column: Optional[str] = None,
|
|
801
|
+
confidence_level: float = 0.95,
|
|
802
|
+
ci_method: str = 'parametric',
|
|
803
|
+
bins: int = 30,
|
|
804
|
+
figsize: Optional[Tuple[int, int]] = None,
|
|
805
|
+
save_fig: Optional[bool] = None,
|
|
806
|
+
filename: Optional[str] = None,
|
|
807
|
+
**kwargs) -> plt.Figure:
|
|
808
|
+
"""
|
|
809
|
+
Distribución con intervalos de confianza
|
|
810
|
+
|
|
811
|
+
Ahora acepta rutas de archivos
|
|
812
|
+
"""
|
|
813
|
+
# Resolver datos
|
|
814
|
+
data, source = self._resolve_data(data, column)
|
|
815
|
+
|
|
616
816
|
# ======= PREPARACIÓN =======
|
|
617
817
|
if isinstance(data, pd.DataFrame):
|
|
618
818
|
if column is None:
|
|
@@ -630,7 +830,7 @@ class UtilsStats:
|
|
|
630
830
|
filename = filename or f"distribucion_ci_{data_name.lower().replace(' ', '_')}"
|
|
631
831
|
|
|
632
832
|
# Estadísticas
|
|
633
|
-
ci_result = self.calculate_confidence_intervals(data_array, confidence_level, ci_method)
|
|
833
|
+
ci_result = self.calculate_confidence_intervals(data_array, confidence_level=confidence_level, method=ci_method)
|
|
634
834
|
normality_result = self.check_normality(data_array)
|
|
635
835
|
|
|
636
836
|
# KDE
|
|
@@ -711,12 +911,12 @@ class UtilsStats:
|
|
|
711
911
|
|
|
712
912
|
|
|
713
913
|
def plot_multiple_distributions_with_ci(self,
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
914
|
+
data_dict: dict,
|
|
915
|
+
confidence_level: float = 0.95,
|
|
916
|
+
figsize: Optional[Tuple[int, int]] = None,
|
|
917
|
+
save_fig: Optional[bool] = None,
|
|
918
|
+
filename: Optional[str] = None,
|
|
919
|
+
**kwargs) -> plt.Figure:
|
|
720
920
|
"""
|
|
721
921
|
Grafica múltiples distribuciones con sus intervalos de confianza
|
|
722
922
|
"""
|
|
@@ -739,7 +939,7 @@ class UtilsStats:
|
|
|
739
939
|
data_array = data_array[~np.isnan(data_array)]
|
|
740
940
|
|
|
741
941
|
# Calcular estadísticas
|
|
742
|
-
ci_result = self.calculate_confidence_intervals(data_array, confidence_level)
|
|
942
|
+
ci_result = self.calculate_confidence_intervals(data_array, confidence_level=confidence_level)
|
|
743
943
|
|
|
744
944
|
# Gráfica izquierda: Distribución básica
|
|
745
945
|
ax1.hist(data_array, bins=30, alpha=0.7, color=colors[idx],
|
|
@@ -760,7 +960,7 @@ class UtilsStats:
|
|
|
760
960
|
|
|
761
961
|
ax2.axvline(ci_result['mean'], color='red', linestyle='-', linewidth=3)
|
|
762
962
|
ax2.axvspan(ci_result['ci_lower'], ci_result['ci_upper'],
|
|
763
|
-
|
|
963
|
+
alpha=0.3, color='orange')
|
|
764
964
|
ax2.axvline(ci_result['ci_lower'], color='orange', linestyle='--', linewidth=2)
|
|
765
965
|
ax2.axvline(ci_result['ci_upper'], color='orange', linestyle='--', linewidth=2)
|
|
766
966
|
|
|
@@ -779,11 +979,17 @@ class UtilsStats:
|
|
|
779
979
|
|
|
780
980
|
# ============= MÉTODOS UTILITARIOS ADICIONALES =============
|
|
781
981
|
|
|
782
|
-
def get_descriptive_stats(self,
|
|
783
|
-
|
|
982
|
+
def get_descriptive_stats(self,
|
|
983
|
+
data: Union[pd.DataFrame, pd.Series, np.ndarray, str, Path],
|
|
984
|
+
column: Optional[str] = None) -> dict:
|
|
784
985
|
"""
|
|
785
986
|
Obtiene estadísticas descriptivas completas
|
|
987
|
+
|
|
988
|
+
Ahora acepta rutas de archivos
|
|
786
989
|
"""
|
|
990
|
+
# Resolver datos
|
|
991
|
+
data, source = self._resolve_data(data, column)
|
|
992
|
+
|
|
787
993
|
if isinstance(data, pd.DataFrame):
|
|
788
994
|
if column is None:
|
|
789
995
|
raise ValueError("Debe especificar 'column' cuando data es DataFrame")
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
statslibx/__init__.py,sha256=gA9uNJ7Th8mJunugVps8UWgBNJtMeo_mHqU-QSkEXQE,1173
|
|
2
|
+
statslibx/descriptive.py,sha256=Hjti-Cs-7-SzrTb0k4s92c4nasLthVwhYU75GS56LAc,40124
|
|
3
|
+
statslibx/inferential.py,sha256=0lpVAp2SiKDgWkH3z3JoVFAjMaXW2VboxtA2vwPwq04,49947
|
|
4
|
+
statslibx/utils.py,sha256=qDqF_XgvEJbdQURA2v0gF0sw0nNQR4-MFXDvVTl_00s,68480
|
|
5
|
+
statslibx/datasets/__init__.py,sha256=wQ4p8hXIhJqV-msWzTvvnbv-l7jyWz5Rn3JZyMSYJ44,452
|
|
6
|
+
statslibx/datasets/course_completion.csv,sha256=jaqyxAh4YCsYuH5OFsjvGV7KUyM_7vQt6LgnqnNAFsI,22422135
|
|
7
|
+
statslibx/datasets/iris.csv,sha256=xSdC5QMVqZ-Vajg_rt91dVUmdfZAnvD5pHB23QhHmTA,3858
|
|
8
|
+
statslibx/datasets/penguins.csv,sha256=4HY2vYr3QmAJnqL4Z44uq7813vV5lAzHb2cGHuFsBsE,13478
|
|
9
|
+
statslibx/datasets/sp500_companies.csv,sha256=WKS72YOGnAbyLR6kD95fOpIYZt5oXGjPryyFVqLRF_k,803820
|
|
10
|
+
statslibx/datasets/titanic.csv,sha256=5seOS8ybyBMBCCWhgKZrsbu06m_OWyKtD9l0YXOImXU,29474
|
|
11
|
+
statslibx-0.1.6.dist-info/METADATA,sha256=7djbcDCGKwPIIjMnF3hjrsjpgeJFUYxEO9zrVTayUj0,2943
|
|
12
|
+
statslibx-0.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
13
|
+
statslibx-0.1.6.dist-info/top_level.txt,sha256=eeYZXyFm0hIjuI0ba3wF6XW938Mv9tv7Nk9qgjYfCtU,10
|
|
14
|
+
statslibx-0.1.6.dist-info/RECORD,,
|
statslibx-0.1.4.dist-info/RECORD
DELETED
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
statslibx/__init__.py,sha256=bPO1pSgkpLHNZ4BIz3U9R7MriyeMaIdR9VB-xrWCkaQ,1173
|
|
2
|
-
statslibx/descriptive.py,sha256=Hu7VuOGXs6oOq-zxQNiBKg7UtkNdNQ1Qy3PP-wEO5_k,36971
|
|
3
|
-
statslibx/inferential.py,sha256=BVBxEdLnNCw2yC-3s5fZ84oeJ8LqJYR_IJquPEiyiOk,48234
|
|
4
|
-
statslibx/utils.py,sha256=tdf1yZuR4fsmNq24ygv69BgCLzB0iE_x0ki1IV7Iwxs,60693
|
|
5
|
-
statslibx/datasets/__init__.py,sha256=BXP4ZIbPdaManpvDhoYavfX79pLffh81AUEfzSKXM6w,433
|
|
6
|
-
statslibx/datasets/iris.csv,sha256=xSdC5QMVqZ-Vajg_rt91dVUmdfZAnvD5pHB23QhHmTA,3858
|
|
7
|
-
statslibx/datasets/penguins.csv,sha256=4HY2vYr3QmAJnqL4Z44uq7813vV5lAzHb2cGHuFsBsE,13478
|
|
8
|
-
statslibx/datasets/sp500_companies.csv,sha256=WKS72YOGnAbyLR6kD95fOpIYZt5oXGjPryyFVqLRF_k,803820
|
|
9
|
-
statslibx/datasets/titanic.csv,sha256=5seOS8ybyBMBCCWhgKZrsbu06m_OWyKtD9l0YXOImXU,29474
|
|
10
|
-
statslibx-0.1.4.dist-info/METADATA,sha256=bZJ7IxGbZpnhUaMFj_B95S_-RRvBeQdq0jVbdyFW-yY,2943
|
|
11
|
-
statslibx-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
12
|
-
statslibx-0.1.4.dist-info/top_level.txt,sha256=eeYZXyFm0hIjuI0ba3wF6XW938Mv9tv7Nk9qgjYfCtU,10
|
|
13
|
-
statslibx-0.1.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|