statslibx 0.1.6-py3-none-any.whl → 0.1.8-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- statslibx/__init__.py +15 -8
- statslibx/cli.py +47 -0
- statslibx/computacional.py +2 -0
- statslibx/datasets/__init__.py +236 -8
- statslibx/descriptive.py +502 -160
- statslibx/inferential.py +746 -307
- statslibx/io.py +21 -0
- statslibx/preprocessing/__init__.py +228 -0
- statslibx/probability.py +2 -0
- statslibx/utils.py +112 -150
- {statslibx-0.1.6.dist-info → statslibx-0.1.8.dist-info}/METADATA +27 -32
- statslibx-0.1.8.dist-info/RECORD +15 -0
- statslibx-0.1.8.dist-info/entry_points.txt +2 -0
- statslibx/datasets/course_completion.csv +0 -100001
- statslibx/datasets/iris.csv +0 -151
- statslibx/datasets/penguins.csv +0 -345
- statslibx/datasets/sp500_companies.csv +0 -504
- statslibx/datasets/titanic.csv +0 -419
- statslibx-0.1.6.dist-info/RECORD +0 -14
- {statslibx-0.1.6.dist-info → statslibx-0.1.8.dist-info}/WHEEL +0 -0
- {statslibx-0.1.6.dist-info → statslibx-0.1.8.dist-info}/top_level.txt +0 -0
statslibx/descriptive.py
CHANGED
@@ -1,5 +1,6 @@
 import numpy as np
 import pandas as pd
+import polars as pl
 from typing import Optional, Union, Literal, List
 from datetime import datetime
 import flet as ft
@@ -12,54 +13,83 @@ import plotly.express as px
 
 class DescriptiveStats:
     """
-
+    Class for univariate and multivariate descriptive statistics
    """
 
-    def __init__(self, data: Union[pd.DataFrame, np.ndarray],
-
+    def __init__(self, data: Union[pd.DataFrame, np.ndarray],
+                 sep: str = None,
+                 decimal: str = None,
+                 thousand: str = None,
+                 backend: Literal['pandas', 'polars'] = 'pandas'):
         """
-
+        # Initialize DataFrame
 
-        Parameters
-
-        data :
-
-
-
+        ## **Parameters:**
+
+        - **data** : Data to analyze
+        - **sep** : Column separator
+        - **decimal** : Decimal separator
+        - **thousand** : Thousand separator
+        - **backend** : 'pandas' or 'polars' for processing
+          (Proximamente estara habilitado polars para big data)
+
+        **Examples:**
+
+        ``Example 1:
+        stats = DescriptiveStats(data)
+        ``
         """
 
         if isinstance(data, str) and os.path.exists(data):
             data = DescriptiveStats.from_file(data).data
 
+        if isinstance(data, pl.DataFrame):
+            raise TypeError(
+                "Polars aún no soportado. Use pandas.DataFrame."
+            )
+
+
         if isinstance(data, np.ndarray):
             if data.ndim == 1:
                 data = pd.DataFrame({'var': data})
             else:
-                data = pd.DataFrame(data, columns=[f'var_{i}' for i in range(data.shape[1])])
+                data = pd.DataFrame(data, columns=[f'var_{i}' for i in range(data.shape[1])]) \
+                    if isinstance(data, pd.DataFrame) else pl.DataFrame(data, )
 
         self.data = data
         self.backend = backend
         self._numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
+        self.sep = sep
+        self.decimal = decimal
+        self.thousand = thousand
 
-    @
-    def from_file(path: str):
+    @classmethod
+    def from_file(self, path: str):
         """
         Carga automática de archivos y devuelve instancia de Intelligence.
         Soporta CSV, Excel, TXT, JSON, Parquet, Feather, TSV.
+        Automatic file upload and returns Intelligence instance.
+        Supports CSV, Excel, TXT, JSON, Parquet, Feather, TSV.
+
+        Parametros / Parameters:
+        ------------------------
+        path : str
+            Ruta del archivo
+            File path
         """
         if not os.path.exists(path):
-            raise FileNotFoundError(f"Archivo no encontrado: {path}")
+            raise FileNotFoundError(f"Archivo no encontrado / File not found: {path}")
 
         ext = os.path.splitext(path)[1].lower()
 
         if ext == ".csv":
-            df = pd.read_csv(path)
+            df = pd.read_csv(path, sep=self.sep, decimal=self.decimal, thousand=self.thousand)
 
         elif ext in [".xlsx", ".xls"]:
-            df = pd.read_excel(path)
+            df = pd.read_excel(path, decimal=self.decimal, thousand=self.thousand)
 
         elif ext in [".txt", ".tsv"]:
-            df = pd.read_table(path)
+            df = pd.read_table(path, sep=self.sep, decimal=self.decimal, thousand=self.thousand)
 
         elif ext == ".json":
             df = pd.read_json(path)
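The constructor signature added above accepts separator and locale options and stores them on the instance. A minimal usage sketch follows; the DataFrame contents and separator values are illustrative, not taken from the package's documentation. Note that pandas itself spells the thousands-separator keyword `thousands`, so the `thousand=` pass-through in `from_file` above is worth verifying against the released code.

    # Usage sketch based on the __init__ signature shown in the hunk above.
    # The data values and separators are hypothetical.
    import pandas as pd
    from statslibx.descriptive import DescriptiveStats

    df = pd.DataFrame({"edad": [23, 31, 45], "salario": [1200.5, 2100.0, 3150.7]})

    stats = DescriptiveStats(df)           # plain pandas DataFrame, default backend
    stats_eu = DescriptiveStats(
        df,
        sep=";",           # column separator, stored as self.sep
        decimal=",",       # decimal separator, stored as self.decimal
        thousand=".",      # thousands separator, stored as self.thousand
        backend="pandas",  # 'polars' input is still rejected with a TypeError (see above)
    )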
@@ -71,56 +101,124 @@ class DescriptiveStats:
             df = pd.read_feather(path)
 
         else:
-            raise ValueError(f"Formato no soportado: {ext}")
+            raise ValueError(f"Formato no soportado / Unsupported format: {ext}")
 
         return DescriptiveStats(df)
 
     # ============= MÉTODOS UNIVARIADOS =============
 
     def mean(self, column: Optional[str] = None) -> Union[float, pd.Series]:
-        """
+        """
+        Media aritmética / Arithmetic mean
+
+        Parametros / Parameters:
+        ------------------------
+        **column** : str
+            Nombre de la columna
+            Name of the column
+        """
         if column:
             return self.data[column].mean()
         return self.data[self._numeric_cols].mean()
 
     def median(self, column: Optional[str] = None) -> Union[float, pd.Series]:
-        """
+        """
+        Mediana / Median
+
+        Parametros / Parameters:
+        ------------------------
+        **column** : str
+            Nombre de la columna
+            Name of the column
+        """
         if column:
             return self.data[column].median()
         return self.data[self._numeric_cols].median()
 
     def mode(self, column: Optional[str] = None):
-        """
+        """
+        Moda / Mode
+
+        Parametros / Parameters:
+        ------------------------
+        column : str
+            Nombre de la columna
+            Name of the column
+        """
         if column:
             return self.data[column].mode()[0]
         return self.data[self._numeric_cols].mode().iloc[0]
 
     def variance(self, column: Optional[str] = None) -> Union[float, pd.Series]:
-        """
+        """
+        Varianza / Variance
+
+        Parametros / Parameters:
+        ------------------------
+        column : str
+            Nombre de la columna
+            Name of the column
+        """
         if column:
             return self.data[column].var()
         return self.data[self._numeric_cols].var()
 
     def std(self, column: Optional[str] = None) -> Union[float, pd.Series]:
-        """
+        """
+        Desviación estándar / Standard deviation
+
+        Parametros / Parameters:
+        ------------------------
+        column : str
+            Nombre de la columna
+            Name of the column
+
+        """
         if column:
             return self.data[column].std()
         return self.data[self._numeric_cols].std()
 
     def skewness(self, column: Optional[str] = None) -> Union[float, pd.Series]:
-        """
+        """
+        Asimetría / Asymmetry
+
+        Parametros / Parameters:
+        ------------------------
+        column : str
+            Nombre de la columna
+            Name of the column
+        """
         if column:
             return self.data[column].skew()
         return self.data[self._numeric_cols].skew()
 
     def kurtosis(self, column: Optional[str] = None) -> Union[float, pd.Series]:
-        """
+        """
+        Curtosis / Kurtosis
+
+        Parametros / Parameters:
+        ------------------------
+        column : str
+            Nombre de la columna
+            Name of the column
+        """
         if column:
             return self.data[column].kurtosis()
         return self.data[self._numeric_cols].kurtosis()
 
     def quantile(self, q: Union[float, List[float]], column: Optional[str] = None):
-        """
+        """
+        Cuantiles - Percentiles / Quantiles - Percentiles
+
+        Parametros / Parameters:
+        ------------------------
+        q : float / List[float]
+            Cuantiles a calcular
+            Quantiles to calculate
+        column : str
+            Nombre de la columna
+            Name of the column
+        """
         if column:
             return self.data[column].quantile(q)
         return self.data[self._numeric_cols].quantile(q)
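The univariate methods documented above are thin wrappers over the corresponding pandas reductions. A short, self-contained call sketch (the column name `var` comes from the 1-D ndarray handling in `__init__` shown earlier):

    # Illustrative calls for the univariate wrappers above; assumes statslibx is installed.
    import numpy as np
    from statslibx.descriptive import DescriptiveStats

    datos = np.random.default_rng(0).normal(loc=0, scale=1, size=1000)
    stats = DescriptiveStats(datos)   # a 1-D array becomes a single column named 'var'

    print(stats.mean("var"), stats.median("var"), stats.mode("var"))
    print(stats.std("var"), stats.variance("var"))
    print(stats.skewness("var"), stats.kurtosis("var"))
    print(stats.quantile([0.25, 0.5, 0.75], "var"))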
@@ -128,16 +226,19 @@ class DescriptiveStats:
     def outliers(self, column: str, method: Literal['iqr', 'zscore'] = 'iqr',
                  threshold: float = 1.5) -> pd.Series:
         """
-        Detectar outliers en una columna
+        Detectar outliers en una columna / Detecting outliers in a column
+
 
-        Parameters:
-
+        Parametros / Parameters:
+        ------------------------
         column : str
             Nombre de la columna
+            Name of the column
         method : str
             'iqr' o 'zscore'
         threshold : float
             1.5 para IQR, 3 para zscore típicamente
+            1.5 for IQR, 3 for zscore typically
         """
         col_data = self.data[column]
 
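The body of `outliers()` is mostly outside this hunk; for reference, this is the standard interquartile-range rule that the `'iqr'` option and the default `threshold=1.5` refer to, written as an independent pandas sketch rather than the package's own code:

    # Standard IQR rule for flagging outliers (illustration only, plain pandas).
    import pandas as pd

    def iqr_outlier_mask(s: pd.Series, threshold: float = 1.5) -> pd.Series:
        q1, q3 = s.quantile(0.25), s.quantile(0.75)
        iqr = q3 - q1
        lower, upper = q1 - threshold * iqr, q3 + threshold * iqr
        return (s < lower) | (s > upper)   # boolean mask, same index as the input

    edades = pd.Series([22, 25, 27, 29, 31, 90])
    print(iqr_outlier_mask(edades))        # only the value 90 is flagged at threshold=1.5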
@@ -157,22 +258,31 @@ class DescriptiveStats:
     # ============= MÉTODOS MULTIVARIADOS =============
 
     def correlation(self, method: Literal['pearson', 'spearman', 'kendall'] = 'pearson',
-
+                    columns: Optional[List[str]] = None) -> pd.DataFrame:
         """
-        Matriz de correlación
+        Matriz de correlación / Correlation matrix
 
-        Parameters:
-
+        Parametros / Parameters:
+        ------------------------
         method : str
             'pearson', 'spearman' o 'kendall'
         columns : list, optional
             Lista de columnas a incluir
+            List of columns to include
         """
         data_subset = self.data[columns] if columns else self.data[self._numeric_cols]
         return data_subset.corr(method=method)
 
     def covariance(self, columns: Optional[List[str]] = None) -> pd.DataFrame:
-        """
+        """
+        Matriz de covarianza
+
+        Parametros / Parameters:
+        ------------------------
+        columns: list, optional
+            Lista de columnas a incluir
+            List of columns to include
+        """
         data_subset = self.data[columns] if columns else self.data[self._numeric_cols]
         return data_subset.cov()
 
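A usage sketch for the multivariate helpers documented above (the column names and values are illustrative):

    # Illustrative only; mirrors the correlation()/covariance() signatures above.
    import pandas as pd
    from statslibx.descriptive import DescriptiveStats

    df = pd.DataFrame({
        "edad":        [23, 31, 45, 52],
        "salario":     [1200, 2100, 3100, 4000],
        "experiencia": [1, 5, 15, 25],
    })
    stats = DescriptiveStats(df)

    print(stats.correlation(method="spearman"))           # rank-based correlation matrix
    print(stats.covariance(columns=["edad", "salario"]))  # covariance of a column subset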
@@ -182,14 +292,16 @@ class DescriptiveStats:
                 show_plot: bool = False,
                 plot_backend: str = 'seaborn') -> 'DescriptiveSummary':
         """
-        Resumen completo de estadísticas descriptivas
+        Resumen completo de estadísticas descriptivas / Complete descriptive statistics summary
 
-        Parameters:
-
+        Parametros / Parameters:
+        ------------------------
         columns : list, optional
             Columnas específicas a resumir
+            Specific columns to summarize
         show_plot : bool
             Si mostrar gráficos
+            If to show graphics
         plot_backend : str
             'seaborn', 'plotly' o 'matplotlib'
         """
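`summary()` returns a `DescriptiveSummary` object whose conversion helpers are listed in the help text added later in this diff. A sketch of a typical call; the `"compact"` format string is an assumption based on the Wide/Long/Compact list in that help text, not a confirmed API value:

    # Sketch of summary(); the to_dataframe() argument is an assumed format name.
    import pandas as pd
    from statslibx.descriptive import DescriptiveStats

    stats = DescriptiveStats(pd.DataFrame({"edad": [23, 31, 45, 52],
                                           "salario": [1200, 2100, 3100, 4000]}))
    resumen = stats.summary(columns=["edad", "salario"], show_plot=False)
    print(resumen)
    print(resumen.to_dataframe("compact"))   # assumed; see the format list in help()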
@@ -227,8 +339,28 @@ class DescriptiveStats:
                           plot_backend: str = 'seaborn',
                           handle_missing: Literal['drop', 'error', 'warn'] = 'drop') -> tuple:
         """
-        Regresión lineal simple o múltiple con opción de mostrar gráfico
-
+        Regresión lineal simple o múltiple con opción de mostrar gráfico / Simple or multiple \
+        linear regression with option to show graph
+
+        Parametros / Parameters:
+        ------------------------
+        X: str, list, optional
+            Nombre de la variable independiente
+
+        y: str
+            Nombre de la variable dependiente
+
+        engine: str
+            Motor de la regresion
+
+        fit_intercept: bool
+            Intercepto de la regresion
+
+        show_plot: bool
+            Visualizar la regresion (recomendable, solo [X,y])
+
+        handle_missing:
+            'drop', 'error' o 'warn'
         """
         if isinstance(X, str):
             X = [X]
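A call sketch consistent with the expanded `linear_regression()` docstring above (column names and data are hypothetical; `engine='statsmodels'` requires statsmodels to be installed):

    # Multiple regression call matching the documented parameters (illustrative data).
    import pandas as pd
    from statslibx.descriptive import DescriptiveStats

    df = pd.DataFrame({"salario": [1200, 2100, 3100, 4000, 5200],
                       "experiencia": [1, 5, 10, 18, 25],
                       "edad": [22, 28, 35, 44, 51]})
    stats = DescriptiveStats(df)

    modelo = stats.linear_regression(
        y="salario",
        X=["experiencia", "edad"],
        engine="statsmodels",
        fit_intercept=True,
        show_plot=False,        # plotting is recommended only for the simple [X, y] case
        handle_missing="drop",
    )
    print(modelo.summary())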
@@ -258,25 +390,29 @@ class DescriptiveStats:
         result.fit()
         result.show_plot = show_plot
         result.plot_backend = plot_backend
-
-        figura = None
-        # Graficar si es regresión simple
-        if show_plot and len(X) == 1 and plot_backend.lower() == 'seaborn':
-            import matplotlib.pyplot as plt
-            g = sns.lmplot(x=X[0], y=y, data=regression_data, ci=None)
-            g.figure.suptitle(f"Regresión lineal: {y} ~ {X[0]}", y=1.02)
-            plt.tight_layout()
-            figura = g.figure
-
-        return result, figura
+        return result
 
 
 
-    def help(self):
+    def help(self, lang="es-ES"):
         """
         Muestra ayuda completa de la clase DescriptiveStats
+
+        Parametros / Parameters:
+        ------------------------
+        lang: str
+            Idioma Usuario: Codigo de Idioma (es-Es) o "Español"
+            User Language: Languaje Code (en-Us) or "English"
         """
-
+        if lang in ["en-US", "English", "english"]:
+            lang = "en-US"
+        else:
+            lang = "es-ES"
+        help_text = " "
+
+        match lang:
+            case "es-ES":
+                help_text = """
 ╔════════════════════════════════════════════════════════════════════════════╗
 ║                  📊 CLASE DescriptiveStats - AYUDA COMPLETA                  ║
 ╚════════════════════════════════════════════════════════════════════════════╝
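Two call-site-visible changes land in the hunk above: `linear_regression()` no longer returns a `(result, figura)` tuple, and `help()` gains a `lang` parameter. A migration sketch (illustrative data):

    # 0.1.6: result, figura = stats.linear_regression(y="salario", X="experiencia", show_plot=True)
    # 0.1.8: a single result object is returned, even when show_plot=True.
    import pandas as pd
    from statslibx.descriptive import DescriptiveStats

    stats = DescriptiveStats(pd.DataFrame({"salario": [1200, 2100, 3100, 4000],
                                           "experiencia": [1, 5, 10, 18]}))
    modelo = stats.linear_regression(y="salario", X="experiencia", show_plot=True)
    print(modelo.summary())

    stats.help(lang="en-US")   # English help text (added in the next hunk)
    stats.help()               # default: Spanish, lang="es-ES"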
@@ -364,122 +500,328 @@ class DescriptiveStats:
 
 💡 EJEMPLOS DE USO:
 
-   (92 blank lines of the 0.1.6 help text removed here)
+┌─ Ejemplo 1: Inicialización ─────────────────────────────────────────────┐
+│ import pandas as pd
+│ from descriptive import DescriptiveStats
+│
+│ # Con DataFrame
+│ df = pd.read_csv('datos.csv')
+│ stats = DescriptiveStats(df)
+│
+│ # Con array numpy
+│ import numpy as np
+│ datos = np.random.normal(0, 1, 1000)
+│ stats = DescriptiveStats(datos)
+└─────────────────────────────────────────────────────────────────────────┘
+
+┌─ Ejemplo 2: Análisis Univariado ────────────────────────────────────────┐
+│ # Estadísticas de una columna
+│ media = stats.mean('edad')
+│ mediana = stats.median('edad')
+│ desv_est = stats.std('edad')
+│
+│ # Cuartiles
+│ q25 = stats.quantile(0.25, 'edad')
+│ q75 = stats.quantile(0.75, 'edad')
+│
+│ # Detectar outliers
+│ outliers_mask = stats.outliers('edad', method='iqr', threshold=1.5)
+│ print(f"Outliers detectados: {outliers_mask.sum()}")
+└─────────────────────────────────────────────────────────────────────────┘
+
+┌─ Ejemplo 3: Resumen Completo ───────────────────────────────────────────┐
+│ # Resumen de todas las variables numéricas
+│ resumen = stats.summary()
+│ print(resumen)
+│
+│ # Resumen de columnas específicas con visualización
+│ resumen = stats.summary(
+│     columns=['edad', 'salario', 'experiencia'],
+│     show_plot=True,
+│     plot_backend='seaborn'
+│ )
+└─────────────────────────────────────────────────────────────────────────┘
+
+┌─ Ejemplo 4: Análisis Multivariado ──────────────────────────────────────┐
+│ # Matriz de correlación
+│ corr_pearson = stats.correlation(method='pearson')
+│ corr_spearman = stats.correlation(method='spearman')
+│
+│ # Matriz de covarianza
+│ cov_matrix = stats.covariance()
+│
+│ # Correlación entre variables específicas
+│ corr_subset = stats.correlation(
+│     method='pearson',
+│     columns=['edad', 'salario', 'experiencia']
+│ )
+└─────────────────────────────────────────────────────────────────────────┘
+
+┌─ Ejemplo 5: Regresión Lineal Simple ────────────────────────────────────┐
+│ # Regresión simple: salario ~ experiencia
+│ modelo = stats.linear_regression(
+│     y='salario',
+│     X='experiencia',
+│     engine='statsmodels',
+│     show_plot=True
+│ )
+│
+│ # Ver resultados
+│ print(modelo.summary())
+│
+│ # Acceder a coeficientes
+│ print(f"Intercepto: {modelo.intercept_}")
+│ print(f"Pendiente: {modelo.coef_[0]}")
+│ print(f"R²: {modelo.r_squared}")
+└─────────────────────────────────────────────────────────────────────────┘
+
+┌─ Ejemplo 6: Regresión Lineal Múltiple ──────────────────────────────────┐
+│ # Regresión múltiple: salario ~ experiencia + edad + educacion
+│ modelo = stats.linear_regression(
+│     y='salario',
+│     X=['experiencia', 'edad', 'educacion'],
+│     engine='statsmodels',
+│     fit_intercept=True,
+│     handle_missing='drop'
+│ )
+│
+│ print(modelo.summary())
+│
+│ # Hacer predicciones
+│ import numpy as np
+│ X_nuevo = np.array([[5, 30, 16], [10, 35, 18]])  # experiencia, edad
+│ predicciones = modelo.predict(X_nuevo)
+└─────────────────────────────────────────────────────────────────────────┘
 
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 
 🎯 CARACTERÍSTICAS CLAVE:
 
-
-
-
-
-
-
-
-
+✓ Análisis univariado completo
+✓ Análisis multivariado (correlación, covarianza)
+✓ Detección de outliers con múltiples métodos
+✓ Regresión lineal con statsmodels o scikit-learn
+✓ Manejo automático de valores faltantes
+✓ Soporte para pandas DataFrame y numpy arrays
+✓ Salidas formateadas profesionales
+✓ Visualizaciones opcionales
 
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 
 📚 DOCUMENTACIÓN ADICIONAL:
-
-
+   Para más información sobre métodos específicos, use:
+   help(DescriptiveStats.nombre_metodo)
 
 ╚════════════════════════════════════════════════════════════════════════════╝
 """
-
+            case "en-US":
+                help_text = """
+╔════════════════════════════════════════════════════════════════════════════╗
+║                  📊 CLASS DescriptiveStats - COMPLETE HELP                   ║
+╚════════════════════════════════════════════════════════════════════════════╝
+
+📝 DESCRIPTION:
+   Class for univariate and multivariate descriptive statistical analysis.
+   Provides tools for exploratory data analysis, measures of
+   central tendency, dispersion, shape of distribution and linear regression.
+
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+
+📋 MAIN METHODS:
+
+┌────────────────────────────────────────────────────────────────────────────┐
+│ 1. 📊 UNIVARIATE STATISTICS
+└────────────────────────────────────────────────────────────────────────────┘
+
+🔹 Measures of Central Tendency:
+   • .mean(column=None)        → Arithmetic mean
+   • .median(column=None)      → Median (center value)
+   • .mode(column=None)        → Mode (most frequent value)
+
+🔹 Dispersion Measurements:
+   • .std(column=None)         → Standard deviation
+   • .variance(column=None)    → Variance
+   • .quantile(q, column=None) → Quantiles/Percentiles
+
+🔹 Shape Measurements:
+   • .skewness(column=None)    → Asymmetry (bias)
+   • .kurtosis(column=None)    → Kurtosis (pointing)
+
+🔹 Outlier Detection:
+   • .outliers(column, method='iqr', threshold=1.5)
+     Methods: 'iqr' (interquartile range) or 'zscore' (z-score)
+
+┌────────────────────────────────────────────────────────────────────────────┐
+│ 2. 🔗 MULTIVARIATE STATISTICS
+└────────────────────────────────────────────────────────────────────────────┘
+
+🔹 .correlation(method='pearson', columns=None)
+   Correlation matrix between variables
+   Methods: 'pearson', 'spearman', 'kendall'
+
+🔹 .covariance(columns=None)
+   Covariance matrix between variables
+
+┌────────────────────────────────────────────────────────────────────────────┐
+│ 3. 📋 COMPLETE SUMMARY
+└────────────────────────────────────────────────────────────────────────────┘
+
+🔹 .summary(columns=None, show_plot=False, plot_backend='seaborn')
+   Complete descriptive summary with all statistics
+
+   Includes: count, mean, median, mode, dev. est., variance,
+   minimum, Q1, Q3, maximum, IQR, skewness, kurtosis
+
+🔹 .summary().to_dataframe(format)
+   Format:
+   - Wide
+   - Long
+   - Compact
+🔹 .summary().to_categorical_summary()
+🔹 .summary().to_styled_df()
+
+
+┌────────────────────────────────────────────────────────────────────────────┐
+│ 4. 📈 LINEAR REGRESSION
+└────────────────────────────────────────────────────────────────────────────┘
+
+🔹 .linear_regression(y, X, engine='statsmodels',
+                      fit_intercept=True, show_plot=False,
+                      plot_backend='seaborn', handle_missing='drop')
+
+   Simple or multiple linear regression with full analysis
 
+   Parameters:
+   X : Independent variable(s) (str or list)
+   y: Dependent variable (str)
+   engine: 'statsmodels' or 'scikit-learn'
+   fit_intercept : Include intercept (bool)
+   show_plot : Show diagnostic plots (bool)
+   handle_missing : 'drop', 'error', 'warn'
+
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+
+💡 EXAMPLES OF USE:
+
+┌─ Example 1: Initialization ─────────────────────────────────────────────┐
+│ import pandas as pd
+│ from statslibx.descriptive import DescriptiveStats
+│ from statslibx.datasets import load_dataset
+│
+│ # With DataFrame
+│ df = load_dataset('datos.csv')
+│ stats = DescriptiveStats(df)
+│
+│ # With array numpy
+│ import numpy as np
+│ datos = np.random.normal(0, 1, 1000)
+│ stats = DescriptiveStats(datos)
+└─────────────────────────────────────────────────────────────────────────┘
+
+┌─ Example 2: Univariate Analysis ────────────────────────────────────────┐
+│ # Statistics of a column
+│ mean = stats.mean('edad')
+│ median = stats.median('edad')
+│ desv_est = stats.std('edad')
+│
+│ # Quartiles
+│ q25 = stats.quantile(0.25, 'edad')
+│ q75 = stats.quantile(0.75, 'edad')
+│
+│ # To detect outsolves
+│ outliers_mask = stats.outliers('edad', method='iqr', threshold=1.5)
+│ print(f"Outliers detected: {outliers_mask.sum()}")
+└─────────────────────────────────────────────────────────────────────────┘
+
+┌─ Example 3: Complete Summary ───────────────────────────────────────────┐
+│ # Summary of all numerical variables
+│ summary = stats.summary()
+│ print(summary)
+│
+│ # Resumen de columnas específicas con visualización
+│ resumen = stats.summary(
+│     columns=['edad', 'salario', 'experiencia'],
+│     show_plot=True,
+│     plot_backend='seaborn'
+│ )
+└─────────────────────────────────────────────────────────────────────────┘
+
+┌─ Ejemplo 4: Análisis Multivariado ──────────────────────────────────────┐
+│ # Matriz de correlación
+│ corr_pearson = stats.correlation(method='pearson')
+│ corr_spearman = stats.correlation(method='spearman')
+│
+│ # Matriz de covarianza
+│ cov_matrix = stats.covariance()
+│
+│ # Correlación entre variables específicas
+│ corr_subset = stats.correlation(
+│     method='pearson',
+│     columns=['edad', 'salario', 'experiencia']
+│ )
+└──────────────────────────────────────────────────────────────────────────┘
+
+┌─ Ejemplo 5: Regresión Lineal Simple ────────────────────────────────────┐
+│ # Regresión simple: salario ~ experiencia
+│ modelo = stats.linear_regression(
+│     y='salario',
+│     X='experiencia',
+│     engine='statsmodels',
+│     show_plot=True
+│ )
+│
+│ # Ver resultados
+│ print(modelo.summary())
+│
+│ # Acceder a coeficientes
+│ print(f"Intercepto: {modelo.intercept_}")
+│ print(f"Pendiente: {modelo.coef_[0]}")
+│ print(f"R²: {modelo.r_squared}")
+└──────────────────────────────────────────────────────────────────────────┘
+
+┌─ Ejemplo 6: Regresión Lineal Múltiple ──────────────────────────────────┐
+│ # Regresión múltiple: salario ~ experiencia + edad + educacion
+│ modelo = stats.linear_regression(
+│     y='salario',
+│     X=['experiencia', 'edad', 'educacion'],
+│     engine='statsmodels',
+│     fit_intercept=True,
+│     handle_missing='drop'
+│ )
+│
+│ print(modelo.summary())
+│
+│ # Hacer predicciones
+│ import numpy as np
+│ X_nuevo = np.array([[5, 30, 16], [10, 35, 18]])  # experiencia, edad
+│ predicciones = modelo.predict(X_nuevo)
+└──────────────────────────────────────────────────────────────────────────┘
+
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+
+🎯 CARACTERÍSTICAS CLAVE:
+
+✓ Análisis univariado completo
+✓ Análisis multivariado (correlación, covarianza)
+✓ Detección de outliers con múltiples métodos
+✓ Regresión lineal con statsmodels o scikit-learn
+✓ Manejo automático de valores faltantes
+✓ Soporte para pandas DataFrame y numpy arrays
+✓ Salidas formateadas profesionales
+✓ Visualizaciones opcionales
+
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+
+📚 DOCUMENTACIÓN ADICIONAL:
+   Para más información sobre métodos específicos, use:
+   help(DescriptiveStats.nombre_metodo)
+
+╚════════════════════════════════════════════════════════════════════════════╝
+"""
+
+        print(help_text)
+
+
 class DescriptiveSummary:
     """Clase para formatear salida de estadística descriptiva"""
 
@@ -586,10 +928,10 @@ class DescriptiveSummary:
         Formato compacto: Variables en filas, estadísticas en columnas.
 
         Ejemplo:
-
-        Var1
-        Var2
-        Var3
+              count  mean  median  mode  std  variance ...
+        Var1  150.0   5.8     5.8   5.0  0.8      0.68 ...
+        Var2  150.0   3.1     3.0   3.0  0.4      0.19 ...
+        Var3  150.0   3.8     4.0   1.0  1.8      3.11 ...
         """
         df_data = []
 
@@ -618,7 +960,7 @@ class DescriptiveSummary:
         Formato largo: Una fila por cada combinación variable-estadística.
 
         Ejemplo:
-
+            Variable Estadistica   Valor
         0       Var1       count  150.00
         1       Var1        mean    5.84
         2       Var1      median    5.80