statslibx 0.1.7__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- statslibx/__init__.py +12 -8
- statslibx/computacional.py +126 -0
- statslibx/datasets/__init__.py +243 -54
- statslibx/descriptive.py +80 -15
- statslibx/inferential.py +812 -312
- statslibx/preprocessing/__init__.py +12 -5
- statslibx/utils.py +183 -163
- {statslibx-0.1.7.dist-info → statslibx-0.2.0.dist-info}/METADATA +19 -5
- statslibx-0.2.0.dist-info/RECORD +19 -0
- {statslibx-0.1.7.dist-info → statslibx-0.2.0.dist-info}/WHEEL +1 -1
- statslibx-0.1.7.dist-info/RECORD +0 -18
- {statslibx-0.1.7.dist-info → statslibx-0.2.0.dist-info}/entry_points.txt +0 -0
- {statslibx-0.1.7.dist-info → statslibx-0.2.0.dist-info}/top_level.txt +0 -0
statslibx/preprocessing/__init__.py
CHANGED

@@ -169,26 +169,33 @@ class Preprocessing:
         column: str,
         method: str = "iqr"
     ) -> pd.DataFrame:
-
         if self._is_pandas():
             series = self.data[column]
         else:
            series = self.data[column].to_pandas()

+        # 2. Calcular la máscara según el método
         if method == "iqr":
             q1 = series.quantile(0.25)
             q3 = series.quantile(0.75)
             iqr = q3 - q1
-
+            mask_values = (series < q1 - 1.5 * iqr) | (series > q3 + 1.5 * iqr)

         elif method == "zscore":
             z = (series - series.mean()) / series.std()
-
-
+            mask_values = z.abs() > 3
         else:
             raise ValueError("method must be 'iqr' or 'zscore'")

-
+        outliers = self.data[mask_values.values]
+
+        # 4. Manejo de retorno profesional
+        if len(outliers) == 0:
+            print(f"No outliers found in column '{column}'")
+            return outliers
+
+        return outliers
+

     # ------------------------------------------------------------------
     # Data Quality Report
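The logic added in this hunk is the standard IQR and z-score outlier rules. A minimal standalone sketch of the same rules on a plain pandas Series (toy data and column values are illustrative, not taken from the package):

```python
import pandas as pd

def outlier_mask(series: pd.Series, method: str = "iqr") -> pd.Series:
    """Boolean mask marking outliers, mirroring the rules in the hunk above."""
    if method == "iqr":
        q1, q3 = series.quantile(0.25), series.quantile(0.75)
        iqr = q3 - q1
        # Points beyond 1.5 * IQR from the quartiles are flagged
        return (series < q1 - 1.5 * iqr) | (series > q3 + 1.5 * iqr)
    elif method == "zscore":
        z = (series - series.mean()) / series.std()
        # |z| > 3 is the cutoff used in the new code
        return z.abs() > 3
    raise ValueError("method must be 'iqr' or 'zscore'")

s = pd.Series([1, 2, 2, 3, 2, 50])      # toy data; 50 is an obvious outlier
print(s[outlier_mask(s, "iqr")])        # flags the value 50
```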
statslibx/utils.py
CHANGED
@@ -11,23 +11,81 @@ from pathlib import Path
 
 class UtilsStats:
     """
-
+    UtilsStats
+    A utility class for common statistical operations and visualization.
+    This class provides methods for data validation, basic statistical analysis,
+    and visualization of results. It also supports loading data directly from files.
+    >>> # Load data from a file
+    >>> data = utils.load_data("data.csv")
+    >>> utils.check_normality(data, column='age')
+    >>> # Analyze data from an array
+    Methods:
+    --------
+    _setup_plotting_style():
+        Configures default plotting styles for matplotlib.
 
-
-
+    set_plot_backend(backend: Literal['matplotlib', 'seaborn', 'plotly']):
+        Sets the default visualization backend.
 
-
-
-
-
-
-
-
-
-
-
+    set_default_figsize(figsize: Tuple[int, int]):
+        Sets the default figure size for plots.
+
+    set_save_fig_options(save_fig: Optional[bool] = False, fig_format: str = 'png',
+                         fig_dpi: int = 300, figures_dir: str = 'figures'):
+        Configures options for saving figures.
+
+    load_data(path: Union[str, Path], **kwargs) -> pd.DataFrame:
+        Loads data from a file in various formats (CSV, Excel, JSON, etc.).
+
+    validate_dataframe(data: Union[pd.DataFrame, np.ndarray, list, str, Path]) -> pd.DataFrame:
+        Validates and converts data to a DataFrame. Also accepts file paths.
+
+    format_number(num: float, decimals: int = 6, scientific: bool = False) -> str:
+        Formats a number with specified decimal places.
+
+    check_normality(data: Union[pd.Series, np.ndarray, pd.DataFrame, str, Path],
+                    column: Optional[str] = None, alpha: float = 0.05) -> dict:
+        Checks if the data follows a normal distribution using the Shapiro-Wilk test.
+
+    calculate_confidence_intervals(data: Union[pd.Series, np.ndarray, pd.DataFrame, str, Path],
+                                   column: Optional[str] = None, confidence_level: float = 0.95,
+        Calculates confidence intervals for the mean using parametric or bootstrap methods.
+
+    detect_outliers(data: Union[pd.Series, np.ndarray, pd.DataFrame, str, Path],
+                    column: Optional[str] = None, method: Literal['iqr', 'zscore', 'isolation_forest'] = 'iqr',
+        Detects outliers using different methods: 'iqr', 'zscore', or 'isolation_forest'.
+
+    calculate_effect_size(data: Union[pd.Series, np.ndarray, pd.DataFrame, str, Path] = None,
+        Calculates the effect size between two groups using Cohen's d or Hedges' g.
+
+    plot_distribution(data: Union[pd.DataFrame, pd.Series, np.ndarray, str, Path],
+                      column: Optional[str] = None, plot_type: Literal['hist', 'kde', 'box', 'violin', 'all'] = 'hist',
+                      bins: int = 30, figsize: Optional[Tuple[int, int]] = None,
+                      save_fig: Optional[bool] = False, filename: Optional[str] = None, **kwargs):
+        Plots the distribution of a variable using various plot types and backends.
+
+    plot_correlation_matrix(data: Union[pd.DataFrame, str, Path],
+                            filename: Optional[str] = None, **kwargs):
+        Visualizes the correlation matrix using a heatmap.
+
+    plot_scatter_matrix(data: Union[pd.DataFrame, str, Path],
+                        filename: Optional[str] = None, **kwargs):
+        Creates a scatter matrix (pairplot) for visualizing relationships between variables.
+
+    plot_distribution_with_ci(data: Union[pd.DataFrame, pd.Series, np.ndarray, str, Path],
+                              column: Optional[str] = None, confidence_level: float = 0.95,
+                              ci_method: str = 'parametric', bins: int = 30,
+                              filename: Optional[str] = None, **kwargs) -> plt.Figure:
+        Plots the distribution of a variable with confidence intervals.
+
+    get_descriptive_stats(data, column=None) -> dict:
+        Returns a dictionary of descriptive statistics for the given data.
+
+    help():
+        Displays a complete help guide for the UtilsStats class.
     """
 
+
     def __init__(self):
         """Inicializar la clase utilitaria"""
         self._plot_backend = 'seaborn'
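The doctest fragment in the new docstring implies usage along the following lines. This is a hedged sketch that assumes `UtilsStats` is instantiated with no arguments (as in its `__init__` above) and that `check_normality` returns a dict as documented; the file name is the one from the doctest and is purely illustrative:

```python
from statslibx import UtilsStats

utils = UtilsStats()

# Load a CSV into a DataFrame and test one column for normality
# (check_normality is documented to use the Shapiro-Wilk test)
data = utils.load_data("data.csv")
result = utils.check_normality(data, column="age", alpha=0.05)
print(result)  # expected to be a dict, per the docstring
```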
@@ -398,11 +456,35 @@ class UtilsStats:
 
         return outliers
 
-    def calculate_effect_size(self,
-
+    def calculate_effect_size(self,
+                              data: Union[pd.Series, np.ndarray, pd.DataFrame, str, Path] = None,
+                              group1: Union[str, pd.Series, np.ndarray] = None,
+                              group2: Union[str, pd.Series, np.ndarray] = None,
+                              method: Literal['cohen', 'hedges'] = 'cohen') -> dict:
         """
         Calcula el tamaño del efecto entre dos grupos
         """
+
+        # --- Preparar arrays ---
+        # Caso 1: data es DataFrame y group1/group2 son nombres de columna
+        if isinstance(data, pd.DataFrame):
+            group1 = np.array(data[group1])
+            group2 = np.array(data[group2])
+        # Caso 2: data no es None, y es una serie o array, usarlo como group1
+        elif isinstance(data, (pd.Series, np.ndarray)) and group2 is not None:
+            group1 = np.array(data)
+            group2 = np.array(group2)
+        # Caso 3: group1 y group2 ya son arrays o Series
+        else:
+            group1 = np.array(group1)
+            group2 = np.array(group2)
+
+        # Eliminar nan automáticamente
+        group1 = group1[~np.isnan(group1)]
+        group2 = group2[~np.isnan(group2)]
+
+
+        # --- Calcular estadísticas ---
         mean1, mean2 = np.mean(group1), np.mean(group2)
         std1, std2 = np.std(group1, ddof=1), np.std(group2, ddof=1)
         n1, n2 = len(group1), len(group2)

@@ -434,6 +516,7 @@ class UtilsStats:
             'pooled_std': pooled_std
         }
 
+
         # ============= MÉTODOS DE VISUALIZACIÓN COMPLETOS =============
 
     def _plot_distribution_seaborn(self, data, plot_type, bins, figsize, title, **kwargs):
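The hunk above only shows the preparation of the two groups and their means, sample standard deviations, and sizes; the `pooled_std` key in the returned dict suggests the usual pooled formulation. A hedged sketch of standard Cohen's d, not copied from the package (whose exact formula lies outside this hunk):

```python
import numpy as np

def cohens_d(group1, group2):
    """Standard Cohen's d with a pooled standard deviation (ddof=1 throughout)."""
    g1, g2 = np.asarray(group1, dtype=float), np.asarray(group2, dtype=float)
    n1, n2 = len(g1), len(g2)
    var1, var2 = np.var(g1, ddof=1), np.var(g2, ddof=1)
    pooled_std = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
    # Hedges' g would multiply this by a small-sample correction,
    # approximately 1 - 3 / (4 * (n1 + n2) - 9).
    return (np.mean(g1) - np.mean(g2)) / pooled_std

print(cohens_d([5, 6, 7, 8], [1, 2, 3, 4]))  # ~3.10 for this toy example
```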
@@ -528,6 +611,47 @@ class UtilsStats:
         plt.tight_layout()
 
         return fig
+
+    def _plot_distribution_plotly(self, data, plot_type, bins, title, **kwargs):
+        """Implementación con plotly"""
+        try:
+            import plotly.graph_objects as go
+            import plotly.express as px
+            from plotly.subplots import make_subplots
+        except ImportError:
+            raise ImportError("Plotly no está instalado. Instale con: pip install plotly")
+
+        if plot_type == 'all':
+            fig = make_subplots(
+                rows=2, cols=2,
+                subplot_titles=('Histograma', 'Box Plot', 'Violin Plot', 'Distribución Acumulada')
+            )
+
+            # Histograma
+            fig.add_trace(go.Histogram(x=data, nbinsx=bins, name='Histograma'), row=1, col=1)
+
+            # Box plot
+            fig.add_trace(go.Box(y=data, name='Box Plot'), row=1, col=2)
+
+            # Violin plot
+            fig.add_trace(go.Violin(y=data, name='Violin Plot'), row=2, col=1)
+
+            # Distribución acumulada
+            hist, bin_edges = np.histogram(data, bins=bins, density=True)
+            cdf = np.cumsum(hist * np.diff(bin_edges))
+            fig.add_trace(go.Scatter(x=bin_edges[1:], y=cdf, name='CDF'), row=2, col=2)
+
+        else:
+            if plot_type == 'hist':
+                fig = px.histogram(data, nbins=bins, title=title)
+            elif plot_type == 'box':
+                fig = px.box(y=data, title=title)
+            elif plot_type == 'violin':
+                fig = px.violin(y=data, title=title, box=True)
+            else:
+                fig = px.histogram(data, nbins=bins, title=title)
+
+        return fig
 
     def plot_distribution(self,
                           data: Union[pd.DataFrame, pd.Series, np.ndarray, str, Path],
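One non-obvious step in the added plotly path is building the cumulative-distribution trace from a density histogram. A minimal standalone sketch of that computation on toy data, outside the package:

```python
import numpy as np

data = np.random.default_rng(0).normal(size=500)

# Density histogram: bar heights integrate to 1 over the bin widths
hist, bin_edges = np.histogram(data, bins=30, density=True)

# Multiply each density by its bin width and accumulate:
# an empirical CDF evaluated at the right edge of each bin
cdf = np.cumsum(hist * np.diff(bin_edges))
print(cdf[-1])  # ~1.0, since the densities integrate to one
```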
@@ -536,7 +660,7 @@ class UtilsStats:
                           backend: Optional[Literal['matplotlib', 'seaborn', 'plotly']] = "seaborn",
                           bins: int = 30,
                           figsize: Optional[Tuple[int, int]] = None,
-                          save_fig: Optional[bool] =
+                          save_fig: Optional[bool] = False,
                           filename: Optional[str] = None,
                           **kwargs):
         """

@@ -568,7 +692,7 @@ class UtilsStats:
         """
         backend = backend or self._plot_backend
         figsize = figsize or self._default_figsize
-
+        self._save_fig = save_fig
 
         # Resolver datos
         data, source = self._resolve_data(data, column)

@@ -605,59 +729,20 @@ class UtilsStats:
             if save_fig and backend != 'plotly':
                 self._save_figure(fig, filename)
 
-
+            if backend == 'plotly':
+                return fig
 
         except Exception as e:
             print(f"Error en plot_distribution: {e}")
             raise
 
-    def _plot_distribution_plotly(self, data, plot_type, bins, title, **kwargs):
-        """Implementación con plotly"""
-        try:
-            import plotly.graph_objects as go
-            import plotly.express as px
-            from plotly.subplots import make_subplots
-        except ImportError:
-            raise ImportError("Plotly no está instalado. Instale con: pip install plotly")
-
-        if plot_type == 'all':
-            fig = make_subplots(
-                rows=2, cols=2,
-                subplot_titles=('Histograma', 'Box Plot', 'Violin Plot', 'Distribución Acumulada')
-            )
-
-            # Histograma
-            fig.add_trace(go.Histogram(x=data, nbinsx=bins, name='Histograma'), row=1, col=1)
-
-            # Box plot
-            fig.add_trace(go.Box(y=data, name='Box Plot'), row=1, col=2)
-
-            # Violin plot
-            fig.add_trace(go.Violin(y=data, name='Violin Plot'), row=2, col=1)
-
-            # Distribución acumulada
-            hist, bin_edges = np.histogram(data, bins=bins, density=True)
-            cdf = np.cumsum(hist * np.diff(bin_edges))
-            fig.add_trace(go.Scatter(x=bin_edges[1:], y=cdf, name='CDF'), row=2, col=2)
-
-        else:
-            if plot_type == 'hist':
-                fig = px.histogram(data, nbins=bins, title=title)
-            elif plot_type == 'box':
-                fig = px.box(y=data, title=title)
-            elif plot_type == 'violin':
-                fig = px.violin(y=data, title=title, box=True)
-            else:
-                fig = px.histogram(data, nbins=bins, title=title)
-
-        return fig
-
     def plot_correlation_matrix(self,
                                 data: Union[pd.DataFrame, str, Path],
-                                method:
-                                backend: Optional[Literal['seaborn', 'plotly']] =
+                                method: Literal['pearson', 'kendall', 'spearman'] = 'pearson',
+                                backend: Optional[Literal['seaborn', 'plotly']] = "seaborn",
+                                triangular: Optional[bool] = False,
                                 figsize: Optional[Tuple[int, int]] = None,
-                                save_fig: Optional[bool] =
+                                save_fig: Optional[bool] = False,
                                 filename: Optional[str] = None,
                                 **kwargs):
         """
@@ -674,25 +759,32 @@ class UtilsStats:
         """
         backend = backend or self._plot_backend
         figsize = figsize or self._default_figsize
-        save_fig = save_fig
+        self.save_fig = save_fig
         filename = filename or "matriz_correlacion"
-
+
         # Resolver datos
         data, source = self._resolve_data(data)
 
         if not isinstance(data, pd.DataFrame):
             raise ValueError("Se requiere un DataFrame para calcular matriz de correlación")
+        else:
+            data = data.select_dtypes(include=['float64', 'int64'])
 
         # Calcular matriz de correlación
         corr_matrix = data.corr(method=method)
 
         if backend == 'seaborn':
             fig, ax = plt.subplots(figsize=figsize)
-
+            if triangular:
+                mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
 
-
-
-
+                sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f',
+                            cmap='coolwarm', center=0, ax=ax,
+                            square=True, linewidths=0.5, **kwargs)
+            else:
+                sns.heatmap(corr_matrix, annot=True, fmt='.2f',
+                            cmap='coolwarm', center=0, ax=ax,
+                            square=True, linewidths=0.5, **kwargs)
             ax.set_title(f'Matriz de Correlación ({method})', fontsize=14, pad=20)
             plt.tight_layout()
 
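The new `triangular` option hides the redundant upper half of the symmetric correlation matrix. A hedged standalone sketch of the same numpy/seaborn pattern on a toy DataFrame (not from the package):

```python
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.DataFrame(np.random.default_rng(1).normal(size=(100, 4)),
                  columns=list("abcd"))
corr = df.corr(method="pearson")

# np.triu marks the upper triangle (including the diagonal) as masked out
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f",
            cmap="coolwarm", center=0, square=True, linewidths=0.5, ax=ax)
plt.tight_layout()
plt.show()
```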
@@ -731,15 +823,15 @@ class UtilsStats:
                 print(f"✓ Figura Plotly guardada: {filepath}")
             except Exception as e:
                 print(f"✗ Error guardando figura Plotly: {e}")
-
-
+        if backend == 'plotly':
+            return fig
 
     def plot_scatter_matrix(self,
                             data: Union[pd.DataFrame, str, Path],
                             columns: Optional[List[str]] = None,
                             backend: Optional[Literal['seaborn', 'plotly', 'pandas']] = None,
                             figsize: Optional[Tuple[int, int]] = None,
-                            save_fig: Optional[bool] =
+                            save_fig: Optional[bool] = False,
                             filename: Optional[str] = None,
                             **kwargs):
         """

@@ -752,7 +844,7 @@ class UtilsStats:
         """
         backend = backend or self._plot_backend
         figsize = figsize or self._default_figsize
-        save_fig = save_fig
+        self.save_fig = save_fig
         filename = filename or "scatter_matrix"
 
         # Resolver datos

@@ -791,7 +883,8 @@ class UtilsStats:
         except Exception as e:
             print(f"✗ Error guardando figura Plotly: {e}")
 
-
+        if backend == 'plotly':
+            return fig
 
     # ============= GRÁFICOS CON INTERVALOS DE CONFIANZA =============
 
@@ -802,7 +895,7 @@ class UtilsStats:
                                   ci_method: str = 'parametric',
                                   bins: int = 30,
                                   figsize: Optional[Tuple[int, int]] = None,
-                                  save_fig: Optional[bool] =
+                                  save_fig: Optional[bool] = False,
                                   filename: Optional[str] = None,
                                   **kwargs) -> plt.Figure:
         """

@@ -838,7 +931,7 @@ class UtilsStats:
         x_range = np.linspace(data_array.min(), data_array.max(), 300)
 
         # ======= FIGURA =======
-        fig, (ax1, ax2) = plt.subplots(
+        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=figsize or (14, 6))
 
         # ============================================================
         # PANEL 1: HISTOGRAMA + KDE
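For context, the `ci_method='parametric'` default documented earlier corresponds to the usual t-based interval for the mean. A hedged sketch of that computation with scipy, independent of the package's internals:

```python
import numpy as np
from scipy import stats

def parametric_ci(x, confidence_level=0.95):
    """t-based confidence interval for the mean: mean ± t * s / sqrt(n)."""
    x = np.asarray(x, dtype=float)
    x = x[~np.isnan(x)]
    n = len(x)
    mean = np.mean(x)
    sem = np.std(x, ddof=1) / np.sqrt(n)                     # standard error of the mean
    t_crit = stats.t.ppf(0.5 + confidence_level / 2, df=n - 1)
    return mean - t_crit * sem, mean + t_crit * sem

print(parametric_ci(np.random.default_rng(2).normal(10, 2, size=200)))
```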
@@ -903,109 +996,37 @@ class UtilsStats:
         plt.tight_layout()
 
         # Guardado opcional
-        save_fig = save_fig
+        self.save_fig = save_fig
         if save_fig:
             self._save_figure(fig, filename)
 
-        return fig
-
-
-    def plot_multiple_distributions_with_ci(self,
-                                            data_dict: dict,
-                                            confidence_level: float = 0.95,
-                                            figsize: Optional[Tuple[int, int]] = None,
-                                            save_fig: Optional[bool] = None,
-                                            filename: Optional[str] = None,
-                                            **kwargs) -> plt.Figure:
-        """
-        Grafica múltiples distribuciones con sus intervalos de confianza
-        """
-        n_distributions = len(data_dict)
-        fig, axes = plt.subplots(n_distributions, 2,
-                                 figsize=figsize or (14, 5 * n_distributions))
-
-        if n_distributions == 1:
-            axes = axes.reshape(1, -1)
-
-        colors = plt.cm.Set3(np.linspace(0, 1, n_distributions))
-
-        for idx, (name, data) in enumerate(data_dict.items()):
-            ax1, ax2 = axes[idx]
-
-            if isinstance(data, pd.Series):
-                data_array = data.dropna().values
-            else:
-                data_array = np.array(data)
-                data_array = data_array[~np.isnan(data_array)]
-
-            # Calcular estadísticas
-            ci_result = self.calculate_confidence_intervals(data_array, confidence_level=confidence_level)
-
-            # Gráfica izquierda: Distribución básica
-            ax1.hist(data_array, bins=30, alpha=0.7, color=colors[idx],
-                     edgecolor='black', density=True)
-
-            kde = stats.gaussian_kde(data_array)
-            x_range = np.linspace(data_array.min(), data_array.max(), 200)
-            ax1.plot(x_range, kde(x_range), 'k-', linewidth=2)
-            ax1.axvline(ci_result['mean'], color='red', linestyle='--', linewidth=2)
-
-            ax1.set_title(f'{name}\nMedia: {ci_result["mean"]:.2f}')
-            ax1.grid(True, alpha=0.3)
-
-            # Gráfica derecha: Con intervalos de confianza
-            ax2.hist(data_array, bins=30, alpha=0.7, color=colors[idx],
-                     edgecolor='black', density=True)
-            ax2.plot(x_range, kde(x_range), 'k-', linewidth=2)
-
-            ax2.axvline(ci_result['mean'], color='red', linestyle='-', linewidth=3)
-            ax2.axvspan(ci_result['ci_lower'], ci_result['ci_upper'],
-                        alpha=0.3, color='orange')
-            ax2.axvline(ci_result['ci_lower'], color='orange', linestyle='--', linewidth=2)
-            ax2.axvline(ci_result['ci_upper'], color='orange', linestyle='--', linewidth=2)
-
-            ax2.set_title(f'{name} con IC {confidence_level*100}%')
-            ax2.grid(True, alpha=0.3)
-
-        plt.tight_layout()
-
-        # Guardar figura si está activado
-        save_fig = save_fig if save_fig is not None else self._save_fig
-        if save_fig:
-            filename = filename or "multiples_distribuciones_ci"
-            self._save_figure(fig, filename)
-
-        return fig
 
     # ============= MÉTODOS UTILITARIOS ADICIONALES =============
 
-    def get_descriptive_stats(self,
-
-                              column: Optional[str] = None) -> dict:
-        """
-        Obtiene estadísticas descriptivas completas
-
-        Ahora acepta rutas de archivos
-        """
-        # Resolver datos
-        data, source = self._resolve_data(data, column)
-
+    def get_descriptive_stats(self, data, column=None):
+
         if isinstance(data, pd.DataFrame):
             if column is None:
-                raise ValueError("Debe
+                raise ValueError("Debe especificarse una columna")
             data_series = data[column]
-        elif isinstance(data, pd.Series):
-            data_series = data
         else:
             data_series = pd.Series(data)
-
+
         data_clean = data_series.dropna()
-
+
+        if len(data_clean) == 0:
+            return {k: np.nan for k in [
+                'count','mean','median','mode','std','variance',
+                'min','max','q1','q3','iqr','skewness','kurtosis','range'
+            ]}
+
+        mode_result = stats.mode(data_clean, keepdims=False)
+
         return {
             'count': len(data_clean),
             'mean': np.mean(data_clean),
             'median': np.median(data_clean),
-            'mode':
+            'mode': mode_result.mode,
             'std': np.std(data_clean, ddof=1),
             'variance': np.var(data_clean, ddof=1),
             'min': np.min(data_clean),

@@ -1017,7 +1038,6 @@ class UtilsStats:
             'kurtosis': stats.kurtosis(data_clean),
             'range': np.max(data_clean) - np.min(data_clean)
         }
-
     def help(self):
         """
         Muestra ayuda completa de la clase DescriptiveStats
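The rewritten `get_descriptive_stats` now guards against empty input and takes the mode from `scipy.stats.mode(..., keepdims=False)`. A hedged, trimmed-down sketch of the same idea with only a few of the keys (toy data, outside the package):

```python
import numpy as np
import pandas as pd
from scipy import stats

def quick_stats(values) -> dict:
    s = pd.Series(values).dropna()
    if len(s) == 0:
        # Mirror the empty-data guard: every statistic becomes NaN
        return {k: np.nan for k in ('count', 'mean', 'mode', 'std')}
    mode_result = stats.mode(s, keepdims=False)  # scalar result (SciPy 1.9+)
    return {
        'count': len(s),
        'mean': float(np.mean(s)),
        'mode': mode_result.mode,
        'std': float(np.std(s, ddof=1)),
    }

print(quick_stats([1, 2, 2, 3, np.nan]))  # count is 4 after dropping NaN; mode is 2
```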
{statslibx-0.1.7.dist-info → statslibx-0.2.0.dist-info}/METADATA
CHANGED

@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: statslibx
-Version: 0.
-Summary: StatsLibx - Librería de estadística descriptiva
+Version: 0.2.0
+Summary: StatsLibx - Librería de estadística descriptiva, inferencial y computacional
 Author-email: Emmanuel Ascendra Perez <ascendraemmanuel@gmail.com>
 License: MIT
 Classifier: Development Status :: 3 - Alpha

@@ -24,11 +24,13 @@ Provides-Extra: advanced
 Requires-Dist: scikit-learn>=1.0; extra == "advanced"
 Requires-Dist: statsmodels>=0.13; extra == "advanced"
 
-# 📦
+# 📦 StatsLibX
 
 StatsLibX es un paquete de Python diseñado para proporcionar una solución sencilla, eficiente y flexible para manejar volumenes de datos.
 
-Este proyecto surge con la idea de ofrecer una alternativa moderna, intuitiva y ligera que permita a desarrolladores y entusiastas integrar la **estadistica descriptiva
+Este proyecto surge con la idea de ofrecer una alternativa moderna, intuitiva y ligera que permita a desarrolladores y entusiastas integrar la **estadistica descriptiva, inferencial y computacional (En desarrollo)** sin complicaciones, con multiples funcionalidades y utilidades pensadas para el futuro.
+
+GitHub del Proyecto: [https://github.com/GhostAnalyst30/StatsLibX](https://github.com/GhostAnalyst30/StatsLibX)
 
 ## ✨ Características principales
 

@@ -45,16 +47,28 @@ Este proyecto surge con la idea de ofrecer una alternativa moderna, intuitiva y
 ## 🚀 Ejemplo rápido
 ```python
 from statslibx import DescriptiveStats, InferentialStats, UtilsStats
+from statslibx.datasets import load_iris()
+
+data = load_iris()
 
 stats = DescriptiveStats(data) # InferentialStats(data), UtilsStats()
-
+
+stats.summary()
 ```
+Para ver mas funciones: [https://github.com/GhostAnalyst30/StatsLibX/blob/main/how_use_statslibx.ipynb](https://github.com/GhostAnalyst30/StatsLibX/blob/main/how_use_statslibx.ipynb)
 
 ## 📦 Instalación
 ```bash
 pip install statslibx
 ```
 
+## 👩‍💻 ¡Usalo en la terminal! (De forma preliminar)
+```bash
+statslibx describe .\archive.csv # Devuelve una descripcion de la data
+statslibx quality .\archive.csv # Devuelve la calidad de los datos
+statslibx preview .\archive.csv # Devuelve una visualizacion de los datos
+```
+
 🤝 Contribuciones
 
 ¡Todas las mejoras e ideas son bienvenidas!
statslibx-0.2.0.dist-info/RECORD
ADDED

@@ -0,0 +1,19 @@
+statslibx/__init__.py,sha256=YUKUQhO1vUYvcUQmlz1ZtvU6MWNZERdAG55-trf25ZY,1500
+statslibx/cli.py,sha256=DqXaoP85n9xgLDlFnEkeqj-HJG0_IKX0uSqxRcHbzII,1122
+statslibx/computacional.py,sha256=z46bRUiH9a3ajxVTYE2sGO-pg20L87MdOKM3Y_Tcq44,4062
+statslibx/descriptive.py,sha256=GrUR4QfstUeLTXdxKSZsmKaOJkDso-QH51hlwTUaubA,63513
+statslibx/inferential.py,sha256=xiJCppezhWK4TrAARdOufuxjZcoGKsfHtRujKfuXbgg,83068
+statslibx/io.py,sha256=v7pxpmlEMeKyfXftl3WbkUtC9FOh1pymz7MmKPPNw98,493
+statslibx/utils.py,sha256=gWXduW8LMN1q4ZwNggmodRsT9Rcsot-S82NsQiqrjUo,69992
+statslibx/datasets/__init__.py,sha256=wiSp4qGwpILCiaN5vVuwWgKnbdELpbi5pxnNB9Wg2nI,7282
+statslibx/datasets/course_completion.csv,sha256=jaqyxAh4YCsYuH5OFsjvGV7KUyM_7vQt6LgnqnNAFsI,22422135
+statslibx/datasets/iris.csv,sha256=xSdC5QMVqZ-Vajg_rt91dVUmdfZAnvD5pHB23QhHmTA,3858
+statslibx/datasets/penguins.csv,sha256=4HY2vYr3QmAJnqL4Z44uq7813vV5lAzHb2cGHuFsBsE,13478
+statslibx/datasets/sp500_companies.csv,sha256=WKS72YOGnAbyLR6kD95fOpIYZt5oXGjPryyFVqLRF_k,803820
+statslibx/datasets/titanic.csv,sha256=5seOS8ybyBMBCCWhgKZrsbu06m_OWyKtD9l0YXOImXU,29474
+statslibx/preprocessing/__init__.py,sha256=ZwdwjBodxeOry-umJ__6yUSeubpRlZg41yve366ArkY,7395
+statslibx-0.2.0.dist-info/METADATA,sha256=w7f-3RgizY3PHUSxoBl6YuHImHz2qFyillhZk82WUfE,2993
+statslibx-0.2.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+statslibx-0.2.0.dist-info/entry_points.txt,sha256=bkCY7JDWNCZFE3I4sjgJ2oGrUgoBBbCbYmWkBAymT70,49
+statslibx-0.2.0.dist-info/top_level.txt,sha256=eeYZXyFm0hIjuI0ba3wF6XW938Mv9tv7Nk9qgjYfCtU,10
+statslibx-0.2.0.dist-info/RECORD,,
statslibx-0.1.7.dist-info/RECORD
DELETED
@@ -1,18 +0,0 @@
-statslibx/__init__.py,sha256=vXAOPdog5n_b64FRybiWI4VNA_eou7eQuZBcQiQz79E,1297
-statslibx/cli.py,sha256=DqXaoP85n9xgLDlFnEkeqj-HJG0_IKX0uSqxRcHbzII,1122
-statslibx/descriptive.py,sha256=UTb104Gho0uNeSALlukgrYwXrGMDwmIEy39-yvHuy8M,60184
-statslibx/inferential.py,sha256=0lpVAp2SiKDgWkH3z3JoVFAjMaXW2VboxtA2vwPwq04,49947
-statslibx/io.py,sha256=v7pxpmlEMeKyfXftl3WbkUtC9FOh1pymz7MmKPPNw98,493
-statslibx/utils.py,sha256=qDqF_XgvEJbdQURA2v0gF0sw0nNQR4-MFXDvVTl_00s,68480
-statslibx/datasets/__init__.py,sha256=HlOjJFalKVAycJEi7_J_OB7ss8jgSWpPQnsHTynt0uo,2273
-statslibx/datasets/course_completion.csv,sha256=jaqyxAh4YCsYuH5OFsjvGV7KUyM_7vQt6LgnqnNAFsI,22422135
-statslibx/datasets/iris.csv,sha256=xSdC5QMVqZ-Vajg_rt91dVUmdfZAnvD5pHB23QhHmTA,3858
-statslibx/datasets/penguins.csv,sha256=4HY2vYr3QmAJnqL4Z44uq7813vV5lAzHb2cGHuFsBsE,13478
-statslibx/datasets/sp500_companies.csv,sha256=WKS72YOGnAbyLR6kD95fOpIYZt5oXGjPryyFVqLRF_k,803820
-statslibx/datasets/titanic.csv,sha256=5seOS8ybyBMBCCWhgKZrsbu06m_OWyKtD9l0YXOImXU,29474
-statslibx/preprocessing/__init__.py,sha256=B6qI_KuqWf0FFnLLFafIaPOIM9ABo73InKCscSypdqI,7107
-statslibx-0.1.7.dist-info/METADATA,sha256=GN3chKZ7qSdoAKeD54rCxiwRoWk0wiFpLxHmxtc6Skc,2321
-statslibx-0.1.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-statslibx-0.1.7.dist-info/entry_points.txt,sha256=bkCY7JDWNCZFE3I4sjgJ2oGrUgoBBbCbYmWkBAymT70,49
-statslibx-0.1.7.dist-info/top_level.txt,sha256=eeYZXyFm0hIjuI0ba3wF6XW938Mv9tv7Nk9qgjYfCtU,10
-statslibx-0.1.7.dist-info/RECORD,,
{statslibx-0.1.7.dist-info → statslibx-0.2.0.dist-info}/entry_points.txt
File without changes

{statslibx-0.1.7.dist-info → statslibx-0.2.0.dist-info}/top_level.txt
File without changes