PyPI - statslibx - Versions diffs - 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl - Mend

statslibx 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

statslibx/__init__.py +12 -8
statslibx/computacional.py +2 -0
statslibx/datasets/__init__.py +227 -54
statslibx/descriptive.py +8 -9
statslibx/inferential.py +746 -307
statslibx/preprocessing/__init__.py +12 -5
statslibx/probability.py +2 -0
statslibx/utils.py +112 -150
{statslibx-0.1.7.dist-info → statslibx-0.1.8.dist-info}/METADATA +17 -3
statslibx-0.1.8.dist-info/RECORD +15 -0
statslibx/datasets/course_completion.csv +0 -100001
statslibx/datasets/iris.csv +0 -151
statslibx/datasets/penguins.csv +0 -345
statslibx/datasets/sp500_companies.csv +0 -504
statslibx/datasets/titanic.csv +0 -419
statslibx-0.1.7.dist-info/RECORD +0 -18
{statslibx-0.1.7.dist-info → statslibx-0.1.8.dist-info}/WHEEL +0 -0
{statslibx-0.1.7.dist-info → statslibx-0.1.8.dist-info}/entry_points.txt +0 -0
{statslibx-0.1.7.dist-info → statslibx-0.1.8.dist-info}/top_level.txt +0 -0

statslibx/preprocessing/__init__.py CHANGED Viewed

@@ -169,26 +169,33 @@ class Preprocessing:
         column: str,
         method: str = "iqr"
     ) -> pd.DataFrame:
         if self._is_pandas():
             series = self.data[column]
         else:
             series = self.data[column].to_pandas()
+        # 2. Calcular la máscara según el método
         if method == "iqr":
             q1 = series.quantile(0.25)
             q3 = series.quantile(0.75)
             iqr = q3 - q1
-            mask = (series < q1 - 1.5 * iqr) | (series > q3 + 1.5 * iqr)
+            mask_values = (series < q1 - 1.5 * iqr) | (series > q3 + 1.5 * iqr)
         elif method == "zscore":
             z = (series - series.mean()) / series.std()
-            mask = z.abs() > 3
+            mask_values = z.abs() > 3
         else:
             raise ValueError("method must be 'iqr' or 'zscore'")
-        return self.data[mask]
+        outliers = self.data[mask_values.values]
+        # 4. Manejo de retorno profesional
+        if len(outliers) == 0:
+            print(f"No outliers found in column '{column}'")
+            return outliers
+        return outliers
     # ------------------------------------------------------------------
     # Data Quality Report

statslibx/probability.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ class ProbabilityStats:
2	+     pass

statslibx/utils.py CHANGED Viewed

@@ -398,11 +398,35 @@ class UtilsStats:
         return outliers
-    def calculate_effect_size(self, group1: np.ndarray, group2: np.ndarray,
-                                method: Literal['cohen', 'hedges'] = 'cohen') -> dict:
+    def calculate_effect_size(self,
+                            data: Union[pd.Series, np.ndarray, pd.DataFrame, str, Path] = None,
+                            group1: Union[str, pd.Series, np.ndarray] = None,
+                            group2: Union[str, pd.Series, np.ndarray] = None,
+                            method: Literal['cohen', 'hedges'] = 'cohen') -> dict:
         """
         Calcula el tamaño del efecto entre dos grupos
         """
+        # --- Preparar arrays ---
+        # Caso 1: data es DataFrame y group1/group2 son nombres de columna
+        if isinstance(data, pd.DataFrame):
+            group1 = np.array(data[group1])
+            group2 = np.array(data[group2])
+        # Caso 2: data no es None, y es una serie o array, usarlo como group1
+        elif isinstance(data, (pd.Series, np.ndarray)) and group2 is not None:
+            group1 = np.array(data)
+            group2 = np.array(group2)
+        # Caso 3: group1 y group2 ya son arrays o Series
+        else:
+            group1 = np.array(group1)
+            group2 = np.array(group2)
+        # Eliminar nan automáticamente
+        group1 = group1[~np.isnan(group1)]
+        group2 = group2[~np.isnan(group2)]
+        # --- Calcular estadísticas ---
         mean1, mean2 = np.mean(group1), np.mean(group2)
         std1, std2 = np.std(group1, ddof=1), np.std(group2, ddof=1)
         n1, n2 = len(group1), len(group2)
@@ -434,6 +458,7 @@ class UtilsStats:
             'pooled_std': pooled_std
         }
     # ============= MÉTODOS DE VISUALIZACIÓN COMPLETOS =============
     def _plot_distribution_seaborn(self, data, plot_type, bins, figsize, title, **kwargs):
@@ -528,6 +553,47 @@ class UtilsStats:
             plt.tight_layout()
         return fig
+    def _plot_distribution_plotly(self, data, plot_type, bins, title, **kwargs):
+        """Implementación con plotly"""
+        try:
+            import plotly.graph_objects as go
+            import plotly.express as px
+            from plotly.subplots import make_subplots
+        except ImportError:
+            raise ImportError("Plotly no está instalado. Instale con: pip install plotly")
+        if plot_type == 'all':
+            fig = make_subplots(
+                rows=2, cols=2,
+                subplot_titles=('Histograma', 'Box Plot', 'Violin Plot', 'Distribución Acumulada')
+            )
+            # Histograma
+            fig.add_trace(go.Histogram(x=data, nbinsx=bins, name='Histograma'), row=1, col=1)
+            # Box plot
+            fig.add_trace(go.Box(y=data, name='Box Plot'), row=1, col=2)
+            # Violin plot
+            fig.add_trace(go.Violin(y=data, name='Violin Plot'), row=2, col=1)
+            # Distribución acumulada
+            hist, bin_edges = np.histogram(data, bins=bins, density=True)
+            cdf = np.cumsum(hist * np.diff(bin_edges))
+            fig.add_trace(go.Scatter(x=bin_edges[1:], y=cdf, name='CDF'), row=2, col=2)
+        else:
+            if plot_type == 'hist':
+                fig = px.histogram(data, nbins=bins, title=title)
+            elif plot_type == 'box':
+                fig = px.box(y=data, title=title)
+            elif plot_type == 'violin':
+                fig = px.violin(y=data, title=title, box=True)
+            else:
+                fig = px.histogram(data, nbins=bins, title=title)
+        return fig
     def plot_distribution(self,
                             data: Union[pd.DataFrame, pd.Series, np.ndarray, str, Path],
@@ -536,7 +602,7 @@ class UtilsStats:
                             backend: Optional[Literal['matplotlib', 'seaborn', 'plotly']] = "seaborn",
                             bins: int = 30,
                             figsize: Optional[Tuple[int, int]] = None,
-                            save_fig: Optional[bool] = None,
+                            save_fig: Optional[bool] = False,
                             filename: Optional[str] = None,
                             **kwargs):
         """
@@ -568,7 +634,7 @@ class UtilsStats:
         """
         backend = backend or self._plot_backend
         figsize = figsize or self._default_figsize
-        save_fig = save_fig if save_fig is not None else self._save_fig
+        self._save_fig = save_fig
         # Resolver datos
         data, source = self._resolve_data(data, column)
@@ -605,59 +671,20 @@ class UtilsStats:
             if save_fig and backend != 'plotly':
                 self._save_figure(fig, filename)
-            return fig
+            if backend == 'plotly':
+                return fig
         except Exception as e:
             print(f"Error en plot_distribution: {e}")
             raise
-    def _plot_distribution_plotly(self, data, plot_type, bins, title, **kwargs):
-        """Implementación con plotly"""
-        try:
-            import plotly.graph_objects as go
-            import plotly.express as px
-            from plotly.subplots import make_subplots
-        except ImportError:
-            raise ImportError("Plotly no está instalado. Instale con: pip install plotly")
-        if plot_type == 'all':
-            fig = make_subplots(
-                rows=2, cols=2,
-                subplot_titles=('Histograma', 'Box Plot', 'Violin Plot', 'Distribución Acumulada')
-            )
-            # Histograma
-            fig.add_trace(go.Histogram(x=data, nbinsx=bins, name='Histograma'), row=1, col=1)
-            # Box plot
-            fig.add_trace(go.Box(y=data, name='Box Plot'), row=1, col=2)
-            # Violin plot
-            fig.add_trace(go.Violin(y=data, name='Violin Plot'), row=2, col=1)
-            # Distribución acumulada
-            hist, bin_edges = np.histogram(data, bins=bins, density=True)
-            cdf = np.cumsum(hist * np.diff(bin_edges))
-            fig.add_trace(go.Scatter(x=bin_edges[1:], y=cdf, name='CDF'), row=2, col=2)
-        else:
-            if plot_type == 'hist':
-                fig = px.histogram(data, nbins=bins, title=title)
-            elif plot_type == 'box':
-                fig = px.box(y=data, title=title)
-            elif plot_type == 'violin':
-                fig = px.violin(y=data, title=title, box=True)
-            else:
-                fig = px.histogram(data, nbins=bins, title=title)
-        return fig
     def plot_correlation_matrix(self,
                                 data: Union[pd.DataFrame, str, Path],
-                                method: str = 'pearson',
-                                backend: Optional[Literal['seaborn', 'plotly']] = None,
+                                method: Literal['pearson', 'kendall', 'spearman'] = 'pearson',
+                                backend: Optional[Literal['seaborn', 'plotly']] = "seaborn",
+                                triangular: Optional[bool] = False,
                                 figsize: Optional[Tuple[int, int]] = None,
-                                save_fig: Optional[bool] = None,
+                                save_fig: Optional[bool] = False,
                                 filename: Optional[str] = None,
                                 **kwargs):
         """
@@ -674,25 +701,32 @@ class UtilsStats:
         """
         backend = backend or self._plot_backend
         figsize = figsize or self._default_figsize
-        save_fig = save_fig if save_fig is not None else self._save_fig
+        self.save_fig = save_fig
         filename = filename or "matriz_correlacion"
         # Resolver datos
         data, source = self._resolve_data(data)
         if not isinstance(data, pd.DataFrame):
             raise ValueError("Se requiere un DataFrame para calcular matriz de correlación")
+        else:
+            data = data.select_dtypes(include=['float64', 'int64'])
         # Calcular matriz de correlación
         corr_matrix = data.corr(method=method)
         if backend == 'seaborn':
             fig, ax = plt.subplots(figsize=figsize)
-            mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
+            if triangular:
+                mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
-            sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f',
-                        cmap='coolwarm', center=0, ax=ax,
-                        square=True, linewidths=0.5, **kwargs)
+                sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f',
+                            cmap='coolwarm', center=0, ax=ax,
+                            square=True, linewidths=0.5, **kwargs)
+            else:
+                sns.heatmap(corr_matrix, annot=True, fmt='.2f',
+                            cmap='coolwarm', center=0, ax=ax,
+                            square=True, linewidths=0.5, **kwargs)
             ax.set_title(f'Matriz de Correlación ({method})', fontsize=14, pad=20)
             plt.tight_layout()
@@ -731,15 +765,15 @@ class UtilsStats:
                     print(f"✓ Figura Plotly guardada: {filepath}")
                 except Exception as e:
                     print(f"✗ Error guardando figura Plotly: {e}")
-        return fig
+        if backend == 'plotly':
+            return fig
     def plot_scatter_matrix(self,
                             data: Union[pd.DataFrame, str, Path],
                             columns: Optional[List[str]] = None,
                             backend: Optional[Literal['seaborn', 'plotly', 'pandas']] = None,
                             figsize: Optional[Tuple[int, int]] = None,
-                            save_fig: Optional[bool] = None,
+                            save_fig: Optional[bool] = False,
                             filename: Optional[str] = None,
                             **kwargs):
         """
@@ -752,7 +786,7 @@ class UtilsStats:
         """
         backend = backend or self._plot_backend
         figsize = figsize or self._default_figsize
-        save_fig = save_fig if save_fig is not None else self._save_fig
+        self.save_fig = save_fig
         filename = filename or "scatter_matrix"
         # Resolver datos
@@ -791,7 +825,8 @@ class UtilsStats:
                 except Exception as e:
                     print(f"✗ Error guardando figura Plotly: {e}")
-        return fig
+        if backend == 'plotly':
+            return fig
     # ============= GRÁFICOS CON INTERVALOS DE CONFIANZA =============
@@ -802,7 +837,7 @@ class UtilsStats:
                                 ci_method: str = 'parametric',
                                 bins: int = 30,
                                 figsize: Optional[Tuple[int, int]] = None,
-                                save_fig: Optional[bool] = None,
+                                save_fig: Optional[bool] = False,
                                 filename: Optional[str] = None,
                                 **kwargs) -> plt.Figure:
         """
@@ -838,7 +873,7 @@ class UtilsStats:
         x_range = np.linspace(data_array.min(), data_array.max(), 300)
         # ======= FIGURA =======
-        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize or (14, 6))
+        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=figsize or (14, 6))
         # ============================================================
         # PANEL 1: HISTOGRAMA + KDE
@@ -903,109 +938,37 @@ class UtilsStats:
         plt.tight_layout()
         # Guardado opcional
-        save_fig = save_fig if save_fig is not None else self._save_fig
+        self.save_fig = save_fig
         if save_fig:
             self._save_figure(fig, filename)
-        return fig
-    def plot_multiple_distributions_with_ci(self,
-                                            data_dict: dict,
-                                            confidence_level: float = 0.95,
-                                            figsize: Optional[Tuple[int, int]] = None,
-                                            save_fig: Optional[bool] = None,
-                                            filename: Optional[str] = None,
-                                            **kwargs) -> plt.Figure:
-        """
-        Grafica múltiples distribuciones con sus intervalos de confianza
-        """
-        n_distributions = len(data_dict)
-        fig, axes = plt.subplots(n_distributions, 2,
-                               figsize=figsize or (14, 5 * n_distributions))
-        if n_distributions == 1:
-            axes = axes.reshape(1, -1)
-        colors = plt.cm.Set3(np.linspace(0, 1, n_distributions))
-        for idx, (name, data) in enumerate(data_dict.items()):
-            ax1, ax2 = axes[idx]
-            if isinstance(data, pd.Series):
-                data_array = data.dropna().values
-            else:
-                data_array = np.array(data)
-                data_array = data_array[~np.isnan(data_array)]
-            # Calcular estadísticas
-            ci_result = self.calculate_confidence_intervals(data_array, confidence_level=confidence_level)
-            # Gráfica izquierda: Distribución básica
-            ax1.hist(data_array, bins=30, alpha=0.7, color=colors[idx],
-                    edgecolor='black', density=True)
-            kde = stats.gaussian_kde(data_array)
-            x_range = np.linspace(data_array.min(), data_array.max(), 200)
-            ax1.plot(x_range, kde(x_range), 'k-', linewidth=2)
-            ax1.axvline(ci_result['mean'], color='red', linestyle='--', linewidth=2)
-            ax1.set_title(f'{name}\nMedia: {ci_result["mean"]:.2f}')
-            ax1.grid(True, alpha=0.3)
-            # Gráfica derecha: Con intervalos de confianza
-            ax2.hist(data_array, bins=30, alpha=0.7, color=colors[idx],
-                    edgecolor='black', density=True)
-            ax2.plot(x_range, kde(x_range), 'k-', linewidth=2)
-            ax2.axvline(ci_result['mean'], color='red', linestyle='-', linewidth=3)
-            ax2.axvspan(ci_result['ci_lower'], ci_result['ci_upper'],
-                        alpha=0.3, color='orange')
-            ax2.axvline(ci_result['ci_lower'], color='orange', linestyle='--', linewidth=2)
-            ax2.axvline(ci_result['ci_upper'], color='orange', linestyle='--', linewidth=2)
-            ax2.set_title(f'{name} con IC {confidence_level*100}%')
-            ax2.grid(True, alpha=0.3)
-        plt.tight_layout()
-        # Guardar figura si está activado
-        save_fig = save_fig if save_fig is not None else self._save_fig
-        if save_fig:
-            filename = filename or "multiples_distribuciones_ci"
-            self._save_figure(fig, filename)
-        return fig
     # ============= MÉTODOS UTILITARIOS ADICIONALES =============
-    def get_descriptive_stats(self,
-                                data: Union[pd.DataFrame, pd.Series, np.ndarray, str, Path],
-                                column: Optional[str] = None) -> dict:
-        """
-        Obtiene estadísticas descriptivas completas
-        Ahora acepta rutas de archivos
-        """
-        # Resolver datos
-        data, source = self._resolve_data(data, column)
+    def get_descriptive_stats(self, data, column=None):
         if isinstance(data, pd.DataFrame):
             if column is None:
-                raise ValueError("Debe especificar 'column' cuando data es DataFrame")
+                raise ValueError("Debe especificarse una columna")
             data_series = data[column]
-        elif isinstance(data, pd.Series):
-            data_series = data
         else:
             data_series = pd.Series(data)
         data_clean = data_series.dropna()
+        if len(data_clean) == 0:
+            return {k: np.nan for k in [
+                'count','mean','median','mode','std','variance',
+                'min','max','q1','q3','iqr','skewness','kurtosis','range'
+            ]}
+        mode_result = stats.mode(data_clean, keepdims=False)
         return {
             'count': len(data_clean),
             'mean': np.mean(data_clean),
             'median': np.median(data_clean),
-            'mode': stats.mode(data_clean)[0][0] if len(data_clean) > 0 else np.nan,
+            'mode': mode_result.mode,
             'std': np.std(data_clean, ddof=1),
             'variance': np.var(data_clean, ddof=1),
             'min': np.min(data_clean),
@@ -1017,7 +980,6 @@ class UtilsStats:
             'kurtosis': stats.kurtosis(data_clean),
             'range': np.max(data_clean) - np.min(data_clean)
         }
     def help(self):
         """
         Muestra ayuda completa de la clase DescriptiveStats

{statslibx-0.1.7.dist-info → statslibx-0.1.8.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: statslibx
-Version: 0.1.7
+Version: 0.1.8
 Summary: StatsLibx - Librería de estadística descriptiva e inferencial
 Author-email: Emmanuel Ascendra Perez <ascendraemmanuel@gmail.com>
 License: MIT
@@ -24,12 +24,14 @@ Provides-Extra: advanced
 Requires-Dist: scikit-learn>=1.0; extra == "advanced"
 Requires-Dist: statsmodels>=0.13; extra == "advanced"
-# 📦 Descripción para PyPI (Plantilla Profesional)
+# 📦 StatsLibX
 StatsLibX es un paquete de Python diseñado para proporcionar una solución sencilla, eficiente y flexible para manejar volúmenes de datos.
 Este proyecto surge con la idea de ofrecer una alternativa moderna, intuitiva y ligera que permita a desarrolladores y entusiastas integrar la **estadística descriptiva e inferencial** sin complicaciones, con múltiples funcionalidades y utilidades pensadas para el futuro.
+GitHub del Proyecto: [StatsLibX](https://github.com/GhostAnalyst30/StatsLibX)
 ## ✨ Características principales
 - ⚡ Rápido y eficiente: optimizado para ofrecer un rendimiento suave incluso en tareas exigentes.
@@ -45,16 +47,28 @@ Este proyecto surge con la idea de ofrecer una alternativa moderna, intuitiva y
 ## 🚀 Ejemplo rápido
 ```python
 from statslibx import DescriptiveStats, InferentialStats, UtilsStats
+from statslibx.datasets import load_iris
+data = load_iris()
 stats = DescriptiveStats(data) # InferentialStats(data), UtilsStats()
-stats.help()
+stats.summary()
 ```
+Para ver más funciones: [how_use_statslibx.ipynb](https://github.com/GhostAnalyst30/StatsLibX/blob/main/how_use_statslibx.ipynb)
 ##  📦 Instalación
 ```bash
 pip install statslibx
 ```
+## 👩‍💻 ¡Úsalo en la terminal! (De forma preliminar)
+```bash
+statslibx describe .\archive.csv # Devuelve una descripción de los datos
+statslibx quality .\archive.csv # Devuelve la calidad de los datos
+statslibx preview .\archive.csv # Devuelve una visualización de los datos
+```
 ## 🤝 Contribuciones
 ¡Todas las mejoras e ideas son bienvenidas!

statslibx-0.1.8.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,15 @@
+statslibx/__init__.py,sha256=KeEoEZVPUR_PZACWoCpS_2l6luPbEee7VRlcrLgbKQQ,1490
+statslibx/cli.py,sha256=DqXaoP85n9xgLDlFnEkeqj-HJG0_IKX0uSqxRcHbzII,1122
+statslibx/computacional.py,sha256=Nv8wk67RUuuv15oBRu2XPp0_k7O4ZgmT51vThH2OuFk,35
+statslibx/descriptive.py,sha256=r5D4reP1Cdzsu1tSLmf2OEaFAkGvHSd3FIYfUclEaRU,60178
+statslibx/inferential.py,sha256=H0R6g3dJFk-53m1bKldrXObgk0SSmpcdqQg_tIgRKBI,79169
+statslibx/io.py,sha256=v7pxpmlEMeKyfXftl3WbkUtC9FOh1pymz7MmKPPNw98,493
+statslibx/probability.py,sha256=MUME4eXWzbdU93F-QdKwmmyd9IgZK1flFUYQHitp10o,33
+statslibx/utils.py,sha256=iJzt0jDacaoUfjtp4dU2PFuIBEheMP9Qrq-HnLTW_Qw,66515
+statslibx/datasets/__init__.py,sha256=GuUl_7-d6YanuDFht1dwB1bFrqjShvKh1m-iRYAbYZE,6875
+statslibx/preprocessing/__init__.py,sha256=ZwdwjBodxeOry-umJ__6yUSeubpRlZg41yve366ArkY,7395
+statslibx-0.1.8.dist-info/METADATA,sha256=uyhAd0xghADIfVee7WzDp76nLA2snjqQcNayio_UrIc,2835
+statslibx-0.1.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+statslibx-0.1.8.dist-info/entry_points.txt,sha256=bkCY7JDWNCZFE3I4sjgJ2oGrUgoBBbCbYmWkBAymT70,49
+statslibx-0.1.8.dist-info/top_level.txt,sha256=eeYZXyFm0hIjuI0ba3wF6XW938Mv9tv7Nk9qgjYfCtU,10
+statslibx-0.1.8.dist-info/RECORD,,

statslibx 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

statslibx 0.1.7py3-none-any.whl → 0.1.8py3-none-any.whl