statslibx 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
statslibx/io.py ADDED
@@ -0,0 +1,21 @@
1
+ import pandas as pd
2
+ import polars as pl
3
+ from pathlib import Path
4
+
5
+
6
def load_file(path: str):
    """Load a tabular data file into a pandas DataFrame.

    Supported formats, selected by file extension (case-insensitive):
    ``.csv``, ``.json``, and ``.txt``/``.tsv`` (read as tab-separated).

    Args:
        path: Path to the file on disk.

    Returns:
        pd.DataFrame with the file contents.

    Raises:
        FileNotFoundError: If the path does not exist.
        ValueError: If the extension is not one of the supported types.
    """
    path = Path(path)

    if not path.exists():
        raise FileNotFoundError(f"{path} not found")

    # Normalize the extension so ".CSV", ".Json", etc. are accepted too.
    suffix = path.suffix.lower()

    if suffix == ".csv":
        return pd.read_csv(path)

    if suffix == ".json":
        return pd.read_json(path)

    if suffix in {".txt", ".tsv"}:
        return pd.read_csv(path, sep="\t")

    raise ValueError(f"Unsupported file type: {path.suffix}")
@@ -0,0 +1,228 @@
1
+ from typing import Optional, Union, List, Dict, Any
2
+ import pandas as pd
3
+ import polars as pl
4
+ import numpy as np
5
+
6
+
7
class Preprocessing:
    """Preprocessing helper that works on pandas *and* polars DataFrames.

    Offers inspection (nulls, uniqueness), description, transformation
    (fill, normalize, standardize), filtering, outlier detection and a
    data-quality report. Mutating methods return ``self`` for chaining.
    """

    def __init__(self, data: "Union[pd.DataFrame, pl.DataFrame]"):
        if not isinstance(data, (pd.DataFrame, pl.DataFrame)):
            raise TypeError("data must be a pandas or polars DataFrame")
        self.data = data

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _is_pandas(self) -> bool:
        """True when the wrapped frame is a pandas DataFrame."""
        return isinstance(self.data, pd.DataFrame)

    def _is_polars(self) -> bool:
        """True when the wrapped frame is a polars DataFrame."""
        return isinstance(self.data, pl.DataFrame)

    def _count_nulls(self, column: str) -> int:
        """Number of null/NaN entries in *column*."""
        if self._is_pandas():
            return int(self.data[column].isna().sum())
        return int(self.data[column].null_count())

    def _get_columns(self, columns):
        """Normalize a column selection (None, str, or list) to a list."""
        if columns is None:
            return list(self.data.columns)
        if isinstance(columns, str):
            return [columns]
        return columns

    # ------------------------------------------------------------------
    # Inspection
    # ------------------------------------------------------------------

    def detect_nulls(
        self,
        columns: Optional[Union[str, List[str]]] = None
    ) -> pd.DataFrame:
        """Per-column null counts and percentages.

        Returns a pandas DataFrame with columns: column, nulls,
        non_nulls, null_pct.
        """
        columns = self._get_columns(columns)
        total = self.data.shape[0]

        rows = []
        for col in columns:
            nulls = self._count_nulls(col)
            rows.append({
                "column": col,
                "nulls": nulls,
                "non_nulls": total - nulls,
                # Guard against ZeroDivisionError on an empty frame.
                "null_pct": nulls / total if total else 0.0
            })

        return pd.DataFrame(rows)

    def check_uniqueness(self) -> pd.DataFrame:
        """Number of distinct values per column, as a pandas DataFrame."""
        if self._is_pandas():
            unique = self.data.nunique()
            return pd.DataFrame({
                "column": unique.index,
                "unique_values": unique.values
            })

        unique = self.data.select(pl.all().n_unique())
        return unique.to_pandas().melt(
            var_name="column",
            value_name="unique_values"
        )

    def preview_data(self, n: int = 5):
        """First *n* rows of the wrapped frame (same backend as input)."""
        return self.data.head(n)

    # ------------------------------------------------------------------
    # Description
    # ------------------------------------------------------------------

    def describe_numeric(self):
        """Summary statistics for numeric columns only."""
        if self._is_pandas():
            return self.data.select_dtypes(include=np.number).describe()

        # The previous pl.all().filter(pl.col(pl.NUMERIC)) filtered rows
        # (and pl.NUMERIC does not exist); select numeric dtypes instead.
        # NOTE(review): pl.NUMERIC_DTYPES availability depends on the
        # installed polars version — confirm against the pinned version.
        return self.data.select(pl.col(pl.NUMERIC_DTYPES)).describe()

    def describe_categorical(self):
        """Summary statistics for string/object columns only."""
        if self._is_pandas():
            return self.data.select_dtypes(include="object").describe()

        # Select string-typed columns; pl.all().filter(...) filtered rows.
        return self.data.select(pl.col(pl.Utf8)).describe()

    # ------------------------------------------------------------------
    # Transformations
    # ------------------------------------------------------------------

    def fill_nulls(
        self,
        fill_with: Any,
        columns: Optional[Union[str, List[str]]] = None
    ):
        """Fill nulls in *columns* (default: all) with *fill_with*.

        Returns self for chaining.
        """
        columns = self._get_columns(columns)

        if self._is_pandas():
            self.data[columns] = self.data[columns].fillna(fill_with)

        else:
            self.data = self.data.with_columns([
                pl.col(col).fill_null(fill_with) for col in columns
            ])

        return self

    def normalize(self, column: str):
        """Min-max scale *column* to [0, 1] in place. Returns self."""
        if self._is_pandas():
            col = self.data[column]
            self.data[column] = (col - col.min()) / (col.max() - col.min())
        else:
            self.data = self.data.with_columns(
                ((pl.col(column) - pl.col(column).min()) /
                 (pl.col(column).max() - pl.col(column).min()))
                .alias(column)
            )
        return self

    def standardize(self, column: str):
        """Z-score *column* (sample std) in place. Returns self."""
        if self._is_pandas():
            col = self.data[column]
            self.data[column] = (col - col.mean()) / col.std()
        else:
            self.data = self.data.with_columns(
                ((pl.col(column) - pl.col(column).mean()) /
                 pl.col(column).std())
                .alias(column)
            )
        return self

    # ------------------------------------------------------------------
    # Filtering
    # ------------------------------------------------------------------

    def filter_rows(self, condition):
        """Keep rows matching *condition* (pandas boolean indexer or
        polars expression, depending on backend). Returns self."""
        if self._is_pandas():
            self.data = self.data.loc[condition]
        else:
            self.data = self.data.filter(condition)
        return self

    def filter_columns(self, columns: List[str]):
        """Keep only *columns*. Returns self."""
        if self._is_pandas():
            self.data = self.data[columns]
        else:
            self.data = self.data.select(columns)
        return self

    def rename_columns(self, mapping: Dict[str, str]):
        """Rename columns via an old-name -> new-name mapping. Returns self."""
        if self._is_pandas():
            self.data = self.data.rename(columns=mapping)
        else:
            self.data = self.data.rename(mapping)
        return self

    # ------------------------------------------------------------------
    # Outliers
    # ------------------------------------------------------------------

    def detect_outliers(
        self,
        column: str,
        method: str = "iqr"
    ) -> pd.DataFrame:
        """Return the rows whose *column* value is an outlier.

        method: 'iqr' (outside the 1.5*IQR fences) or 'zscore' (|z| > 3).
        Raises ValueError for any other method.
        """
        if self._is_pandas():
            series = self.data[column]
        else:
            series = self.data[column].to_pandas()

        # Build the boolean outlier mask according to the chosen method.
        if method == "iqr":
            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            iqr = q3 - q1
            mask_values = (series < q1 - 1.5 * iqr) | (series > q3 + 1.5 * iqr)

        elif method == "zscore":
            z = (series - series.mean()) / series.std()
            mask_values = z.abs() > 3
        else:
            raise ValueError("method must be 'iqr' or 'zscore'")

        if self._is_pandas():
            outliers = self.data[mask_values.values]
        else:
            # polars frames do not support NumPy boolean-mask indexing;
            # wrap the mask in a Series and use filter().
            outliers = self.data.filter(pl.Series(mask_values.values))

        if len(outliers) == 0:
            print(f"No outliers found in column '{column}'")

        return outliers

    # ------------------------------------------------------------------
    # Data Quality Report
    # ------------------------------------------------------------------

    def data_quality(self) -> pd.DataFrame:
        """One-row-per-column quality report: dtype, nulls, null_pct,
        unique_values and completeness_pct, as a pandas DataFrame."""
        total_rows = self.data.shape[0]
        rows = []

        for col in self.data.columns:
            nulls = self._count_nulls(col)

            if self._is_pandas():
                dtype = str(self.data[col].dtype)
                unique = self.data[col].nunique()
            else:
                dtype = str(self.data.schema[col])
                unique = self.data[col].n_unique()

            # Guard against ZeroDivisionError on an empty frame.
            null_pct = nulls / total_rows if total_rows else 0.0

            rows.append({
                "column": col,
                "dtype": dtype,
                "nulls": nulls,
                "null_pct": null_pct,
                "unique_values": unique,
                "completeness_pct": 1 - null_pct
            })

        return pd.DataFrame(rows)
228
+
@@ -0,0 +1,2 @@
1
class ProbabilityStats:
    """Placeholder for probability-related statistics; not yet implemented."""
    pass
statslibx/utils.py CHANGED
@@ -398,11 +398,35 @@ class UtilsStats:
398
398
 
399
399
  return outliers
400
400
 
401
- def calculate_effect_size(self, group1: np.ndarray, group2: np.ndarray,
402
- method: Literal['cohen', 'hedges'] = 'cohen') -> dict:
401
+ def calculate_effect_size(self,
402
+ data: Union[pd.Series, np.ndarray, pd.DataFrame, str, Path] = None,
403
+ group1: Union[str, pd.Series, np.ndarray] = None,
404
+ group2: Union[str, pd.Series, np.ndarray] = None,
405
+ method: Literal['cohen', 'hedges'] = 'cohen') -> dict:
403
406
  """
404
407
  Calcula el tamaño del efecto entre dos grupos
405
408
  """
409
+
410
+ # --- Preparar arrays ---
411
+ # Caso 1: data es DataFrame y group1/group2 son nombres de columna
412
+ if isinstance(data, pd.DataFrame):
413
+ group1 = np.array(data[group1])
414
+ group2 = np.array(data[group2])
415
+ # Caso 2: data no es None, y es una serie o array, usarlo como group1
416
+ elif isinstance(data, (pd.Series, np.ndarray)) and group2 is not None:
417
+ group1 = np.array(data)
418
+ group2 = np.array(group2)
419
+ # Caso 3: group1 y group2 ya son arrays o Series
420
+ else:
421
+ group1 = np.array(group1)
422
+ group2 = np.array(group2)
423
+
424
+ # Eliminar nan automáticamente
425
+ group1 = group1[~np.isnan(group1)]
426
+ group2 = group2[~np.isnan(group2)]
427
+
428
+
429
+ # --- Calcular estadísticas ---
406
430
  mean1, mean2 = np.mean(group1), np.mean(group2)
407
431
  std1, std2 = np.std(group1, ddof=1), np.std(group2, ddof=1)
408
432
  n1, n2 = len(group1), len(group2)
@@ -434,6 +458,7 @@ class UtilsStats:
434
458
  'pooled_std': pooled_std
435
459
  }
436
460
 
461
+
437
462
  # ============= MÉTODOS DE VISUALIZACIÓN COMPLETOS =============
438
463
 
439
464
  def _plot_distribution_seaborn(self, data, plot_type, bins, figsize, title, **kwargs):
@@ -528,6 +553,47 @@ class UtilsStats:
528
553
  plt.tight_layout()
529
554
 
530
555
  return fig
556
+
557
+ def _plot_distribution_plotly(self, data, plot_type, bins, title, **kwargs):
558
+ """Implementación con plotly"""
559
+ try:
560
+ import plotly.graph_objects as go
561
+ import plotly.express as px
562
+ from plotly.subplots import make_subplots
563
+ except ImportError:
564
+ raise ImportError("Plotly no está instalado. Instale con: pip install plotly")
565
+
566
+ if plot_type == 'all':
567
+ fig = make_subplots(
568
+ rows=2, cols=2,
569
+ subplot_titles=('Histograma', 'Box Plot', 'Violin Plot', 'Distribución Acumulada')
570
+ )
571
+
572
+ # Histograma
573
+ fig.add_trace(go.Histogram(x=data, nbinsx=bins, name='Histograma'), row=1, col=1)
574
+
575
+ # Box plot
576
+ fig.add_trace(go.Box(y=data, name='Box Plot'), row=1, col=2)
577
+
578
+ # Violin plot
579
+ fig.add_trace(go.Violin(y=data, name='Violin Plot'), row=2, col=1)
580
+
581
+ # Distribución acumulada
582
+ hist, bin_edges = np.histogram(data, bins=bins, density=True)
583
+ cdf = np.cumsum(hist * np.diff(bin_edges))
584
+ fig.add_trace(go.Scatter(x=bin_edges[1:], y=cdf, name='CDF'), row=2, col=2)
585
+
586
+ else:
587
+ if plot_type == 'hist':
588
+ fig = px.histogram(data, nbins=bins, title=title)
589
+ elif plot_type == 'box':
590
+ fig = px.box(y=data, title=title)
591
+ elif plot_type == 'violin':
592
+ fig = px.violin(y=data, title=title, box=True)
593
+ else:
594
+ fig = px.histogram(data, nbins=bins, title=title)
595
+
596
+ return fig
531
597
 
532
598
  def plot_distribution(self,
533
599
  data: Union[pd.DataFrame, pd.Series, np.ndarray, str, Path],
@@ -536,7 +602,7 @@ class UtilsStats:
536
602
  backend: Optional[Literal['matplotlib', 'seaborn', 'plotly']] = "seaborn",
537
603
  bins: int = 30,
538
604
  figsize: Optional[Tuple[int, int]] = None,
539
- save_fig: Optional[bool] = None,
605
+ save_fig: Optional[bool] = False,
540
606
  filename: Optional[str] = None,
541
607
  **kwargs):
542
608
  """
@@ -568,7 +634,7 @@ class UtilsStats:
568
634
  """
569
635
  backend = backend or self._plot_backend
570
636
  figsize = figsize or self._default_figsize
571
- save_fig = save_fig if save_fig is not None else self._save_fig
637
+ self._save_fig = save_fig
572
638
 
573
639
  # Resolver datos
574
640
  data, source = self._resolve_data(data, column)
@@ -605,59 +671,20 @@ class UtilsStats:
605
671
  if save_fig and backend != 'plotly':
606
672
  self._save_figure(fig, filename)
607
673
 
608
- return fig
674
+ if backend == 'plotly':
675
+ return fig
609
676
 
610
677
  except Exception as e:
611
678
  print(f"Error en plot_distribution: {e}")
612
679
  raise
613
680
 
614
- def _plot_distribution_plotly(self, data, plot_type, bins, title, **kwargs):
615
- """Implementación con plotly"""
616
- try:
617
- import plotly.graph_objects as go
618
- import plotly.express as px
619
- from plotly.subplots import make_subplots
620
- except ImportError:
621
- raise ImportError("Plotly no está instalado. Instale con: pip install plotly")
622
-
623
- if plot_type == 'all':
624
- fig = make_subplots(
625
- rows=2, cols=2,
626
- subplot_titles=('Histograma', 'Box Plot', 'Violin Plot', 'Distribución Acumulada')
627
- )
628
-
629
- # Histograma
630
- fig.add_trace(go.Histogram(x=data, nbinsx=bins, name='Histograma'), row=1, col=1)
631
-
632
- # Box plot
633
- fig.add_trace(go.Box(y=data, name='Box Plot'), row=1, col=2)
634
-
635
- # Violin plot
636
- fig.add_trace(go.Violin(y=data, name='Violin Plot'), row=2, col=1)
637
-
638
- # Distribución acumulada
639
- hist, bin_edges = np.histogram(data, bins=bins, density=True)
640
- cdf = np.cumsum(hist * np.diff(bin_edges))
641
- fig.add_trace(go.Scatter(x=bin_edges[1:], y=cdf, name='CDF'), row=2, col=2)
642
-
643
- else:
644
- if plot_type == 'hist':
645
- fig = px.histogram(data, nbins=bins, title=title)
646
- elif plot_type == 'box':
647
- fig = px.box(y=data, title=title)
648
- elif plot_type == 'violin':
649
- fig = px.violin(y=data, title=title, box=True)
650
- else:
651
- fig = px.histogram(data, nbins=bins, title=title)
652
-
653
- return fig
654
-
655
681
  def plot_correlation_matrix(self,
656
682
  data: Union[pd.DataFrame, str, Path],
657
- method: str = 'pearson',
658
- backend: Optional[Literal['seaborn', 'plotly']] = None,
683
+ method: Literal['pearson', 'kendall', 'spearman'] = 'pearson',
684
+ backend: Optional[Literal['seaborn', 'plotly']] = "seaborn",
685
+ triangular: Optional[bool] = False,
659
686
  figsize: Optional[Tuple[int, int]] = None,
660
- save_fig: Optional[bool] = None,
687
+ save_fig: Optional[bool] = False,
661
688
  filename: Optional[str] = None,
662
689
  **kwargs):
663
690
  """
@@ -674,25 +701,32 @@ class UtilsStats:
674
701
  """
675
702
  backend = backend or self._plot_backend
676
703
  figsize = figsize or self._default_figsize
677
- save_fig = save_fig if save_fig is not None else self._save_fig
704
+ self.save_fig = save_fig
678
705
  filename = filename or "matriz_correlacion"
679
-
706
+
680
707
  # Resolver datos
681
708
  data, source = self._resolve_data(data)
682
709
 
683
710
  if not isinstance(data, pd.DataFrame):
684
711
  raise ValueError("Se requiere un DataFrame para calcular matriz de correlación")
712
+ else:
713
+ data = data.select_dtypes(include=['float64', 'int64'])
685
714
 
686
715
  # Calcular matriz de correlación
687
716
  corr_matrix = data.corr(method=method)
688
717
 
689
718
  if backend == 'seaborn':
690
719
  fig, ax = plt.subplots(figsize=figsize)
691
- mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
720
+ if triangular:
721
+ mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
692
722
 
693
- sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f',
694
- cmap='coolwarm', center=0, ax=ax,
695
- square=True, linewidths=0.5, **kwargs)
723
+ sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f',
724
+ cmap='coolwarm', center=0, ax=ax,
725
+ square=True, linewidths=0.5, **kwargs)
726
+ else:
727
+ sns.heatmap(corr_matrix, annot=True, fmt='.2f',
728
+ cmap='coolwarm', center=0, ax=ax,
729
+ square=True, linewidths=0.5, **kwargs)
696
730
  ax.set_title(f'Matriz de Correlación ({method})', fontsize=14, pad=20)
697
731
  plt.tight_layout()
698
732
 
@@ -731,15 +765,15 @@ class UtilsStats:
731
765
  print(f"✓ Figura Plotly guardada: {filepath}")
732
766
  except Exception as e:
733
767
  print(f"✗ Error guardando figura Plotly: {e}")
734
-
735
- return fig
768
+ if backend == 'plotly':
769
+ return fig
736
770
 
737
771
  def plot_scatter_matrix(self,
738
772
  data: Union[pd.DataFrame, str, Path],
739
773
  columns: Optional[List[str]] = None,
740
774
  backend: Optional[Literal['seaborn', 'plotly', 'pandas']] = None,
741
775
  figsize: Optional[Tuple[int, int]] = None,
742
- save_fig: Optional[bool] = None,
776
+ save_fig: Optional[bool] = False,
743
777
  filename: Optional[str] = None,
744
778
  **kwargs):
745
779
  """
@@ -752,7 +786,7 @@ class UtilsStats:
752
786
  """
753
787
  backend = backend or self._plot_backend
754
788
  figsize = figsize or self._default_figsize
755
- save_fig = save_fig if save_fig is not None else self._save_fig
789
+ self.save_fig = save_fig
756
790
  filename = filename or "scatter_matrix"
757
791
 
758
792
  # Resolver datos
@@ -791,7 +825,8 @@ class UtilsStats:
791
825
  except Exception as e:
792
826
  print(f"✗ Error guardando figura Plotly: {e}")
793
827
 
794
- return fig
828
+ if backend == 'plotly':
829
+ return fig
795
830
 
796
831
  # ============= GRÁFICOS CON INTERVALOS DE CONFIANZA =============
797
832
 
@@ -802,7 +837,7 @@ class UtilsStats:
802
837
  ci_method: str = 'parametric',
803
838
  bins: int = 30,
804
839
  figsize: Optional[Tuple[int, int]] = None,
805
- save_fig: Optional[bool] = None,
840
+ save_fig: Optional[bool] = False,
806
841
  filename: Optional[str] = None,
807
842
  **kwargs) -> plt.Figure:
808
843
  """
@@ -838,7 +873,7 @@ class UtilsStats:
838
873
  x_range = np.linspace(data_array.min(), data_array.max(), 300)
839
874
 
840
875
  # ======= FIGURA =======
841
- fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize or (14, 6))
876
+ fig, (ax1, ax2) = plt.subplots(2, 1, figsize=figsize or (14, 6))
842
877
 
843
878
  # ============================================================
844
879
  # PANEL 1: HISTOGRAMA + KDE
@@ -903,109 +938,37 @@ class UtilsStats:
903
938
  plt.tight_layout()
904
939
 
905
940
  # Guardado opcional
906
- save_fig = save_fig if save_fig is not None else self._save_fig
941
+ self.save_fig = save_fig
907
942
  if save_fig:
908
943
  self._save_figure(fig, filename)
909
944
 
910
- return fig
911
-
912
-
913
- def plot_multiple_distributions_with_ci(self,
914
- data_dict: dict,
915
- confidence_level: float = 0.95,
916
- figsize: Optional[Tuple[int, int]] = None,
917
- save_fig: Optional[bool] = None,
918
- filename: Optional[str] = None,
919
- **kwargs) -> plt.Figure:
920
- """
921
- Grafica múltiples distribuciones con sus intervalos de confianza
922
- """
923
- n_distributions = len(data_dict)
924
- fig, axes = plt.subplots(n_distributions, 2,
925
- figsize=figsize or (14, 5 * n_distributions))
926
-
927
- if n_distributions == 1:
928
- axes = axes.reshape(1, -1)
929
-
930
- colors = plt.cm.Set3(np.linspace(0, 1, n_distributions))
931
-
932
- for idx, (name, data) in enumerate(data_dict.items()):
933
- ax1, ax2 = axes[idx]
934
-
935
- if isinstance(data, pd.Series):
936
- data_array = data.dropna().values
937
- else:
938
- data_array = np.array(data)
939
- data_array = data_array[~np.isnan(data_array)]
940
-
941
- # Calcular estadísticas
942
- ci_result = self.calculate_confidence_intervals(data_array, confidence_level=confidence_level)
943
-
944
- # Gráfica izquierda: Distribución básica
945
- ax1.hist(data_array, bins=30, alpha=0.7, color=colors[idx],
946
- edgecolor='black', density=True)
947
-
948
- kde = stats.gaussian_kde(data_array)
949
- x_range = np.linspace(data_array.min(), data_array.max(), 200)
950
- ax1.plot(x_range, kde(x_range), 'k-', linewidth=2)
951
- ax1.axvline(ci_result['mean'], color='red', linestyle='--', linewidth=2)
952
-
953
- ax1.set_title(f'{name}\nMedia: {ci_result["mean"]:.2f}')
954
- ax1.grid(True, alpha=0.3)
955
-
956
- # Gráfica derecha: Con intervalos de confianza
957
- ax2.hist(data_array, bins=30, alpha=0.7, color=colors[idx],
958
- edgecolor='black', density=True)
959
- ax2.plot(x_range, kde(x_range), 'k-', linewidth=2)
960
-
961
- ax2.axvline(ci_result['mean'], color='red', linestyle='-', linewidth=3)
962
- ax2.axvspan(ci_result['ci_lower'], ci_result['ci_upper'],
963
- alpha=0.3, color='orange')
964
- ax2.axvline(ci_result['ci_lower'], color='orange', linestyle='--', linewidth=2)
965
- ax2.axvline(ci_result['ci_upper'], color='orange', linestyle='--', linewidth=2)
966
-
967
- ax2.set_title(f'{name} con IC {confidence_level*100}%')
968
- ax2.grid(True, alpha=0.3)
969
-
970
- plt.tight_layout()
971
-
972
- # Guardar figura si está activado
973
- save_fig = save_fig if save_fig is not None else self._save_fig
974
- if save_fig:
975
- filename = filename or "multiples_distribuciones_ci"
976
- self._save_figure(fig, filename)
977
-
978
- return fig
979
945
 
980
946
  # ============= MÉTODOS UTILITARIOS ADICIONALES =============
981
947
 
982
- def get_descriptive_stats(self,
983
- data: Union[pd.DataFrame, pd.Series, np.ndarray, str, Path],
984
- column: Optional[str] = None) -> dict:
985
- """
986
- Obtiene estadísticas descriptivas completas
987
-
988
- Ahora acepta rutas de archivos
989
- """
990
- # Resolver datos
991
- data, source = self._resolve_data(data, column)
992
-
948
+ def get_descriptive_stats(self, data, column=None):
949
+
993
950
  if isinstance(data, pd.DataFrame):
994
951
  if column is None:
995
- raise ValueError("Debe especificar 'column' cuando data es DataFrame")
952
+ raise ValueError("Debe especificarse una columna")
996
953
  data_series = data[column]
997
- elif isinstance(data, pd.Series):
998
- data_series = data
999
954
  else:
1000
955
  data_series = pd.Series(data)
1001
-
956
+
1002
957
  data_clean = data_series.dropna()
1003
-
958
+
959
+ if len(data_clean) == 0:
960
+ return {k: np.nan for k in [
961
+ 'count','mean','median','mode','std','variance',
962
+ 'min','max','q1','q3','iqr','skewness','kurtosis','range'
963
+ ]}
964
+
965
+ mode_result = stats.mode(data_clean, keepdims=False)
966
+
1004
967
  return {
1005
968
  'count': len(data_clean),
1006
969
  'mean': np.mean(data_clean),
1007
970
  'median': np.median(data_clean),
1008
- 'mode': stats.mode(data_clean)[0][0] if len(data_clean) > 0 else np.nan,
971
+ 'mode': mode_result.mode,
1009
972
  'std': np.std(data_clean, ddof=1),
1010
973
  'variance': np.var(data_clean, ddof=1),
1011
974
  'min': np.min(data_clean),
@@ -1017,7 +980,6 @@ class UtilsStats:
1017
980
  'kurtosis': stats.kurtosis(data_clean),
1018
981
  'range': np.max(data_clean) - np.min(data_clean)
1019
982
  }
1020
-
1021
983
  def help(self):
1022
984
  """
1023
985
  Muestra ayuda completa de la clase DescriptiveStats