PyPI - statslibx - Versions diffs - 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl - Mend

statslibx 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

statslibx/__init__.py +2 -2
statslibx/datasets/__init__.py +1 -0
statslibx/datasets/course_completion.csv +100001 -0
statslibx/descriptive.py +274 -148
statslibx/inferential.py +139 -72
statslibx/utils.py +288 -82
{statslibx-0.1.4.dist-info → statslibx-0.1.6.dist-info}/METADATA +1 -1
statslibx-0.1.6.dist-info/RECORD +14 -0
statslibx-0.1.4.dist-info/RECORD +0 -13
{statslibx-0.1.4.dist-info → statslibx-0.1.6.dist-info}/WHEEL +0 -0
{statslibx-0.1.4.dist-info → statslibx-0.1.6.dist-info}/top_level.txt +0 -0

statslibx/inferential.py CHANGED Viewed

@@ -4,6 +4,7 @@ import pandas as pd
 from typing import Optional, Union, Literal, List, Dict, Any
 from datetime import datetime
 from scipy import stats
+import os
 class InferentialStats:
     """
@@ -11,10 +12,21 @@ class InferentialStats:
     """
     def __init__(self, data: Union[pd.DataFrame, np.ndarray],
-                 backend: Literal['pandas', 'polars'] = 'pandas'):
+                backend: Literal['pandas', 'polars'] = 'pandas'):
         """
         Inicializar con DataFrame o array numpy
+        Parameters:
+        -----------
+        data : DataFrame o ndarray
+            Datos a analizar
+        backend : str
+            'pandas' o 'polars' para procesamiento
         """
+        if isinstance(data, str) and os.path.exists(data):
+                data = InferentialStats.from_file(data).data
         if isinstance(data, np.ndarray):
             if data.ndim == 1:
                 data = pd.DataFrame({'var': data})
@@ -24,11 +36,45 @@ class InferentialStats:
         self.data = data
         self.backend = backend
         self._numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
+    @staticmethod
+    def from_file(path: str):
+        """
+        Load a data file into a DataFrame and return an InferentialStats built from it.
+        The pandas reader is chosen from the lowercased file extension:
+        .csv -> read_csv, .xlsx/.xls -> read_excel, .txt/.tsv -> read_table,
+        .json -> read_json, .parquet -> read_parquet, .feather -> read_feather.
+        Parameters:
+        -----------
+        path : str
+            Path of the file to load
+        Returns:
+        --------
+        InferentialStats
+            New instance created as InferentialStats(df), i.e. with the default backend
+        Raises:
+        -------
+        FileNotFoundError
+            If `path` does not exist
+        ValueError
+            If the extension is not one of the supported formats
+        """
+        # Fail early with an explicit error instead of letting the pandas reader raise.
+        if not os.path.exists(path):
+            raise FileNotFoundError(f"Archivo no encontrado: {path}")
+        # Extension including the leading dot, lowercased so matching is case-insensitive.
+        ext = os.path.splitext(path)[1].lower()
+        if ext == ".csv":
+            df = pd.read_csv(path)
+        elif ext in [".xlsx", ".xls"]:
+            df = pd.read_excel(path)
+        elif ext in [".txt", ".tsv"]:
+            # read_table is called with its defaults (tab-delimited per the pandas docs),
+            # so .txt files are presumably expected to be tab-separated -- TODO confirm.
+            df = pd.read_table(path)
+        elif ext == ".json":
+            df = pd.read_json(path)
+        elif ext == ".parquet":
+            df = pd.read_parquet(path)
+        elif ext == ".feather":
+            df = pd.read_feather(path)
+        else:
+            raise ValueError(f"Formato no soportado: {ext}")
+        return InferentialStats(df)
     # ============= INTERVALOS DE CONFIANZA =============
     def confidence_interval(self, column: str, confidence: float = 0.95,
-                           statistic: Literal['mean', 'median', 'proportion'] = 'mean') -> tuple:
+                            statistic: Literal['mean', 'median', 'proportion'] = 'mean') -> tuple:
         """
         Intervalo de confianza para diferentes estadísticos
@@ -81,8 +127,8 @@ class InferentialStats:
     # ============= PRUEBAS DE HIPÓTESIS =============
     def t_test_1sample(self, column: str, popmean: float = None,
-                       popmedian: float = None,
-                       alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
+                        popmedian: float = None,
+                        alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
         """
         Prueba t de una muestra (para media o mediana)
@@ -137,8 +183,8 @@ class InferentialStats:
             raise ValueError("Debe especificar popmean o popmedian")
     def t_test_2sample(self, column1: str, column2: str,
-                       equal_var: bool = True,
-                       alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
+                        equal_var: bool = True,
+                        alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
         """
         Prueba t de dos muestras independientes
@@ -172,9 +218,16 @@ class InferentialStats:
         )
     def t_test_paired(self, column1: str, column2: str,
-                     alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
+                        alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
         """
         Prueba t pareada
+        Parameters:
+        -----------
+        column1, column2:
+            Datos a analizar
+        alternative:
+            "two-sided", "less" o "greater"
         """
         from scipy import stats
@@ -192,7 +245,7 @@ class InferentialStats:
         )
     def mann_whitney_test(self, column1: str, column2: str,
-                         alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
+                            alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
         """
         Prueba de Mann-Whitney U (alternativa no paramétrica al t-test)
@@ -300,8 +353,8 @@ class InferentialStats:
         )
     def normality_test(self, column: str,
-                      method: Literal['shapiro', 'ks', 'anderson', 'jarque_bera', 'all'] = 'shapiro',
-                      test_statistic: Literal['mean', 'median', 'mode'] = 'mean') -> Union['TestResult', dict]:
+                        method: Literal['shapiro', 'ks', 'anderson', 'jarque_bera', 'all'] = 'shapiro',
+                        test_statistic: Literal['mean', 'median', 'mode'] = 'mean') -> Union['TestResult', dict]:
         """
         Prueba de normalidad con múltiples métodos y estadísticos
@@ -456,74 +509,88 @@ class InferentialStats:
             column1: str = None,
             column2: str = None,
             alpha: float = 0.05,
-            homoscedasticity: Literal["levene", "bartlett", "var_test"] = "levene"
-        ) -> Dict[str, Any]:
-            data = self.data
-            if column1 is None:
-                raise ValueError("Debes especificar 'column1'.")
-            x = data[column1].dropna()
-            if method in ["difference_mean", "variance"] and column2 is None:
-                raise ValueError("Para este método debes pasar 'column2'.")
-            y = data[column2].dropna() if column2 else None
-            # --- homoscedasticity test ---
-            homo_result = None
-            if method in ["difference_mean", "variance"]:
-                homo_result = self._homoscedasticity_test(x, y, homoscedasticity)
-            # --- MAIN HYPOTHESIS TESTS ---
-            if method == "mean":
-                # One-sample t-test
-                t_stat, p_value = stats.ttest_1samp(x, popmean=np.mean(x))
-                test_name = "One-sample t-test"
-            elif method == "difference_mean":
-                # Two-sample t-test
-                equal_var = homo_result["equal_var"]
-                t_stat, p_value = stats.ttest_ind(x, y, equal_var=equal_var)
-                test_name = "Two-sample t-test"
-            elif method == "proportion":
-                # Proportion test (z-test)
-                p_hat = np.mean(x)
-                n = len(x)
-                z_stat = (p_hat - 0.5) / np.sqrt(0.5 * 0.5 / n)
-                p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))
-                t_stat = z_stat
-                test_name = "Proportion Z-test"
-            elif method == "variance":
-                # Classic F-test
-                var_x = np.var(x, ddof=1)
-                var_y = np.var(y, ddof=1)
-                F = var_x / var_y
-                dfn = len(x) - 1
-                dfd = len(y) - 1
-                p_value = 2 * min(stats.f.cdf(F, dfn, dfd), 1 - stats.f.cdf(F, dfn, dfd))
-                t_stat = F
-                test_name = "Variance F-test"
+            homoscedasticity: Literal["levene", "bartlett", "var_test"] = "levene") -> Dict[str, Any]:
+        """
+        Test de Hipotesis
-            return {
-                "test": test_name,
-                "statistic": t_stat,
-                "p_value": p_value,
-                "alpha": alpha,
-                "reject_H0": p_value < alpha,
-                "homoscedasticity_test": homo_result
-            }
+        Parameters:
+        -----------
+        method : str
+            'mean', 'difference_mean', 'proportion' o 'variance'
+        column1, column2 : str
+            Columnas numéricas a comparar
+        alpha : float
+            Nivel de significancia (default 0.05)
+        homoscedasticity : str
+            Método de homocedasticidad
+            'levene', 'bartlett' o 'var_test'
+        """
+        data = self.data
+        if column1 is None:
+            raise ValueError("Debes especificar 'column1'.")
+        x = data[column1].dropna()
+        if method in ["difference_mean", "variance"] and column2 is None:
+            raise ValueError("Para este método debes pasar 'column2'.")
+        y = data[column2].dropna() if column2 else None
+        # --- homoscedasticity test ---
+        homo_result = None
+        if method in ["difference_mean", "variance"]:
+            homo_result = self._homoscedasticity_test(x, y, homoscedasticity)
+        # --- MAIN HYPOTHESIS TESTS ---
+        if method == "mean":
+            # One-sample t-test
+            t_stat, p_value = stats.ttest_1samp(x, popmean=np.mean(x))
+            test_name = "One-sample t-test"
+        elif method == "difference_mean":
+            # Two-sample t-test
+            equal_var = homo_result["equal_var"]
+            t_stat, p_value = stats.ttest_ind(x, y, equal_var=equal_var)
+            test_name = "Two-sample t-test"
+        elif method == "proportion":
+            # Proportion test (z-test)
+            p_hat = np.mean(x)
+            n = len(x)
+            z_stat = (p_hat - 0.5) / np.sqrt(0.5 * 0.5 / n)
+            p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))
+            t_stat = z_stat
+            test_name = "Proportion Z-test"
+        elif method == "variance":
+            # Classic F-test
+            var_x = np.var(x, ddof=1)
+            var_y = np.var(y, ddof=1)
+            F = var_x / var_y
+            dfn = len(x) - 1
+            dfd = len(y) - 1
+            p_value = 2 * min(stats.f.cdf(F, dfn, dfd), 1 - stats.f.cdf(F, dfn, dfd))
+            t_stat = F
+            test_name = "Variance F-test"
+        return {
+            "test": test_name,
+            "statistic": t_stat,
+            "p_value": p_value,
+            "alpha": alpha,
+            "reject_H0": p_value < alpha,
+            "homoscedasticity_test": homo_result
+        }
     def _homoscedasticity_test(
         self,
         x,
         y,
-        method: Literal["levene", "bartlett", "var_test"] = "levene"
-    ) -> Dict[str, Any]:
+        method: Literal["levene", "bartlett", "var_test"] = "levene") -> Dict[str, Any]:
         if method == "levene":
             stat, p = stats.levene(x, y)

statslibx 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

statslibx 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl