statslibx 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- statslib/__init__.py +35 -0
- statslib/descriptive.py +579 -0
- statslib/inferential.py +547 -0
- statslib/utils.py +889 -0
- statslibx-0.1.0.dist-info/METADATA +46 -0
- statslibx-0.1.0.dist-info/RECORD +8 -0
- statslibx-0.1.0.dist-info/WHEEL +5 -0
- statslibx-0.1.0.dist-info/top_level.txt +1 -0
statslib/inferential.py
ADDED
|
@@ -0,0 +1,547 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from typing import Optional, Union, Literal, List
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
|
|
6
|
+
class InferentialStats:
|
|
7
|
+
"""
|
|
8
|
+
Clase para estadística inferencial (pruebas de hipótesis, intervalos de confianza, etc.)
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
def __init__(self, data: Union[pd.DataFrame, np.ndarray],
|
|
12
|
+
backend: Literal['pandas', 'polars'] = 'pandas'):
|
|
13
|
+
"""
|
|
14
|
+
Inicializar con DataFrame o array numpy
|
|
15
|
+
"""
|
|
16
|
+
if isinstance(data, np.ndarray):
|
|
17
|
+
if data.ndim == 1:
|
|
18
|
+
data = pd.DataFrame({'var': data})
|
|
19
|
+
else:
|
|
20
|
+
data = pd.DataFrame(data, columns=[f'var_{i}' for i in range(data.shape[1])])
|
|
21
|
+
|
|
22
|
+
self.data = data
|
|
23
|
+
self.backend = backend
|
|
24
|
+
self._numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
|
|
25
|
+
|
|
26
|
+
# ============= INTERVALOS DE CONFIANZA =============
|
|
27
|
+
|
|
28
|
+
def confidence_interval(self, column: str, confidence: float = 0.95,
|
|
29
|
+
statistic: Literal['mean', 'median', 'proportion'] = 'mean') -> tuple:
|
|
30
|
+
"""
|
|
31
|
+
Intervalo de confianza para diferentes estadísticos
|
|
32
|
+
|
|
33
|
+
Parameters:
|
|
34
|
+
-----------
|
|
35
|
+
column : str
|
|
36
|
+
Columna a analizar
|
|
37
|
+
confidence : float
|
|
38
|
+
Nivel de confianza (default 0.95 = 95%)
|
|
39
|
+
statistic : str
|
|
40
|
+
'mean', 'median' o 'proportion'
|
|
41
|
+
|
|
42
|
+
Returns:
|
|
43
|
+
--------
|
|
44
|
+
tuple : (lower_bound, upper_bound, point_estimate)
|
|
45
|
+
"""
|
|
46
|
+
from scipy import stats
|
|
47
|
+
|
|
48
|
+
data = self.data[column].dropna()
|
|
49
|
+
n = len(data)
|
|
50
|
+
alpha = 1 - confidence
|
|
51
|
+
|
|
52
|
+
if statistic == 'mean':
|
|
53
|
+
point_est = data.mean()
|
|
54
|
+
se = stats.sem(data)
|
|
55
|
+
margin = se * stats.t.ppf((1 + confidence) / 2, n - 1)
|
|
56
|
+
return (point_est - margin, point_est + margin, point_est)
|
|
57
|
+
|
|
58
|
+
elif statistic == 'median':
|
|
59
|
+
# Bootstrap para mediana
|
|
60
|
+
point_est = data.median()
|
|
61
|
+
n_bootstrap = 10000
|
|
62
|
+
bootstrap_medians = []
|
|
63
|
+
for _ in range(n_bootstrap):
|
|
64
|
+
sample = np.random.choice(data, size=n, replace=True)
|
|
65
|
+
bootstrap_medians.append(np.median(sample))
|
|
66
|
+
|
|
67
|
+
lower = np.percentile(bootstrap_medians, (alpha/2) * 100)
|
|
68
|
+
upper = np.percentile(bootstrap_medians, (1 - alpha/2) * 100)
|
|
69
|
+
return (lower, upper, point_est)
|
|
70
|
+
|
|
71
|
+
elif statistic == 'proportion':
|
|
72
|
+
# Asume datos binarios (0/1)
|
|
73
|
+
point_est = data.mean()
|
|
74
|
+
se = np.sqrt(point_est * (1 - point_est) / n)
|
|
75
|
+
z_critical = stats.norm.ppf((1 + confidence) / 2)
|
|
76
|
+
margin = z_critical * se
|
|
77
|
+
return (point_est - margin, point_est + margin, point_est)
|
|
78
|
+
|
|
79
|
+
# ============= PRUEBAS DE HIPÓTESIS =============
|
|
80
|
+
|
|
81
|
+
def t_test_1sample(self, column: str, popmean: float = None,
                   popmedian: float = None,
                   alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
    """
    One-sample test for the mean (t-test) or the median (signed-rank).

    Parameters
    ----------
    column : str
        Column to analyze.
    popmean : float, optional
        Hypothesized population mean.
    popmedian : float, optional
        Hypothesized population median (uses the Wilcoxon signed-rank test).
    alternative : str
        Alternative hypothesis.
    """
    from scipy import stats

    sample = self.data[column].dropna()

    # Guard clause: at least one hypothesized value is required.
    if popmean is None and popmedian is None:
        raise ValueError("Debe especificar popmean o popmedian")

    if popmean is not None:
        stat, p = stats.ttest_1samp(sample, popmean, alternative=alternative)
        extra = {
            'popmean': popmean,
            'sample_mean': sample.mean(),
            'n': len(sample),
            'df': len(sample) - 1,
        }
        return TestResult(
            test_name='T-Test de Una Muestra (Media)',
            statistic=stat,
            pvalue=p,
            alternative=alternative,
            params=extra,
        )

    # popmedian path: Wilcoxon signed-rank on the shifted sample.
    stat, p = stats.wilcoxon(sample - popmedian, alternative=alternative)
    return TestResult(
        test_name='Wilcoxon Signed-Rank Test (Mediana)',
        statistic=stat,
        pvalue=p,
        alternative=alternative,
        params={
            'popmedian': popmedian,
            'sample_median': sample.median(),
            'n': len(sample),
        },
    )
|
|
136
|
+
|
|
137
|
+
def t_test_2sample(self, column1: str, column2: str,
                   equal_var: bool = True,
                   alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
    """
    Independent two-sample t-test.

    Parameters
    ----------
    column1, column2 : str
        Columns to compare.
    equal_var : bool
        Assume equal variances (passed straight through to scipy).
    alternative : str
        Alternative hypothesis.
    """
    from scipy import stats

    sample_a = self.data[column1].dropna()
    sample_b = self.data[column2].dropna()

    stat, p = stats.ttest_ind(sample_a, sample_b,
                              equal_var=equal_var, alternative=alternative)

    details = {
        'mean1': sample_a.mean(), 'mean2': sample_b.mean(),
        'std1': sample_a.std(), 'std2': sample_b.std(),
        'n1': len(sample_a), 'n2': len(sample_b),
        'equal_var': equal_var,
    }
    return TestResult(
        test_name='T-Test de Dos Muestras',
        statistic=stat,
        pvalue=p,
        alternative=alternative,
        params=details,
    )
|
|
171
|
+
|
|
172
|
+
def t_test_paired(self, column1: str, column2: str,
                  alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
    """
    Paired t-test.

    Parameters
    ----------
    column1, column2 : str
        Paired measurement columns.
    alternative : str
        Alternative hypothesis.

    Notes
    -----
    Rows with a missing value in either column are dropped jointly so the
    pairing between observations is preserved. (The previous implementation
    dropped NaNs per column independently, which misaligns the pairs and can
    even produce samples of different lengths.)
    """
    from scipy import stats

    # Joint dropna keeps the observation pairs aligned.
    paired = self.data[[column1, column2]].dropna()
    data1 = paired[column1]
    data2 = paired[column2]

    statistic, pvalue = stats.ttest_rel(data1, data2, alternative=alternative)

    return TestResult(
        test_name='T-Test Pareado',
        statistic=statistic,
        pvalue=pvalue,
        alternative=alternative,
        params={'mean_diff': (data1 - data2).mean(), 'n': len(data1)}
    )
|
|
191
|
+
|
|
192
|
+
def mann_whitney_test(self, column1: str, column2: str,
                      alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
    """
    Mann-Whitney U test (non-parametric alternative to the t-test).

    Parameters
    ----------
    column1, column2 : str
        Columns to compare.
    alternative : str
        Alternative hypothesis.
    """
    from scipy import stats

    sample_a = self.data[column1].dropna()
    sample_b = self.data[column2].dropna()

    stat, p = stats.mannwhitneyu(sample_a, sample_b, alternative=alternative)

    summary = {
        'median1': sample_a.median(),
        'median2': sample_b.median(),
        'n1': len(sample_a),
        'n2': len(sample_b),
    }
    return TestResult(
        test_name='Mann-Whitney U Test',
        statistic=stat,
        pvalue=p,
        alternative=alternative,
        params=summary,
    )
|
|
223
|
+
|
|
224
|
+
def chi_square_test(self, column1: str, column2: str) -> 'TestResult':
    """
    Chi-square test of independence.

    Parameters
    ----------
    column1, column2 : str
        Categorical variables to test.
    """
    from scipy import stats

    # Build the observed-frequency contingency table first.
    table = pd.crosstab(self.data[column1], self.data[column2])
    chi2, pvalue, dof, _expected = stats.chi2_contingency(table)

    return TestResult(
        test_name='Prueba Chi-Cuadrado de Independencia',
        statistic=chi2,
        pvalue=pvalue,
        alternative='two-sided',
        params={'dof': dof, 'contingency_table': table},
    )
|
|
245
|
+
|
|
246
|
+
def anova_oneway(self, column: str, groups: str) -> 'TestResult':
    """
    One-way ANOVA.

    Parameters
    ----------
    column : str
        Dependent (numeric) variable.
    groups : str
        Grouping (categorical) variable.

    Notes
    -----
    Missing values are dropped per group before testing — consistent with
    the other tests in this class, and necessary because
    ``stats.f_oneway`` returns NaN when any group contains a NaN.
    """
    from scipy import stats

    groups_data = [group[column].dropna().values
                   for name, group in self.data.groupby(groups)]
    statistic, pvalue = stats.f_oneway(*groups_data)

    return TestResult(
        test_name='ANOVA de Un Factor',
        statistic=statistic,
        pvalue=pvalue,
        alternative='two-sided',
        params={
            'groups': len(groups_data),
            'n_total': sum(len(g) for g in groups_data)
        }
    )
|
|
272
|
+
|
|
273
|
+
def kruskal_wallis_test(self, column: str, groups: str) -> 'TestResult':
    """
    Kruskal-Wallis test (non-parametric ANOVA).

    Parameters
    ----------
    column : str
        Dependent (numeric) variable.
    groups : str
        Grouping (categorical) variable.

    Notes
    -----
    Missing values are dropped per group before testing — consistent with
    the other tests in this class; otherwise NaNs would distort the rank
    statistic.
    """
    from scipy import stats

    groups_data = [group[column].dropna().values
                   for name, group in self.data.groupby(groups)]
    statistic, pvalue = stats.kruskal(*groups_data)

    return TestResult(
        test_name='Kruskal-Wallis Test',
        statistic=statistic,
        pvalue=pvalue,
        alternative='two-sided',
        params={
            'groups': len(groups_data),
            'n_total': sum(len(g) for g in groups_data)
        }
    )
|
|
299
|
+
|
|
300
|
+
def normality_test(self, column: str,
                   method: Literal['shapiro', 'ks', 'anderson', 'jarque_bera', 'all'] = 'shapiro',
                   test_statistic: Literal['mean', 'median', 'mode'] = 'mean') -> Union['TestResult', dict]:
    """
    Normality test with multiple methods and centering statistics.

    Parameters
    ----------
    column : str
        Column to analyze.
    method : str
        'shapiro' (Shapiro-Wilk)
        'ks' (Kolmogorov-Smirnov)
        'anderson' (Anderson-Darling)
        'jarque_bera' (Jarque-Bera)
        'all' (run every test)
    test_statistic : str
        'mean', 'median' or 'mode' - statistic used to locate/scale the
        reference normal. Note: only the Kolmogorov-Smirnov test actually
        uses (loc, scale); the other tests receive the raw data, and the
        values are reported in ``params`` for reference only.

    Returns
    -------
    TestResult or dict
        If method='all', returns a dict with every result. Shapiro-Wilk is
        silently skipped in 'all' mode when n > 5000 (sample-size limit).

    Raises
    ------
    ValueError
        If ``method`` or ``test_statistic`` is unrecognized, or if
        method='shapiro' and n > 5000.
    """
    from scipy import stats

    data = self.data[column].dropna().values
    n = len(data)

    # Location/scale of the reference normal, per the chosen statistic.
    if test_statistic == 'mean':
        loc = np.mean(data)
        scale = np.std(data, ddof=1)
    elif test_statistic == 'median':
        loc = np.median(data)
        # MAD * 1.4826 is a robust, normal-consistent scale estimate.
        scale = np.median(np.abs(data - loc)) * 1.4826
    elif test_statistic == 'mode':
        from scipy.stats import mode as scipy_mode
        mode_result = scipy_mode(data, keepdims=True)
        loc = mode_result.mode[0]
        scale = np.std(data, ddof=1)
    else:
        raise ValueError(f"test_statistic '{test_statistic}' no reconocido")

    if method == 'all':
        results = {}

        # Shapiro-Wilk (skipped for large samples; see docstring).
        if n <= 5000:
            stat_sw, p_sw = stats.shapiro(data)
            results['shapiro'] = TestResult(
                test_name=f'Shapiro-Wilk ({test_statistic})',
                statistic=stat_sw,
                pvalue=p_sw,
                alternative='two-sided',
                params={'n': n, 'test_statistic': test_statistic, 'loc': loc, 'scale': scale}
            )

        # Kolmogorov-Smirnov against N(loc, scale).
        stat_ks, p_ks = stats.kstest(data, 'norm', args=(loc, scale))
        results['kolmogorov_smirnov'] = TestResult(
            test_name=f'Kolmogorov-Smirnov ({test_statistic})',
            statistic=stat_ks,
            pvalue=p_ks,
            alternative='two-sided',
            params={'n': n, 'test_statistic': test_statistic, 'loc': loc, 'scale': scale}
        )

        # Anderson-Darling has no single p-value; report critical values.
        anderson_result = stats.anderson(data, dist='norm')
        results['anderson_darling'] = {
            'test_name': f'Anderson-Darling ({test_statistic})',
            'statistic': anderson_result.statistic,
            'critical_values': anderson_result.critical_values,
            'significance_levels': anderson_result.significance_level,
            'params': {'n': n, 'test_statistic': test_statistic, 'loc': loc, 'scale': scale},
            # Consistency fix: the single-method 'anderson' path always
            # included an interpretation; 'all' mode now does too.
            'interpretation': self._interpret_anderson(anderson_result)
        }

        # Jarque-Bera (moment-based: skewness + kurtosis).
        stat_jb, p_jb = stats.jarque_bera(data)
        results['jarque_bera'] = TestResult(
            test_name=f'Jarque-Bera ({test_statistic})',
            statistic=stat_jb,
            pvalue=p_jb,
            alternative='two-sided',
            params={
                'n': n,
                'test_statistic': test_statistic,
                'skewness': stats.skew(data),
                'kurtosis': stats.kurtosis(data)
            }
        )

        return results

    elif method == 'shapiro':
        if n > 5000:
            raise ValueError("Shapiro-Wilk requiere n <= 5000. Use otro método o 'all'")
        statistic, pvalue = stats.shapiro(data)
        test_name = f'Shapiro-Wilk ({test_statistic})'
        params = {'n': n, 'test_statistic': test_statistic, 'loc': loc, 'scale': scale}

    elif method == 'ks':
        statistic, pvalue = stats.kstest(data, 'norm', args=(loc, scale))
        test_name = f'Kolmogorov-Smirnov ({test_statistic})'
        params = {'n': n, 'test_statistic': test_statistic, 'loc': loc, 'scale': scale}

    elif method == 'anderson':
        # Anderson-Darling returns critical values, not a p-value,
        # so it cannot be wrapped in a TestResult.
        anderson_result = stats.anderson(data, dist='norm')
        return {
            'test_name': f'Anderson-Darling ({test_statistic})',
            'statistic': anderson_result.statistic,
            'critical_values': anderson_result.critical_values,
            'significance_levels': anderson_result.significance_level,
            'params': {'n': n, 'test_statistic': test_statistic, 'loc': loc, 'scale': scale},
            'interpretation': self._interpret_anderson(anderson_result)
        }

    elif method == 'jarque_bera':
        statistic, pvalue = stats.jarque_bera(data)
        test_name = f'Jarque-Bera ({test_statistic})'
        params = {
            'n': n,
            'test_statistic': test_statistic,
            'skewness': stats.skew(data),
            'kurtosis': stats.kurtosis(data)
        }

    else:
        raise ValueError(f"Método '{method}' no reconocido")

    return TestResult(
        test_name=test_name,
        statistic=statistic,
        pvalue=pvalue,
        alternative='two-sided',
        params=params
    )
|
|
439
|
+
|
|
440
|
+
def _interpret_anderson(self, anderson_result):
|
|
441
|
+
"""Interpreta resultados de Anderson-Darling"""
|
|
442
|
+
interpretations = []
|
|
443
|
+
for i, (crit_val, sig_level) in enumerate(zip(anderson_result.critical_values,
|
|
444
|
+
anderson_result.significance_level)):
|
|
445
|
+
if anderson_result.statistic < crit_val:
|
|
446
|
+
interpretations.append(f"No se rechaza normalidad al {sig_level}% de significancia")
|
|
447
|
+
else:
|
|
448
|
+
interpretations.append(f"Se RECHAZA normalidad al {sig_level}% de significancia")
|
|
449
|
+
return interpretations
|
|
450
|
+
|
|
451
|
+
def help(self):
|
|
452
|
+
"""
|
|
453
|
+
Muestra ayuda completa de la clase DescriptiveStats
|
|
454
|
+
"""
|
|
455
|
+
help_text = """
|
|
456
|
+
📈 CLASE InferencialStats - AYUDA COMPLETA
|
|
457
|
+
|
|
458
|
+
Clase para análisis estadístico inferencial univariado y multivariado
|
|
459
|
+
|
|
460
|
+
🔧 MÉTODOS PRINCIPALES:
|
|
461
|
+
|
|
462
|
+
1. 📊 ESTADÍSTICAS UNIVARIADAS:
|
|
463
|
+
• .mean(), .median(), .mode() # Tendencia central
|
|
464
|
+
• .std(), .variance() # Dispersión
|
|
465
|
+
• .skewness(), .kurtosis() # Forma de distribución
|
|
466
|
+
• .quantile(0.25) # Cuantiles
|
|
467
|
+
• .outliers('columna') # Detección de outliers
|
|
468
|
+
|
|
469
|
+
2. 🔗 ESTADÍSTICAS MULTIVARIADAS:
|
|
470
|
+
• .correlation() # Matriz de correlación
|
|
471
|
+
• .covariance() # Matriz de covarianza
|
|
472
|
+
|
|
473
|
+
3. 📋 RESUMEN COMPLETO:
|
|
474
|
+
• .summary() # Resumen descriptivo completo
|
|
475
|
+
• .summary(show_plot=True) # Con visualizaciones
|
|
476
|
+
|
|
477
|
+
4. 📈 REGRESIÓN LINEAL:
|
|
478
|
+
• .linear_regression(y, X) # Regresión simple/múltiple
|
|
479
|
+
|
|
480
|
+
💡 EJEMPLOS DE USO:
|
|
481
|
+
|
|
482
|
+
# Inicializar
|
|
483
|
+
estadisticas = DescriptiveStats(mi_dataframe)
|
|
484
|
+
|
|
485
|
+
# Análisis univariado
|
|
486
|
+
media = estadisticas.mean('edad')
|
|
487
|
+
resumen = estadisticas.summary()
|
|
488
|
+
|
|
489
|
+
# Regresión
|
|
490
|
+
modelo = estadisticas.linear_regression(
|
|
491
|
+
y='ventas',
|
|
492
|
+
X=['publicidad', 'precio'],
|
|
493
|
+
show_plot=True
|
|
494
|
+
)
|
|
495
|
+
print(modelo.summary())
|
|
496
|
+
"""
|
|
497
|
+
print(help_text)
|
|
498
|
+
|
|
499
|
+
class TestResult:
    """Container for hypothesis-test results with a formatted text report."""

    def __init__(self, test_name: str, statistic: float, pvalue: float,
                 alternative: str, params: dict):
        """
        Parameters
        ----------
        test_name : str
            Human-readable name of the test.
        statistic : float
            Value of the test statistic.
        pvalue : float
            Associated p-value.
        alternative : str
            Alternative hypothesis used ('two-sided', 'less', 'greater').
        params : dict
            Extra test parameters (sample sizes, means, df, flags, ...).
        """
        self.test_name = test_name
        self.statistic = statistic
        self.pvalue = pvalue
        self.alternative = alternative
        self.params = params

    def __repr__(self):
        return self._format_output()

    def _format_output(self):
        """Build the formatted text report for the test."""
        output = []
        output.append("=" * 80)
        output.append(self.test_name.center(80))
        output.append("=" * 80)
        output.append(f"Fecha: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        output.append(f"Hipótesis Alternativa: {self.alternative}")
        output.append("-" * 80)

        output.append("\nRESULTADOS:")
        output.append("-" * 80)
        output.append(f"{'Estadístico':<40} {self.statistic:>20.6f}")
        output.append(f"{'Valor p':<40} {self.pvalue:>20.6e}")

        # Interpretation at the conventional 5% significance level.
        alpha = 0.05
        output.append(f"\nInterpretación (α = {alpha}):")
        if self.pvalue < alpha:
            output.append(f" → Se RECHAZA la hipótesis nula (p < {alpha})")
        else:
            output.append(f" → NO se rechaza la hipótesis nula (p >= {alpha})")

        # Additional parameters
        if self.params:
            output.append("\nPARÁMETROS ADICIONALES:")
            output.append("-" * 80)
            for key, value in self.params.items():
                # bool is a subclass of int: exclude it explicitly so flags
                # like equal_var=True print as "True" instead of "1.000000".
                if isinstance(value, (int, float)) and not isinstance(value, bool):
                    output.append(f"{key:<40} {value:>20.6f}")
                else:
                    output.append(f"{key}: {value}")

        output.append("=" * 80)
        return "\n".join(output)
|