statslibx 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
statslib/inferential.py DELETED
@@ -1,547 +0,0 @@
1
- import numpy as np
2
- import pandas as pd
3
- from typing import Optional, Union, Literal, List
4
- from datetime import datetime
5
-
6
class InferentialStats:
    """
    Inferential statistics on the columns of a DataFrame.

    Provides confidence intervals and SciPy-backed hypothesis tests. Test
    methods return a ``TestResult`` (Anderson-Darling returns a plain dict
    because SciPy reports critical values instead of a p-value).
    """

    def __init__(self, data: Union[pd.DataFrame, np.ndarray],
                 backend: Literal['pandas', 'polars'] = 'pandas'):
        """
        Store the data set to analyse.

        Parameters:
        -----------
        data : pd.DataFrame or np.ndarray
            A 1-D array becomes a single column named 'var'; any other array
            is wrapped with columns 'var_0', 'var_1', ...
        backend : str
            Stored on the instance unchanged; no method of this class reads it.
        """
        if isinstance(data, np.ndarray):
            if data.ndim == 1:
                data = pd.DataFrame({'var': data})
            else:
                data = pd.DataFrame(data, columns=[f'var_{i}' for i in range(data.shape[1])])

        self.data = data
        self.backend = backend
        self._numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()

    # ============= CONFIDENCE INTERVALS =============

    def confidence_interval(self, column: str, confidence: float = 0.95,
                            statistic: Literal['mean', 'median', 'proportion'] = 'mean') -> tuple:
        """
        Confidence interval for a column statistic.

        Parameters:
        -----------
        column : str
            Column to analyse (missing values are dropped)
        confidence : float
            Confidence level (default 0.95 = 95%)
        statistic : str
            'mean' (t interval), 'median' (percentile bootstrap with 10000
            resamples drawn from NumPy's global random state) or
            'proportion' (normal approximation; the code takes the column mean
            as the proportion, so 0/1 data is expected)

        Returns:
        --------
        tuple : (lower_bound, upper_bound, point_estimate)

        Raises:
        -------
        ValueError
            If `statistic` is not one of the supported names.
        """
        from scipy import stats

        data = self.data[column].dropna()
        n = len(data)
        alpha = 1 - confidence

        if statistic == 'mean':
            point_est = data.mean()
            se = stats.sem(data)
            margin = se * stats.t.ppf((1 + confidence) / 2, n - 1)
            return (point_est - margin, point_est + margin, point_est)

        if statistic == 'median':
            point_est = data.median()
            n_bootstrap = 10000
            # Convert once; np.random.choice would otherwise re-convert the
            # Series on every one of the resampling iterations.
            values = data.to_numpy()
            bootstrap_medians = [
                np.median(np.random.choice(values, size=n, replace=True))
                for _ in range(n_bootstrap)
            ]
            lower = np.percentile(bootstrap_medians, (alpha / 2) * 100)
            upper = np.percentile(bootstrap_medians, (1 - alpha / 2) * 100)
            return (lower, upper, point_est)

        if statistic == 'proportion':
            point_est = data.mean()
            se = np.sqrt(point_est * (1 - point_est) / n)
            z_critical = stats.norm.ppf((1 + confidence) / 2)
            margin = z_critical * se
            return (point_est - margin, point_est + margin, point_est)

        # Previously an unknown name fell through and returned None.
        raise ValueError(f"statistic '{statistic}' no reconocido")

    # ============= HYPOTHESIS TESTS =============

    def t_test_1sample(self, column: str, popmean: Optional[float] = None,
                       popmedian: Optional[float] = None,
                       alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
        """
        One-sample location test (mean or median).

        Parameters:
        -----------
        column : str
            Column to analyse (missing values are dropped)
        popmean : float, optional
            Hypothesised population mean; runs a one-sample t-test.
            Takes precedence when both `popmean` and `popmedian` are given.
        popmedian : float, optional
            Hypothesised population median; runs a Wilcoxon signed-rank test
            on ``data - popmedian``
        alternative : str
            Alternative hypothesis

        Raises:
        -------
        ValueError
            If neither `popmean` nor `popmedian` is given.
        """
        from scipy import stats

        data = self.data[column].dropna()

        if popmean is not None:
            statistic, pvalue = stats.ttest_1samp(data, popmean, alternative=alternative)
            return TestResult(
                test_name='T-Test de Una Muestra (Media)',
                statistic=statistic,
                pvalue=pvalue,
                alternative=alternative,
                params={
                    'popmean': popmean,
                    'sample_mean': data.mean(),
                    'n': len(data),
                    'df': len(data) - 1,
                },
            )

        if popmedian is not None:
            statistic, pvalue = stats.wilcoxon(data - popmedian, alternative=alternative)
            return TestResult(
                test_name='Wilcoxon Signed-Rank Test (Mediana)',
                statistic=statistic,
                pvalue=pvalue,
                alternative=alternative,
                params={
                    'popmedian': popmedian,
                    'sample_median': data.median(),
                    'n': len(data),
                },
            )

        raise ValueError("Debe especificar popmean o popmedian")

    def t_test_2sample(self, column1: str, column2: str,
                       equal_var: bool = True,
                       alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
        """
        Independent two-sample t-test.

        Parameters:
        -----------
        column1, column2 : str
            Columns to compare (missing values are dropped per column)
        equal_var : bool
            Assume equal variances (passed to ``scipy.stats.ttest_ind``)
        alternative : str
            Alternative hypothesis
        """
        from scipy import stats

        data1 = self.data[column1].dropna()
        data2 = self.data[column2].dropna()

        statistic, pvalue = stats.ttest_ind(data1, data2, equal_var=equal_var, alternative=alternative)

        return TestResult(
            test_name='T-Test de Dos Muestras',
            statistic=statistic,
            pvalue=pvalue,
            alternative=alternative,
            params={
                'mean1': data1.mean(), 'mean2': data2.mean(),
                'std1': data1.std(), 'std2': data2.std(),
                'n1': len(data1), 'n2': len(data2),
                'equal_var': equal_var,
            },
        )

    def t_test_paired(self, column1: str, column2: str,
                      alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
        """
        Paired t-test.

        Only rows where BOTH columns are present are used. Dropping missing
        values per column would shift the remaining values against each other
        (or leave samples of different length), which breaks the pairing.

        Parameters:
        -----------
        column1, column2 : str
            Columns holding the paired measurements
        alternative : str
            Alternative hypothesis
        """
        from scipy import stats

        complete = self.data[column1].notna() & self.data[column2].notna()
        data1 = self.data.loc[complete, column1]
        data2 = self.data.loc[complete, column2]

        statistic, pvalue = stats.ttest_rel(data1, data2, alternative=alternative)

        return TestResult(
            test_name='T-Test Pareado',
            statistic=statistic,
            pvalue=pvalue,
            alternative=alternative,
            params={'mean_diff': (data1 - data2).mean(), 'n': len(data1)},
        )

    def mann_whitney_test(self, column1: str, column2: str,
                          alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided') -> 'TestResult':
        """
        Mann-Whitney U test (non-parametric alternative to the t-test).

        Parameters:
        -----------
        column1, column2 : str
            Columns to compare (missing values are dropped per column)
        alternative : str
            Alternative hypothesis
        """
        from scipy import stats

        data1 = self.data[column1].dropna()
        data2 = self.data[column2].dropna()

        statistic, pvalue = stats.mannwhitneyu(data1, data2, alternative=alternative)

        return TestResult(
            test_name='Mann-Whitney U Test',
            statistic=statistic,
            pvalue=pvalue,
            alternative=alternative,
            params={
                'median1': data1.median(),
                'median2': data2.median(),
                'n1': len(data1),
                'n2': len(data2),
            },
        )

    def chi_square_test(self, column1: str, column2: str) -> 'TestResult':
        """
        Chi-square test of independence.

        Parameters:
        -----------
        column1, column2 : str
            Categorical variables to cross-tabulate and test
        """
        from scipy import stats

        contingency_table = pd.crosstab(self.data[column1], self.data[column2])
        # The expected-frequency table returned by SciPy is not reported.
        chi2, pvalue, dof, _expected = stats.chi2_contingency(contingency_table)

        return TestResult(
            test_name='Prueba Chi-Cuadrado de Independencia',
            statistic=chi2,
            pvalue=pvalue,
            alternative='two-sided',
            params={'dof': dof, 'contingency_table': contingency_table},
        )

    def _group_values(self, column: str, groups: str) -> list:
        """Return one array of non-missing `column` values per level of `groups`."""
        present = self.data.loc[self.data[column].notna()]
        return [group[column].values for _, group in present.groupby(groups)]

    def anova_oneway(self, column: str, groups: str) -> 'TestResult':
        """
        One-way ANOVA.

        Rows with a missing `column` value are excluded so that a single NaN
        does not turn the whole result into NaN.

        Parameters:
        -----------
        column : str
            Dependent (numeric) variable
        groups : str
            Grouping (categorical) variable
        """
        from scipy import stats

        groups_data = self._group_values(column, groups)
        statistic, pvalue = stats.f_oneway(*groups_data)

        return TestResult(
            test_name='ANOVA de Un Factor',
            statistic=statistic,
            pvalue=pvalue,
            alternative='two-sided',
            params={
                'groups': len(groups_data),
                'n_total': sum(len(g) for g in groups_data),
            },
        )

    def kruskal_wallis_test(self, column: str, groups: str) -> 'TestResult':
        """
        Kruskal-Wallis test (non-parametric ANOVA).

        Rows with a missing `column` value are excluded so that a single NaN
        does not turn the whole result into NaN.

        Parameters:
        -----------
        column : str
            Dependent (numeric) variable
        groups : str
            Grouping (categorical) variable
        """
        from scipy import stats

        groups_data = self._group_values(column, groups)
        statistic, pvalue = stats.kruskal(*groups_data)

        return TestResult(
            test_name='Kruskal-Wallis Test',
            statistic=statistic,
            pvalue=pvalue,
            alternative='two-sided',
            params={
                'groups': len(groups_data),
                'n_total': sum(len(g) for g in groups_data),
            },
        )

    def normality_test(self, column: str,
                       method: Literal['shapiro', 'ks', 'anderson', 'jarque_bera', 'all'] = 'shapiro',
                       test_statistic: Literal['mean', 'median', 'mode'] = 'mean') -> Union['TestResult', dict]:
        """
        Normality test with a choice of method and centring statistic.

        Parameters:
        -----------
        column : str
            Column to analyse (missing values are dropped)
        method : str
            'shapiro' (Shapiro-Wilk)
            'ks' (Kolmogorov-Smirnov)
            'anderson' (Anderson-Darling)
            'jarque_bera' (Jarque-Bera)
            'all' (run every test)
        test_statistic : str
            'mean', 'median' or 'mode'. Selects the location/scale pair:
            mean with sample std, median with scaled MAD, or mode with
            sample std. Only the Kolmogorov-Smirnov test uses these values
            in its computation; the other tests just report them.

        Returns:
        --------
        TestResult or dict
            A dict for method='anderson' (statistic, critical values and an
            interpretation per significance level). For method='all', a dict
            keyed by 'shapiro' (only when n <= 5000), 'kolmogorov_smirnov',
            'anderson_darling' and 'jarque_bera'.

        Raises:
        -------
        ValueError
            For an unknown `test_statistic` or `method`, or for
            method='shapiro' with more than 5000 observations.
        """
        from scipy import stats

        data = self.data[column].dropna().values
        n = len(data)

        # Location/scale of the reference normal distribution.
        if test_statistic == 'mean':
            loc = np.mean(data)
            scale = np.std(data, ddof=1)
        elif test_statistic == 'median':
            loc = np.median(data)
            # MAD scaled by 1.4826 so it estimates sigma under normality.
            scale = np.median(np.abs(data - loc)) * 1.4826
        elif test_statistic == 'mode':
            from scipy.stats import mode as scipy_mode
            mode_result = scipy_mode(data, keepdims=True)
            loc = mode_result.mode[0]
            scale = np.std(data, ddof=1)
        else:
            raise ValueError(f"test_statistic '{test_statistic}' no reconocido")

        def location_params() -> dict:
            # A fresh dict per result so callers can mutate one safely.
            return {'n': n, 'test_statistic': test_statistic, 'loc': loc, 'scale': scale}

        def shape_params() -> dict:
            return {
                'n': n,
                'test_statistic': test_statistic,
                'skewness': stats.skew(data),
                'kurtosis': stats.kurtosis(data),
            }

        def anderson_summary() -> dict:
            # SciPy gives critical values rather than a p-value, so this
            # test is reported as a dict instead of a TestResult.
            anderson_result = stats.anderson(data, dist='norm')
            return {
                'test_name': f'Anderson-Darling ({test_statistic})',
                'statistic': anderson_result.statistic,
                'critical_values': anderson_result.critical_values,
                'significance_levels': anderson_result.significance_level,
                'params': location_params(),
                'interpretation': self._interpret_anderson(anderson_result),
            }

        if method == 'all':
            results = {}

            # Shapiro-Wilk is skipped above its sample-size limit.
            if n <= 5000:
                stat_sw, p_sw = stats.shapiro(data)
                results['shapiro'] = TestResult(
                    test_name=f'Shapiro-Wilk ({test_statistic})',
                    statistic=stat_sw,
                    pvalue=p_sw,
                    alternative='two-sided',
                    params=location_params(),
                )

            stat_ks, p_ks = stats.kstest(data, 'norm', args=(loc, scale))
            results['kolmogorov_smirnov'] = TestResult(
                test_name=f'Kolmogorov-Smirnov ({test_statistic})',
                statistic=stat_ks,
                pvalue=p_ks,
                alternative='two-sided',
                params=location_params(),
            )

            # Same shape as method='anderson', including 'interpretation'.
            results['anderson_darling'] = anderson_summary()

            stat_jb, p_jb = stats.jarque_bera(data)
            results['jarque_bera'] = TestResult(
                test_name=f'Jarque-Bera ({test_statistic})',
                statistic=stat_jb,
                pvalue=p_jb,
                alternative='two-sided',
                params=shape_params(),
            )

            return results

        if method == 'anderson':
            return anderson_summary()

        if method == 'shapiro':
            if n > 5000:
                raise ValueError("Shapiro-Wilk requiere n <= 5000. Use otro método o 'all'")
            statistic, pvalue = stats.shapiro(data)
            test_name = f'Shapiro-Wilk ({test_statistic})'
            params = location_params()
        elif method == 'ks':
            statistic, pvalue = stats.kstest(data, 'norm', args=(loc, scale))
            test_name = f'Kolmogorov-Smirnov ({test_statistic})'
            params = location_params()
        elif method == 'jarque_bera':
            statistic, pvalue = stats.jarque_bera(data)
            test_name = f'Jarque-Bera ({test_statistic})'
            params = shape_params()
        else:
            raise ValueError(f"Método '{method}' no reconocido")

        return TestResult(
            test_name=test_name,
            statistic=statistic,
            pvalue=pvalue,
            alternative='two-sided',
            params=params,
        )

    def _interpret_anderson(self, anderson_result):
        """
        Build one verdict string per Anderson-Darling significance level.

        Normality is rejected at a level when the statistic is not below that
        level's critical value.
        """
        interpretations = []
        for crit_val, sig_level in zip(anderson_result.critical_values,
                                       anderson_result.significance_level):
            if anderson_result.statistic < crit_val:
                interpretations.append(f"No se rechaza normalidad al {sig_level}% de significancia")
            else:
                interpretations.append(f"Se RECHAZA normalidad al {sig_level}% de significancia")
        return interpretations

    def help(self):
        """
        Print a usage summary of the InferentialStats class.
        """
        help_text = """
        📈 CLASE InferentialStats - AYUDA COMPLETA

        Clase para estadística inferencial (intervalos de confianza y pruebas de hipótesis)

        🔧 MÉTODOS PRINCIPALES:

        1. 📏 INTERVALOS DE CONFIANZA:
           • .confidence_interval(columna, confidence=0.95, statistic='mean')
             statistic: 'mean', 'median' o 'proportion'

        2. 🧪 PRUEBAS PARA UNA Y DOS MUESTRAS:
           • .t_test_1sample(columna, popmean=...)     # Media (t de una muestra)
           • .t_test_1sample(columna, popmedian=...)   # Mediana (Wilcoxon)
           • .t_test_2sample(col1, col2)               # Muestras independientes
           • .t_test_paired(col1, col2)                # Muestras pareadas
           • .mann_whitney_test(col1, col2)            # Alternativa no paramétrica

        3. 🔗 VARIABLES CATEGÓRICAS Y GRUPOS:
           • .chi_square_test(col1, col2)              # Independencia
           • .anova_oneway(columna, grupos)            # ANOVA de un factor
           • .kruskal_wallis_test(columna, grupos)     # ANOVA no paramétrico

        4. 🔔 NORMALIDAD:
           • .normality_test(columna, method='shapiro', test_statistic='mean')
             method: 'shapiro', 'ks', 'anderson', 'jarque_bera' o 'all'

        💡 EJEMPLOS DE USO:

            # Inicializar
            inferencia = InferentialStats(mi_dataframe)

            # Intervalo de confianza para la media
            inferior, superior, estimacion = inferencia.confidence_interval('edad')

            # Prueba t de dos muestras
            resultado = inferencia.t_test_2sample('antes', 'despues', equal_var=False)
            print(resultado)

            # Normalidad con todos los métodos
            pruebas = inferencia.normality_test('edad', method='all')
        """
        print(help_text)
498
-
499
class TestResult:
    """
    Outcome of a hypothesis test.

    Holds the test name, statistic, p-value, alternative hypothesis and a
    dict of extra parameters; ``repr()`` renders them as a text report.
    """

    # The class name starts with "Test"; mark it as a non-test so pytest does
    # not try to collect it when it is imported into a test module.
    __test__ = False

    def __init__(self, test_name: str, statistic: float, pvalue: float,
                 alternative: str, params: dict):
        self.test_name = test_name
        self.statistic = statistic
        self.pvalue = pvalue
        self.alternative = alternative
        self.params = params

    def __repr__(self):
        return self._format_output()

    def _format_output(self):
        """
        Render the result as an 80-column text report.

        The report includes the current timestamp, the statistic and p-value,
        a reject / do-not-reject verdict at a fixed alpha of 0.05, and the
        extra parameters. Real-valued parameters (Python or NumPy scalars) are
        printed with six decimals; everything else, including booleans, is
        printed as ``key: value``.
        """
        heavy_rule = "=" * 80
        light_rule = "-" * 80
        alpha = 0.05

        output = [
            heavy_rule,
            self.test_name.center(80),
            heavy_rule,
            f"Fecha: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            f"Hipótesis Alternativa: {self.alternative}",
            light_rule,
            "\nRESULTADOS:",
            light_rule,
            f"{'Estadístico':<40} {self.statistic:>20.6f}",
            f"{'Valor p':<40} {self.pvalue:>20.6e}",
            f"\nInterpretación (α = {alpha}):",
        ]

        if self.pvalue < alpha:
            output.append(f" → Se RECHAZA la hipótesis nula (p < {alpha})")
        else:
            output.append(f" → NO se rechaza la hipótesis nula (p >= {alpha})")

        if self.params:
            output.append("\nPARÁMETROS ADICIONALES:")
            output.append(light_rule)
            for key, value in self.params.items():
                # bool is a subclass of int; keep flags such as equal_var
                # readable as True/False instead of 1.000000. NumPy integer
                # and floating scalars are numeric even when they do not
                # subclass the Python types.
                is_real = (isinstance(value, (int, float, np.integer, np.floating))
                           and not isinstance(value, bool))
                if is_real:
                    output.append(f"{key:<40} {value:>20.6f}")
                else:
                    output.append(f"{key}: {value}")

        output.append(heavy_rule)
        return "\n".join(output)
@@ -1,8 +0,0 @@
1
- statslib/__init__.py,sha256=x8UYU-ZUJpaoPrga2YV94-z_pBkj0hYSiOPkJvpHtJU,1066
2
- statslib/descriptive.py,sha256=iUugVdB84HAKIjcCWBJLIGD8LdQUfx8O7N4ejKKoWSA,23210
3
- statslib/inferential.py,sha256=Vf0Y5rywohQ78q4PZ6un2BVmV3PA-9d_uXNujbOqHPQ,20780
4
- statslib/utils.py,sha256=H-KEdoV1mYkXg84Kk2JU0uT09_wNEeUIvW6F-AvU9s8,36709
5
- statslibx-0.1.0.dist-info/METADATA,sha256=otgOet6JPtXwFPOErJHk12x0I_vC5HL4fcyzQ6YN-HU,1737
6
- statslibx-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
- statslibx-0.1.0.dist-info/top_level.txt,sha256=lFFmLUn3rF6qe_vokWJPM5xtNAqS-_OL-CnPXWRsuiM,9
8
- statslibx-0.1.0.dist-info/RECORD,,
@@ -1 +0,0 @@
1
- statslib