statslibx 0.1.8__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
statslibx/inferential.py CHANGED
@@ -8,12 +8,67 @@ from scipy import stats
8
8
  import os
9
9
 
10
10
  class InferentialStats:
11
- """
12
- Class for inferential statistics (hypothesis tests, confidence intervals, etc.)
11
+ """
12
+ InferentialStats
13
+ A class for performing inferential statistical analysis, including hypothesis tests, confidence intervals,
14
+ normality tests, and more. This class supports operations on pandas DataFrame or numpy arrays.
15
+ Attributes:
16
+ -----------
17
+ data : pd.DataFrame
18
+ The dataset to analyze.
19
+ The backend used for processing ('pandas' or 'polars').
20
+ sep : str
21
+ Separator for reading files.
22
+ decimal : str
23
+ Decimal separator for reading files.
24
+ thousand : str
25
+ Thousand separator for reading files.
26
+ lang : str
27
+ Language for help and error messages ('es-ES' or 'en-US').
28
+
29
+ Methods:
30
+ --------
31
+ from_file(path: str):
32
+ Load data from a file and return an instance of InferentialStats.
33
+
34
+ confidence_interval(column: str, confidence: float = 0.95, statistic: Literal['mean', 'median', 'proportion'] = 'mean') -> tuple:
35
+ Calculate confidence intervals for mean, median, or proportion.
36
+
37
+ t_test_1sample(column: str, popmean: float = None, popmedian: float = None, alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided', alpha: float = 0.05) -> 'TestResult':
38
+ Perform a one-sample t-test or Wilcoxon signed-rank test for median.
39
+
40
+ t_test_2sample(column1: str, column2: str, equal_var: bool = True, alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided', alpha: float = 0.05) -> 'TestResult':
41
+ Perform a two-sample independent t-test.
42
+
43
+ t_test_paired(column1: str, column2: str, alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided', alpha: float = 0.05) -> 'TestResult':
44
+ Perform a paired t-test for dependent samples.
45
+
46
+ mann_whitney_test(column1: str, column2: str, alternative: Literal['two-sided', 'less', 'greater'] = 'two-sided', alpha: float = 0.05) -> 'TestResult':
47
+ Perform the Mann-Whitney U test, a non-parametric alternative to the two-sample t-test.
48
+
49
+ chi_square_test(column1: str, column2: str, alpha: float = 0.05) -> 'TestResult':
50
+ Perform a Chi-square test of independence between two categorical variables.
51
+
52
+ anova_oneway(column: str, groups: str, alpha: float = 0.05) -> 'TestResult':
53
+ Perform a one-way ANOVA test to compare means across multiple groups.
54
+
55
+ kruskal_wallis_test(column: str, groups: str, alpha: float = 0.05) -> 'TestResult':
56
+ Perform the Kruskal-Wallis test, a non-parametric alternative to one-way ANOVA.
57
+
58
+ normality_test(column: str, method: Literal['shapiro', 'ks', 'anderson', 'jarque_bera', 'all'] = 'shapiro', test_statistic: Literal['mean', 'median', 'mode'] = 'mean', alpha: float = 0.05) -> Union['TestResult', dict]:
59
+ Perform normality tests using various methods.
60
+
61
+ hypothesis_test(method: Literal["mean", "difference_mean", "proportion", "variance"] = "mean", column1: str = None, column2: str = None, pop_mean: float = None, pop_proportion: Union[float, Tuple[float, float]] = 0.5, alpha: float = 0.05, homoscedasticity: Literal["levene", "bartlett", "var_test"] = "levene") -> Dict[str, Any]:
62
+ Perform hypothesis testing for mean, difference of means, proportion, or variance.
63
+
64
+ variance_test(column1: str, column2: str, method: Literal['levene', 'bartlett', 'var_test'] = 'levene', center: Literal['mean', 'median', 'trimmed'] = 'median', alpha: float = 0.05) -> 'TestResult':
65
+ Perform a test for equality of variances between two columns.
66
+
67
+ help():
68
+ Display a detailed help guide for the InferentialStats class and its methods.
13
69
  """
14
70
 
15
71
  def __init__(self, data: Union[pd.DataFrame, np.ndarray],
16
- backend: Literal['pandas', 'polars'] = 'pandas',
17
72
  lang: Literal['es-ES', 'en-US'] = 'es-ES'):
18
73
  """
19
74
  Initialize DataFrame
@@ -22,17 +77,16 @@ class InferentialStats:
22
77
  -----------
23
78
  data : DataFrame o ndarray
24
79
  Data to analyze
25
- backend : str
26
- 'pandas' or 'polars' for processing
27
80
  """
28
81
 
29
- if isinstance(data, str) and os.path.exists(data):
30
- data = InferentialStats.from_file(data).data
31
-
32
- if isinstance(data, pl.DataFrame):
82
+ if isinstance(data, pd.DataFrame):
83
+ self.data = data
84
+ elif isinstance(data, np.ndarray):
85
+ self.data = pd.DataFrame(data)
86
+ else:
33
87
  raise TypeError(
34
- "Polars aún no soportado. Use pandas.DataFrame."
35
- )
88
+ "Data must be a pandas.DataFrame or numpy.ndarray."
89
+ )
36
90
 
37
91
  if isinstance(data, np.ndarray):
38
92
  if data.ndim == 1:
@@ -41,44 +95,9 @@ class InferentialStats:
41
95
  data = pd.DataFrame(data, columns=[f'var_{i}' for i in range(data.shape[1])])
42
96
 
43
97
  self.data = data
44
- self.backend = backend
45
98
  self._numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
46
99
  self.lang = lang
47
100
 
48
- @classmethod
49
- def from_file(path: str):
50
- """
51
- Carga automática de archivos y devuelve instancia de Intelligence.
52
- Soporta CSV, Excel, TXT, JSON, Parquet, Feather, TSV.
53
- """
54
- if not os.path.exists(path):
55
- raise FileNotFoundError(f"Archivo no encontrado: {path}")
56
-
57
- ext = os.path.splitext(path)[1].lower()
58
-
59
- if ext == ".csv":
60
- df = pd.read_csv(path)
61
-
62
- elif ext in [".xlsx", ".xls"]:
63
- df = pd.read_excel(path)
64
-
65
- elif ext in [".txt", ".tsv"]:
66
- df = pd.read_table(path)
67
-
68
- elif ext == ".json":
69
- df = pd.read_json(path)
70
-
71
- elif ext == ".parquet":
72
- df = pd.read_parquet(path)
73
-
74
- elif ext == ".feather":
75
- df = pd.read_feather(path)
76
-
77
- else:
78
- raise ValueError(f"Formato no soportado: {ext}")
79
-
80
- return InferentialStats(df)
81
-
82
101
  # ============= INTERVALOS DE CONFIANZA =============
83
102
 
84
103
  def confidence_interval(self, column: str, confidence: float = 0.95,
statslibx/utils.py CHANGED
@@ -11,23 +11,81 @@ from pathlib import Path
11
11
 
12
12
  class UtilsStats:
13
13
  """
14
- Clase utilitaria para operaciones estadísticas comunes y visualización
14
+ UtilsStats
15
+ A utility class for common statistical operations and visualization.
16
+ This class provides methods for data validation, basic statistical analysis,
17
+ and visualization of results. It also supports loading data directly from files.
18
+ >>> # Load data from a file
19
+ >>> data = utils.load_data("data.csv")
20
+ >>> utils.check_normality(data, column='age')
21
+ >>> # Analyze data from an array
22
+ Methods:
23
+ --------
24
+ _setup_plotting_style():
25
+ Configures default plotting styles for matplotlib.
15
26
 
16
- Esta clase proporciona métodos para validación de datos, análisis estadísticos
17
- básicos y visualización de resultados. Ahora con soporte para leer archivos directamente.
27
+ set_plot_backend(backend: Literal['matplotlib', 'seaborn', 'plotly']):
28
+ Sets the default visualization backend.
18
29
 
19
- Examples:
20
- ---------
21
- >>> utils = UtilsStats()
22
- >>> # Desde archivo
23
- >>> data = utils.load_data("datos.csv")
24
- >>> utils.check_normality(data, column='edad')
25
- >>> # Desde array
26
- >>> data = np.random.normal(0, 1, 100)
27
- >>> utils.check_normality(data)
28
- >>> utils.plot_distribution(data)
30
+ set_default_figsize(figsize: Tuple[int, int]):
31
+ Sets the default figure size for plots.
32
+
33
+ set_save_fig_options(save_fig: Optional[bool] = False, fig_format: str = 'png',
34
+ fig_dpi: int = 300, figures_dir: str = 'figures'):
35
+ Configures options for saving figures.
36
+
37
+ load_data(path: Union[str, Path], **kwargs) -> pd.DataFrame:
38
+ Loads data from a file in various formats (CSV, Excel, JSON, etc.).
39
+
40
+ validate_dataframe(data: Union[pd.DataFrame, np.ndarray, list, str, Path]) -> pd.DataFrame:
41
+ Validates and converts data to a DataFrame. Also accepts file paths.
42
+
43
+ format_number(num: float, decimals: int = 6, scientific: bool = False) -> str:
44
+ Formats a number with specified decimal places.
45
+
46
+ check_normality(data: Union[pd.Series, np.ndarray, pd.DataFrame, str, Path],
47
+ column: Optional[str] = None, alpha: float = 0.05) -> dict:
48
+ Checks if the data follows a normal distribution using the Shapiro-Wilk test.
49
+
50
+ calculate_confidence_intervals(data: Union[pd.Series, np.ndarray, pd.DataFrame, str, Path],
51
+ column: Optional[str] = None, confidence_level: float = 0.95,
52
+ Calculates confidence intervals for the mean using parametric or bootstrap methods.
53
+
54
+ detect_outliers(data: Union[pd.Series, np.ndarray, pd.DataFrame, str, Path],
55
+ column: Optional[str] = None, method: Literal['iqr', 'zscore', 'isolation_forest'] = 'iqr',
56
+ Detects outliers using different methods: 'iqr', 'zscore', or 'isolation_forest'.
57
+
58
+ calculate_effect_size(data: Union[pd.Series, np.ndarray, pd.DataFrame, str, Path] = None,
59
+ Calculates the effect size between two groups using Cohen's d or Hedges' g.
60
+
61
+ plot_distribution(data: Union[pd.DataFrame, pd.Series, np.ndarray, str, Path],
62
+ column: Optional[str] = None, plot_type: Literal['hist', 'kde', 'box', 'violin', 'all'] = 'hist',
63
+ bins: int = 30, figsize: Optional[Tuple[int, int]] = None,
64
+ save_fig: Optional[bool] = False, filename: Optional[str] = None, **kwargs):
65
+ Plots the distribution of a variable using various plot types and backends.
66
+
67
+ plot_correlation_matrix(data: Union[pd.DataFrame, str, Path],
68
+ filename: Optional[str] = None, **kwargs):
69
+ Visualizes the correlation matrix using a heatmap.
70
+
71
+ plot_scatter_matrix(data: Union[pd.DataFrame, str, Path],
72
+ filename: Optional[str] = None, **kwargs):
73
+ Creates a scatter matrix (pairplot) for visualizing relationships between variables.
74
+
75
+ plot_distribution_with_ci(data: Union[pd.DataFrame, pd.Series, np.ndarray, str, Path],
76
+ column: Optional[str] = None, confidence_level: float = 0.95,
77
+ ci_method: str = 'parametric', bins: int = 30,
78
+ filename: Optional[str] = None, **kwargs) -> plt.Figure:
79
+ Plots the distribution of a variable with confidence intervals.
80
+
81
+ get_descriptive_stats(data, column=None) -> dict:
82
+ Returns a dictionary of descriptive statistics for the given data.
83
+
84
+ help():
85
+ Displays a complete help guide for the UtilsStats class.
29
86
  """
30
87
 
88
+
31
89
  def __init__(self):
32
90
  """Inicializar la clase utilitaria"""
33
91
  self._plot_backend = 'seaborn'
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: statslibx
3
- Version: 0.1.8
4
- Summary: StatsLibx - Librería de estadística descriptiva e inferencial
3
+ Version: 0.2.1
4
+ Summary: StatsLibx - Librería de estadística descriptiva, inferencial y computacional
5
5
  Author-email: Emmanuel Ascendra Perez <ascendraemmanuel@gmail.com>
6
6
  License: MIT
7
7
  Classifier: Development Status :: 3 - Alpha
@@ -16,7 +16,6 @@ Classifier: Programming Language :: Python :: 3.12
16
16
  Requires-Python: >=3.8
17
17
  Description-Content-Type: text/markdown
18
18
  Requires-Dist: pandas>=1.5
19
- Requires-Dist: polars>=0.20
20
19
  Provides-Extra: viz
21
20
  Requires-Dist: seaborn>=0.11; extra == "viz"
22
21
  Requires-Dist: plotly>=5.0; extra == "viz"
@@ -28,9 +27,9 @@ Requires-Dist: statsmodels>=0.13; extra == "advanced"
28
27
 
29
28
  StatsLibX es un paquete de Python diseñado para proporcionar una solución sencilla, eficiente y flexible para manejar volúmenes de datos.
30
29
 
31
- Este proyecto surge con la idea de ofrecer una alternativa moderna, intuitiva y ligera que permita a desarrolladores y entusiastas integrar la **estadistica descriptiva e inferencial** sin complicaciones, con multiples funcionalidades y utilidades pensadas para el futuro.
30
+ Este proyecto surge con la idea de ofrecer una alternativa moderna, intuitiva y ligera que permita a desarrolladores y entusiastas integrar la **estadística descriptiva, inferencial y computacional (En desarrollo)** sin complicaciones, con múltiples funcionalidades y utilidades pensadas para el futuro.
32
31
 
33
- GitHub del Proyecto: [text](https://github.com/GhostAnalyst30/StatsLibX)
32
+ GitHub del Proyecto: [https://github.com/GhostAnalyst30/StatsLibX](https://github.com/GhostAnalyst30/StatsLibX)
34
33
 
35
34
  ## ✨ Características principales
36
35
 
@@ -55,7 +54,7 @@ stats = DescriptiveStats(data) # InferentialStats(data), UtilsStats()
55
54
 
56
55
  stats.summary()
57
56
  ```
58
- Para ver mas funciones: [text](https://github.com/GhostAnalyst30/StatsLibX/blob/main/how_use_statslibx.ipynb)
57
+ Para ver más funciones: [https://github.com/GhostAnalyst30/StatsLibX/blob/main/how_use_statslibx.ipynb](https://github.com/GhostAnalyst30/StatsLibX/blob/main/how_use_statslibx.ipynb)
59
58
 
60
59
  ## 📦 Instalación
61
60
  ```bash
@@ -0,0 +1,20 @@
1
+ statslibx/__init__.py,sha256=82KG6z_wJZf_ZF8jpViRvtzn4qV9uEZd8a3sRUucKLE,1500
2
+ statslibx/cli.py,sha256=DqXaoP85n9xgLDlFnEkeqj-HJG0_IKX0uSqxRcHbzII,1122
3
+ statslibx/computacional.py,sha256=z46bRUiH9a3ajxVTYE2sGO-pg20L87MdOKM3Y_Tcq44,4062
4
+ statslibx/descriptive.py,sha256=QLIzPB-pEC2BXCIUsjpDyU7peHAs6fRduPukj1gA160,61671
5
+ statslibx/inferential.py,sha256=_mUzX-Uo2Y55zVTZbQnIRloqKcHjh40djLW1J12HQPU,81617
6
+ statslibx/io.py,sha256=v7pxpmlEMeKyfXftl3WbkUtC9FOh1pymz7MmKPPNw98,493
7
+ statslibx/utils.py,sha256=gWXduW8LMN1q4ZwNggmodRsT9Rcsot-S82NsQiqrjUo,69992
8
+ statslibx/datasets/__init__.py,sha256=KI1N2ByjWpmr9F9_1CDDHEnZ-kDJEKmZON7_4E6Jf_4,7322
9
+ statslibx/datasets/course_completion.csv,sha256=jaqyxAh4YCsYuH5OFsjvGV7KUyM_7vQt6LgnqnNAFsI,22422135
10
+ statslibx/datasets/iris.csv,sha256=xSdC5QMVqZ-Vajg_rt91dVUmdfZAnvD5pHB23QhHmTA,3858
11
+ statslibx/datasets/penguins.csv,sha256=4HY2vYr3QmAJnqL4Z44uq7813vV5lAzHb2cGHuFsBsE,13478
12
+ statslibx/datasets/sp500_companies.csv,sha256=WKS72YOGnAbyLR6kD95fOpIYZt5oXGjPryyFVqLRF_k,803820
13
+ statslibx/datasets/titanic.csv,sha256=5seOS8ybyBMBCCWhgKZrsbu06m_OWyKtD9l0YXOImXU,29474
14
+ statslibx/preprocessing/__init__.py,sha256=ZwdwjBodxeOry-umJ__6yUSeubpRlZg41yve366ArkY,7395
15
+ tests/test1.py,sha256=zGaLe9cKLCLrgNbjo-WeDGIjdH4bODtm1_juOn96Mtk,306
16
+ statslibx-0.2.1.dist-info/METADATA,sha256=mNVj_Qo9pROrznPaOkCvWBH7ypw_0j0p9WdCWHgFt5o,2964
17
+ statslibx-0.2.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
18
+ statslibx-0.2.1.dist-info/entry_points.txt,sha256=bkCY7JDWNCZFE3I4sjgJ2oGrUgoBBbCbYmWkBAymT70,49
19
+ statslibx-0.2.1.dist-info/top_level.txt,sha256=Mz7hCT3d_WEbs8d6hWac4m3fkI4RlxUkXnHYt967KG8,16
20
+ statslibx-0.2.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
tests/test1.py ADDED
@@ -0,0 +1,14 @@
1
+ from statslibx import load_dataset, DescriptiveStats, InferentialStats
2
+ import pandas as pd
3
+ # df = pd.read_csv(r"tests\bank (1).csv", sep=";")
4
+
5
+ df = load_dataset(r"tests\bank (1).csv", sep=";")
6
+ stats = DescriptiveStats(df)
7
+ print(stats.data)
8
+
9
+ infer = InferentialStats(df)
10
+ print(infer.data)
11
+
12
+
13
+
14
+
statslibx/probability.py DELETED
@@ -1,2 +0,0 @@
1
- class ProbabilityStats:
2
- pass
@@ -1,15 +0,0 @@
1
- statslibx/__init__.py,sha256=KeEoEZVPUR_PZACWoCpS_2l6luPbEee7VRlcrLgbKQQ,1490
2
- statslibx/cli.py,sha256=DqXaoP85n9xgLDlFnEkeqj-HJG0_IKX0uSqxRcHbzII,1122
3
- statslibx/computacional.py,sha256=Nv8wk67RUuuv15oBRu2XPp0_k7O4ZgmT51vThH2OuFk,35
4
- statslibx/descriptive.py,sha256=r5D4reP1Cdzsu1tSLmf2OEaFAkGvHSd3FIYfUclEaRU,60178
5
- statslibx/inferential.py,sha256=H0R6g3dJFk-53m1bKldrXObgk0SSmpcdqQg_tIgRKBI,79169
6
- statslibx/io.py,sha256=v7pxpmlEMeKyfXftl3WbkUtC9FOh1pymz7MmKPPNw98,493
7
- statslibx/probability.py,sha256=MUME4eXWzbdU93F-QdKwmmyd9IgZK1flFUYQHitp10o,33
8
- statslibx/utils.py,sha256=iJzt0jDacaoUfjtp4dU2PFuIBEheMP9Qrq-HnLTW_Qw,66515
9
- statslibx/datasets/__init__.py,sha256=GuUl_7-d6YanuDFht1dwB1bFrqjShvKh1m-iRYAbYZE,6875
10
- statslibx/preprocessing/__init__.py,sha256=ZwdwjBodxeOry-umJ__6yUSeubpRlZg41yve366ArkY,7395
11
- statslibx-0.1.8.dist-info/METADATA,sha256=uyhAd0xghADIfVee7WzDp76nLA2snjqQcNayio_UrIc,2835
12
- statslibx-0.1.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
13
- statslibx-0.1.8.dist-info/entry_points.txt,sha256=bkCY7JDWNCZFE3I4sjgJ2oGrUgoBBbCbYmWkBAymT70,49
14
- statslibx-0.1.8.dist-info/top_level.txt,sha256=eeYZXyFm0hIjuI0ba3wF6XW938Mv9tv7Nk9qgjYfCtU,10
15
- statslibx-0.1.8.dist-info/RECORD,,