statslibx 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
statslibx/__init__.py CHANGED
@@ -1,30 +1,32 @@
 """
 StatsLibx - Librería de Estadística para Python
 Autor: Emmanuel Ascendra
-Versión: 0.1.6
+Versión: 0.1.8
 """

-__version__ = "0.1.6"
+__version__ = "0.1.8"
 __author__ = "Emmanuel Ascendra"

 # Importar las clases principales
 from .descriptive import DescriptiveStats, DescriptiveSummary
 from .inferential import InferentialStats, TestResult
+from .probability import ProbabilityStats
+from .computacional import ComputacionalStats
 from .utils import UtilsStats
 from .preprocessing import Preprocessing
-from .datasets import load_dataset
+from .datasets import load_dataset, generate_dataset

 # Definir qué se expone cuando se hace: from statslib import *
 __all__ = [
     # Clases principales
     'DescriptiveStats',
-    'InferentialStats',
-    'LinearRegressionResult',
-    'DescriptiveSummary',
-    'TestResult',
+    'InferentialStats',
+    'ProbabilityStats',
+    'ComputacionalStats',
     'UtilsStats',
     'Preprocessing',
-    'load_dataset'
+    'load_dataset',
+    'generate_dataset'
 ]

 # Mensaje de bienvenida (opcional)
@@ -37,5 +39,7 @@ def welcome():
     print(f" - DescriptiveStats: Estadística descriptiva")
     print(f" - InferentialStats: Estadística inferencial")
     print(f" - UtilsStats: Utilidades Extras")
+    print(f"\nMódulos disponibles:")
+    print(f" - Datasets: Carga de Datasets")
     print(f" - Preprocessing: Preprocesamiento de datos")
     print(f"\nPara más información: help(statslibx)")
@@ -0,0 +1,2 @@
+class ComputacionalStats:
+    pass
@@ -1,71 +1,244 @@
-from typing import Optional, Union, Literal, List
-import polars as pl
-import pandas as pd
-import pkgutil
+from typing import Optional, Union, Literal, List, Tuple
 import io
+import pkgutil
+
+import pandas as pd
+import polars as pl
+import numpy as np
+from numpy.typing import NDArray
+
+
+_SUPPORTED_BACKENDS = ("pandas", "polars")
+
+
+def _validate_columns(
+    df: Union[pd.DataFrame, pl.DataFrame],
+    X_columns: List[str],
+    y_column: str
+) -> None:
+    columns = set(df.columns)
+    missing = set(X_columns + [y_column]) - columns
+    if missing:
+        raise ValueError(f"Columnas no encontradas en el dataset: {missing}")
+
+
+def _X_y(
+    df: Union[pd.DataFrame, pl.DataFrame],
+    X_columns: List[str],
+    y_column: str
+) -> Tuple[NDArray, NDArray]:
+    """
+    Extrae X e y como arrays numpy desde pandas o polars.
+    """
+    _validate_columns(df, X_columns, y_column)
+
+    if isinstance(df, pd.DataFrame):
+        X = df[X_columns].to_numpy()
+        y = df[y_column].to_numpy().ravel()
+        return X, y
+
+    elif isinstance(df, pl.DataFrame):
+        X = df.select(X_columns).to_numpy()
+        y = df.select(y_column).to_numpy().ravel()
+        return X, y
+
+    else:
+        raise TypeError(
+            "Backend no soportado. Use pandas.DataFrame o polars.DataFrame."
+        )
+

 def load_dataset(
-    name: str,
-    backend: Literal['pandas', 'polars'] = 'pandas'
-) -> Union[pd.DataFrame, pl.DataFrame]:
-    """Carga un dataset interno del paquete.
-    Datasets Disponibles:
+    name: str,
+    backend: Literal["pandas", "polars"] = "pandas",
+    return_X_y: Optional[Tuple[List[str], str]] = None
+) -> Union[pd.DataFrame, pl.DataFrame, Tuple[NDArray, NDArray]]:
+    """
+    Carga un dataset interno del paquete.
+
+    Datasets disponibles:
     - iris.csv
     - penguins.csv
     - sp500_companies.csv
     - titanic.csv
     - course_completion.csv
+
+    Parámetros
+    ----------
+    name : str
+        Nombre del archivo CSV.
+    backend : {'pandas', 'polars'}, default='pandas'
+        Backend de DataFrame a utilizar.
+    return_X_y : tuple[list[str], str], optional
+        Si se especifica, devuelve (X, y) como arrays numpy,
+
+    Retorna
+    -------
+    DataFrame o (X, y)
     """
-    data_bytes = pkgutil.get_data("statslibx.datasets", name)
-    if data_bytes is None:
-        raise FileNotFoundError(f"Dataset '{name}' no encontrado.")
-
-    if backend == "pandas":
-        return pd.read_csv(io.BytesIO(data_bytes))
-    elif backend == "polars":
-        return pl.read_csv(io.BytesIO(data_bytes))
-    else:
+    if backend not in _SUPPORTED_BACKENDS:
         raise ValueError(
-            "Backend no soportado. Use 'pandas' o 'polars'."
+            f"Backend '{backend}' no soportado. "
+            f"Use uno de {_SUPPORTED_BACKENDS}."
         )
-
-def load_iris(
-    backend: Literal['pandas', 'polars'] = 'pandas'
-) -> Union[pd.DataFrame, pl.DataFrame]:
-    """Carga el dataset interno de la libreria: Iris
-    """
-    data_bytes = pkgutil.get_data("statslibx.datasets", "iris.csv")
-    if data_bytes is None:
-        raise FileNotFoundError(f"Dataset \"iris.csv\" no encontrado.")
-
-    if backend == "pandas":
-        return pd.read_csv(io.BytesIO(data_bytes))
-    elif backend == "polars":
-        raise ValueError(
-            "Backend no soportado aun. Use 'pandas'."
+
+    # ---------- 1️⃣ Intentar cargar desde el paquete ----------
+    data_bytes = pkgutil.get_data("statslibx.datasets", name)
+
+    if data_bytes is not None:
+        df = (
+            pd.read_csv(io.BytesIO(data_bytes))
+            if backend == "pandas"
+            else pl.read_csv(io.BytesIO(data_bytes))
         )
+
+    # ---------- 2️⃣ Si no está en el paquete, buscar en ruta actual ----------
     else:
-        raise ValueError(
-            "Backend no soportado. Use 'pandas' o 'polars'."
-        )
-
+        try:
+            df = (
+                pd.read_csv(name)
+                if backend == "pandas"
+                else pl.read_csv(name)
+            )
+        except FileNotFoundError:
+            raise FileNotFoundError(
+                f"Dataset '{name}' no encontrado "
+                f"ni en statslibx.datasets ni en la ruta actual."
+            )
+
+    # ---------- 3️⃣ Devolver X, y si se solicita ----------
+    if return_X_y is not None:
+        X_columns, y_column = return_X_y
+        return _X_y(df, X_columns, y_column)
+
+    return df
+
+# =========================
+# Datasets específicos
+# =========================
+
+def load_iris(
+    backend: Literal["pandas", "polars"] = "pandas",
+    return_X_y: Optional[Tuple[List[str], str]] = None
+):
+    return load_dataset(
+        "iris.csv",
+        backend=backend,
+        return_X_y=return_X_y
+    )
+
+
 def load_penguins(
-    backend: Literal['pandas', 'polars'] = 'pandas'
-) -> Union[pd.DataFrame, pl.DataFrame]:
-    """Carga un dataset interno de la libreria: Penguins
-    """
-    data_bytes = pkgutil.get_data("statslibx.datasets", "penguins.csv")
-    if data_bytes is None:
-        raise FileNotFoundError(f"Dataset \"penguins.csv\" no encontrado.")
+    backend: Literal["pandas", "polars"] = "pandas",
+    return_X_y: Optional[Tuple[List[str], str]] = None
+):
+    return load_dataset(
+        "penguins.csv",
+        backend=backend,
+        return_X_y=return_X_y
+    )
+
+
+from typing import Optional
+
+def generate_dataset(n_rows, schema, seed=None, save: Optional[bool] = False, filename: Optional[str] = None):
+    if seed is not None:
+        if not isinstance(seed, int):
+            raise TypeError("seed debe ser un entero o None")
+        np.random.seed(seed)
+    else:
+        np.random.seed(42)
+
+    if not isinstance(schema, dict):
+        raise TypeError("schema debe ser un diccionario")

-    if backend == "pandas":
-        return pd.read_csv(io.BytesIO(data_bytes))
-    elif backend == "polars":
-        raise ValueError(
-            "Backend no soportado aun. Use 'pandas'."
-        )
+
+
+    data = {}
+
+    for col, config in schema.items():
+        if "dist" not in config:
+            raise ValueError(f"La columna '{col}' no tiene 'dist' definido")
+
+        dist = config["dist"]
+        dtype = config.get("type", "float")
+        nround = config.get("round", 0)
+
+        # ---------- DISTRIBUCIONES ----------
+        if dist == "normal":
+            values = np.random.normal(
+                loc=config.get("mean", 0),
+                scale=config.get("std", 1),
+                size=n_rows
+            )
+
+        elif dist == "uniform":
+            values = np.random.uniform(
+                low=config.get("low", 0),
+                high=config.get("high", 1),
+                size=n_rows
+            )
+
+        elif dist == "exponential":
+            values = np.random.exponential(
+                scale=config.get("scale", 1),
+                size=n_rows
+            )
+
+        elif dist == "lognormal":
+            values = np.random.lognormal(
+                mean=config.get("mean", 0),
+                sigma=config.get("std", 1),
+                size=n_rows
+            )
+
+        elif dist == "poisson":
+            values = np.random.poisson(
+                lam=config.get("lam", 1),
+                size=n_rows
+            )
+
+        elif dist == "binomial":
+            values = np.random.binomial(
+                n=config.get("n", 1),
+                p=config.get("p", 0.5),
+                size=n_rows
+            )
+
+        elif dist == "categorical":
+            if "choices" not in config:
+                raise ValueError(f"'choices' es requerido para categorical ({col})")
+            values = np.random.choice(
+                config["choices"],
+                size=n_rows
+            )
+            data[col] = values
+            continue
+
+        else:
+            raise ValueError(f"Distribución no soportada: {dist}")
+
+        # ---------- CASTEO DE TIPO ----------
+        if dtype == "int":
+            values = np.round(values).astype(int)
+        elif dtype == "float":
+            values = values.astype(float)
+        else:
+            raise ValueError(f"Tipo no soportado: {dtype}")
+
+        # ---------- REDONDEO ----------
+        if nround > 0:
+            values = np.round(values, nround)
+        else:
+            values = np.round(values, 2)
+
+        data[col] = values
+
+    if save and filename:
+        df = pd.DataFrame(data)
+        df.to_csv(f"{filename}.csv", index=False)
     else:
-        raise ValueError(
-            "Backend no soportado. Use 'pandas' o 'polars'."
-        )
+        df = pd.DataFrame(data)
+        df.to_csv("dataset.csv", index=False)

+    return pd.DataFrame(data)
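The hunk above folds the former per-dataset loaders into a single `load_dataset` that first looks inside `statslibx.datasets`, then falls back to the given path, and can return `(X, y)` numpy arrays via `return_X_y`; it also adds the schema-driven `generate_dataset`. A minimal usage sketch, where the iris column names are assumptions for illustration rather than values taken from the bundled CSV:

```python
from statslibx import load_dataset, generate_dataset

# Load a bundled CSV as a pandas DataFrame (the default backend).
iris = load_dataset("iris.csv", backend="pandas")

# Ask for (X, y) numpy arrays instead; the column names here are hypothetical.
X, y = load_dataset(
    "iris.csv",
    return_X_y=(["sepal_length", "sepal_width"], "species"),
)

# Generate a synthetic dataset: each column needs a "dist" key plus the
# distribution-specific parameters read via config.get() in the hunk above.
schema = {
    "age":    {"dist": "normal", "mean": 35, "std": 10, "type": "int"},
    "income": {"dist": "lognormal", "mean": 10, "std": 0.5, "round": 2},
    "group":  {"dist": "categorical", "choices": ["A", "B", "C"]},
}
synthetic = generate_dataset(n_rows=1000, schema=schema, seed=123)
```

Note that, as written, `generate_dataset` always writes a CSV as a side effect (`<filename>.csv` when `save` and `filename` are both given, otherwise `dataset.csv` in the working directory) and returns a pandas DataFrame regardless of any backend choice.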
statslibx/descriptive.py CHANGED
@@ -13,7 +13,6 @@ import plotly.express as px

 class DescriptiveStats:
     """
-    Clase para estadística descriptiva univariada y multivariada
     Class for univariate and multivariate descriptive statistics
     """

@@ -23,7 +22,7 @@ class DescriptiveStats:
                  thousand: str = None,
                  backend: Literal['pandas', 'polars'] = 'pandas'):
         """
-        # Inicialize DataFrame
+        # Initialize DataFrame

         ## **Parameters:**

@@ -54,7 +53,8 @@ class DescriptiveStats:
         if data.ndim == 1:
             data = pd.DataFrame({'var': data})
         else:
-            data = pd.DataFrame(data, columns=[f'var_{i}' for i in range(data.shape[1])])
+            data = pd.DataFrame(data, columns=[f'var_{i}' for i in range(data.shape[1])]) \
+                if isinstance(data, pd.DataFrame) else pl.DataFrame(data, )

         self.data = data
         self.backend = backend
@@ -394,7 +394,7 @@ class DescriptiveStats:



-    def help(self, lang="es-Es"):
+    def help(self, lang="es-ES"):
         """
         Muestra ayuda completa de la clase DescriptiveStats

@@ -407,7 +407,8 @@ class DescriptiveStats:
         if lang in ["en-US", "English", "english"]:
             lang = "en-US"
         else:
-            lang = ""
+            lang = "es-ES"
+            help_text = " "

         match lang:
             case "es-ES":
@@ -614,10 +615,9 @@ class DescriptiveStats:
 ╚════════════════════════════════════════════════════════════════════════════╝
 """
             case "en-US":
-                # --- Falta por traducir
                 help_text = """
 ╔════════════════════════════════════════════════════════════════════════════╗
-║ 📊 DescriptiveStats CLASS - COMPLETE HELP
+║ 📊 CLASS DescriptiveStats - COMPLETE HELP
 ╚════════════════════════════════════════════════════════════════════════════╝

 📝 DESCRIPTION:
@@ -821,8 +821,7 @@ class DescriptiveStats:

         print(help_text)

-
-
+
 class DescriptiveSummary:
     """Clase para formatear salida de estadística descriptiva"""

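The `help()` hunks above change the default language to "es-ES" and make unrecognized values fall back to the Spanish text; previously an unrecognized `lang` was reset to an empty string that matched no `case`, leaving `help_text` undefined. A hypothetical call sketch, with an illustrative DataFrame since the full constructor is not shown in this diff:

```python
import pandas as pd
from statslibx import DescriptiveStats

# Illustrative data; the constructor arguments beyond the DataFrame are not shown here.
stats = DescriptiveStats(pd.DataFrame({"x": [1.0, 2.0, 3.0]}))
stats.help()              # Spanish help text (the "es-ES" default)
stats.help(lang="en-US")  # English help text
```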