statslibx 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
statslibx/__init__.py CHANGED
"""
StatsLibx - Statistics library for Python
Author: Emmanuel Ascendra
Version: 0.1.8
"""

__version__ = "0.1.8"
__author__ = "Emmanuel Ascendra"

# Import the main classes
from .descriptive import DescriptiveStats, DescriptiveSummary
from .inferential import InferentialStats, TestResult
from .probability import ProbabilityStats
from .computacional import ComputacionalStats
from .utils import UtilsStats
from .preprocessing import Preprocessing
from .datasets import load_dataset, generate_dataset

# Names exposed by: from statslibx import *
__all__ = [
    # Main classes
    'DescriptiveStats',
    'DescriptiveSummary',  # fix: imported above but omitted from __all__ in 0.1.8
    'InferentialStats',
    'TestResult',          # fix: imported above but omitted from __all__ in 0.1.8
    'ProbabilityStats',
    'ComputacionalStats',
    'UtilsStats',
    'Preprocessing',
    'load_dataset',
    'generate_dataset'
]

# Welcome message (optional)
@@ -35,4 +39,7 @@ def welcome():
35
39
  print(f" - DescriptiveStats: Estadística descriptiva")
36
40
  print(f" - InferentialStats: Estadística inferencial")
37
41
  print(f" - UtilsStats: Utilidades Extras")
42
+ print(f"\nMódulos disponibles:")
43
+ print(f" - Datasets: Carga de Datasets")
44
+ print(f" - Preprocessing: Preprocesamiento de datos")
38
45
  print(f"\nPara más información: help(statslibx)")
statslibx/cli.py ADDED
import argparse

from statslibx.io import load_file
from statslibx.preprocessing import Preprocessing


def main():
    """Entry point for the ``statslibx`` command-line interface."""
    parser = argparse.ArgumentParser(
        prog="statslibx",
        description="Statslibx - Data analysis from terminal"
    )
    subparsers = parser.add_subparsers(dest="command")

    # describe / quality take a single positional file argument.
    for command in ("describe", "quality"):
        sub = subparsers.add_parser(command)
        sub.add_argument("file")

    # preview additionally accepts a row count.
    preview = subparsers.add_parser("preview")
    preview.add_argument("file")
    preview.add_argument("-n", "--rows", type=int, default=5)

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return

    pp = Preprocessing(load_file(args.file))

    # Dispatch table: each command maps to one Preprocessing call.
    handlers = {
        "describe": lambda: pp.describe_numeric(),
        "quality": lambda: pp.data_quality(),
        "preview": lambda: pp.preview_data(args.rows),
    }
    print(handlers[args.command]())


if __name__ == "__main__":
    main()
@@ -0,0 +1,2 @@
1
class ComputacionalStats:
    """Placeholder for computational-statistics utilities (no methods yet)."""
    pass
@@ -1,16 +1,244 @@
1
- import pandas as pd
2
- import pkgutil
1
+ from typing import Optional, Union, Literal, List, Tuple
3
2
  import io
3
+ import pkgutil
4
+
5
+ import pandas as pd
6
+ import polars as pl
7
+ import numpy as np
8
+ from numpy.typing import NDArray
9
+
10
+
11
+ _SUPPORTED_BACKENDS = ("pandas", "polars")
12
+
13
+
14
+ def _validate_columns(
15
+ df: Union[pd.DataFrame, pl.DataFrame],
16
+ X_columns: List[str],
17
+ y_column: str
18
+ ) -> None:
19
+ columns = set(df.columns)
20
+ missing = set(X_columns + [y_column]) - columns
21
+ if missing:
22
+ raise ValueError(f"Columnas no encontradas en el dataset: {missing}")
23
+
24
+
25
+ def _X_y(
26
+ df: Union[pd.DataFrame, pl.DataFrame],
27
+ X_columns: List[str],
28
+ y_column: str
29
+ ) -> Tuple[NDArray, NDArray]:
30
+ """
31
+ Extrae X e y como arrays numpy desde pandas o polars.
32
+ """
33
+ _validate_columns(df, X_columns, y_column)
4
34
 
5
- def load_dataset(name: str):
6
- """Carga un dataset interno del paquete.
7
- Datasets Disponibles:
35
+ if isinstance(df, pd.DataFrame):
36
+ X = df[X_columns].to_numpy()
37
+ y = df[y_column].to_numpy().ravel()
38
+ return X, y
39
+
40
+ elif isinstance(df, pl.DataFrame):
41
+ X = df.select(X_columns).to_numpy()
42
+ y = df.select(y_column).to_numpy().ravel()
43
+ return X, y
44
+
45
+ else:
46
+ raise TypeError(
47
+ "Backend no soportado. Use pandas.DataFrame o polars.DataFrame."
48
+ )
49
+
50
+
51
def load_dataset(
    name: str,
    backend: Literal["pandas", "polars"] = "pandas",
    return_X_y: Optional[Tuple[List[str], str]] = None
) -> Union[pd.DataFrame, pl.DataFrame, Tuple[NDArray, NDArray]]:
    """
    Load a dataset bundled with the package.

    Available datasets:
        - iris.csv
        - penguins.csv
        - sp500_companies.csv
        - titanic.csv
        - course_completion.csv

    Parameters
    ----------
    name : str
        CSV file name; if not bundled, it is treated as a path
        relative to the current working directory.
    backend : {'pandas', 'polars'}, default='pandas'
        DataFrame backend to use.
    return_X_y : tuple[list[str], str], optional
        When given, returns (X, y) as numpy arrays instead of a DataFrame.

    Returns
    -------
    DataFrame or (X, y)
    """
    supported = ("pandas", "polars")
    if backend not in supported:
        raise ValueError(
            f"Backend '{backend}' no soportado. "
            f"Use uno de {supported}."
        )

    use_pandas = backend == "pandas"

    # 1) Try the CSV shipped inside the package.
    raw = pkgutil.get_data("statslibx.datasets", name)
    if raw is not None:
        buffer = io.BytesIO(raw)
        df = pd.read_csv(buffer) if use_pandas else pl.read_csv(buffer)
    else:
        # 2) Fall back to the current working directory.
        try:
            df = pd.read_csv(name) if use_pandas else pl.read_csv(name)
        except FileNotFoundError:
            raise FileNotFoundError(
                f"Dataset '{name}' no encontrado "
                f"ni en statslibx.datasets ni en la ruta actual."
            )

    # 3) Optionally split into feature/target numpy arrays.
    if return_X_y is not None:
        X_columns, y_column = return_X_y
        return _X_y(df, X_columns, y_column)

    return df
115
+
116
+ # =========================
117
+ # Datasets específicos
118
+ # =========================
119
+
120
def load_iris(
    backend: Literal["pandas", "polars"] = "pandas",
    return_X_y: Optional[Tuple[List[str], str]] = None
):
    """Convenience wrapper: load the bundled ``iris.csv`` dataset."""
    return load_dataset("iris.csv", backend=backend, return_X_y=return_X_y)
129
+
130
+
131
def load_penguins(
    backend: Literal["pandas", "polars"] = "pandas",
    return_X_y: Optional[Tuple[List[str], str]] = None
):
    """Convenience wrapper: load the bundled ``penguins.csv`` dataset."""
    return load_dataset("penguins.csv", backend=backend, return_X_y=return_X_y)
140
+
141
+
142
def generate_dataset(n_rows, schema, seed=None, save: Optional[bool] = False, filename: Optional[str] = None):
    """
    Generate a synthetic dataset from a column schema.

    Parameters
    ----------
    n_rows : int
        Number of rows to generate.
    schema : dict
        Maps column name -> config dict. Each config requires a "dist" key
        ("normal", "uniform", "exponential", "lognormal", "poisson",
        "binomial" or "categorical") plus its distribution parameters, and
        may set "type" ("int"/"float", default "float") and "round"
        (decimal places; values are rounded to 2 decimals when unset).
    seed : int, optional
        Seed for numpy's global RNG. NOTE: when omitted, a fixed seed of 42
        is applied (kept for backward compatibility), so output is
        deterministic even without a seed.
    save : bool, optional
        When True, write the generated DataFrame to CSV.
        Fix: earlier versions wrote 'dataset.csv' even when save=False.
    filename : str, optional
        Output name used when save=True; '.csv' is appended if missing.
        Defaults to 'dataset.csv'. Fix: the output path f-string was
        corrupted ('(unknown).csv') and ignored this argument.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    TypeError
        If seed is neither an int nor None, or schema is not a dict.
    ValueError
        For a missing 'dist', an unsupported distribution or type, or a
        categorical column without 'choices'.
    """
    if seed is not None:
        if not isinstance(seed, int):
            raise TypeError("seed debe ser un entero o None")
        np.random.seed(seed)
    else:
        # Backward-compatible default: deterministic output without a seed.
        np.random.seed(42)

    if not isinstance(schema, dict):
        raise TypeError("schema debe ser un diccionario")

    data = {}

    for col, config in schema.items():
        if "dist" not in config:
            raise ValueError(f"La columna '{col}' no tiene 'dist' definido")

        dist = config["dist"]
        dtype = config.get("type", "float")
        nround = config.get("round", 0)

        # ---------- sampling ----------
        if dist == "categorical":
            if "choices" not in config:
                raise ValueError(f"'choices' es requerido para categorical ({col})")
            # Categorical columns skip the cast/round steps below.
            data[col] = np.random.choice(config["choices"], size=n_rows)
            continue

        if dist == "normal":
            values = np.random.normal(
                loc=config.get("mean", 0),
                scale=config.get("std", 1),
                size=n_rows,
            )
        elif dist == "uniform":
            values = np.random.uniform(
                low=config.get("low", 0),
                high=config.get("high", 1),
                size=n_rows,
            )
        elif dist == "exponential":
            values = np.random.exponential(scale=config.get("scale", 1), size=n_rows)
        elif dist == "lognormal":
            values = np.random.lognormal(
                mean=config.get("mean", 0),
                sigma=config.get("std", 1),
                size=n_rows,
            )
        elif dist == "poisson":
            values = np.random.poisson(lam=config.get("lam", 1), size=n_rows)
        elif dist == "binomial":
            values = np.random.binomial(
                n=config.get("n", 1),
                p=config.get("p", 0.5),
                size=n_rows,
            )
        else:
            raise ValueError(f"Distribución no soportada: {dist}")

        # ---------- dtype cast ----------
        if dtype == "int":
            values = np.round(values).astype(int)
        elif dtype == "float":
            values = values.astype(float)
        else:
            raise ValueError(f"Tipo no soportado: {dtype}")

        # ---------- rounding (default: 2 decimals) ----------
        data[col] = np.round(values, nround if nround > 0 else 2)

    df = pd.DataFrame(data)

    # Fix: honor the save flag. Previously a CSV was written unconditionally
    # and the save path was the corrupted literal f"(unknown).csv".
    if save:
        target = filename or "dataset.csv"
        if not target.endswith(".csv"):
            target += ".csv"
        df.to_csv(target, index=False)

    return df