statslibx 0.2.2__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {statslibx-0.2.2/statslibx.egg-info → statslibx-0.2.4}/PKG-INFO +12 -1
- {statslibx-0.2.2 → statslibx-0.2.4}/README.md +3 -0
- {statslibx-0.2.2 → statslibx-0.2.4}/pyproject.toml +10 -2
- {statslibx-0.2.2 → statslibx-0.2.4}/statslibx/__init__.py +2 -2
- {statslibx-0.2.2 → statslibx-0.2.4}/statslibx/cli.py +4 -3
- {statslibx-0.2.2 → statslibx-0.2.4}/statslibx/datasets/__init__.py +28 -38
- {statslibx-0.2.2 → statslibx-0.2.4}/statslibx/descriptive.py +3 -14
- {statslibx-0.2.2 → statslibx-0.2.4}/statslibx/inferential.py +3 -5
- {statslibx-0.2.2 → statslibx-0.2.4}/statslibx/preprocessing/__init__.py +106 -5
- {statslibx-0.2.2 → statslibx-0.2.4/statslibx.egg-info}/PKG-INFO +12 -1
- statslibx-0.2.4/statslibx.egg-info/requires.txt +17 -0
- {statslibx-0.2.2 → statslibx-0.2.4}/statslibx.egg-info/top_level.txt +1 -0
- statslibx-0.2.4/tests/test1.py +30 -0
- statslibx-0.2.2/statslibx.egg-info/requires.txt +0 -9
- statslibx-0.2.2/tests/test1.py +0 -20
- {statslibx-0.2.2 → statslibx-0.2.4}/MANIFEST.in +0 -0
- {statslibx-0.2.2 → statslibx-0.2.4}/setup.cfg +0 -0
- {statslibx-0.2.2 → statslibx-0.2.4}/statslibx/computacional.py +0 -0
- {statslibx-0.2.2 → statslibx-0.2.4}/statslibx/datasets/Cocoa_Bubbles_Investment_Nigeria_Ghana_1980_2023.xlsx +0 -0
- {statslibx-0.2.2 → statslibx-0.2.4}/statslibx/datasets/course_completion.csv +0 -0
- {statslibx-0.2.2 → statslibx-0.2.4}/statslibx/datasets/iris.csv +0 -0
- {statslibx-0.2.2 → statslibx-0.2.4}/statslibx/datasets/penguins.csv +0 -0
- {statslibx-0.2.2 → statslibx-0.2.4}/statslibx/datasets/sp500_companies.csv +0 -0
- {statslibx-0.2.2 → statslibx-0.2.4}/statslibx/datasets/titanic.csv +0 -0
- {statslibx-0.2.2 → statslibx-0.2.4}/statslibx/utils.py +0 -0
- {statslibx-0.2.2 → statslibx-0.2.4}/statslibx.egg-info/SOURCES.txt +0 -0
- {statslibx-0.2.2 → statslibx-0.2.4}/statslibx.egg-info/dependency_links.txt +0 -0
- {statslibx-0.2.2 → statslibx-0.2.4}/statslibx.egg-info/entry_points.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: statslibx
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: StatsLibx - Librería de estadística descriptiva, inferencial y computacional
|
|
5
5
|
Author-email: Emmanuel Ascendra Perez <ascendraemmanuel@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -16,6 +16,14 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
16
16
|
Requires-Python: >=3.8
|
|
17
17
|
Description-Content-Type: text/markdown
|
|
18
18
|
Requires-Dist: pandas>=1.5
|
|
19
|
+
Requires-Dist: matplotlib>=3.5
|
|
20
|
+
Requires-Dist: numpy>=1.23
|
|
21
|
+
Requires-Dist: scipy>=1.9
|
|
22
|
+
Requires-Dist: polars>=0.16
|
|
23
|
+
Requires-Dist: scikit-learn>=1.0
|
|
24
|
+
Requires-Dist: statsmodels>=0.13
|
|
25
|
+
Requires-Dist: seaborn>=0.11
|
|
26
|
+
Requires-Dist: plotly>=5.0
|
|
19
27
|
Provides-Extra: viz
|
|
20
28
|
Requires-Dist: seaborn>=0.11; extra == "viz"
|
|
21
29
|
Requires-Dist: plotly>=5.0; extra == "viz"
|
|
@@ -29,6 +37,8 @@ StatsLibX es un paquete de Python diseñado para proporcionar una solución senc
|
|
|
29
37
|
|
|
30
38
|
Este proyecto surge con la idea de ofrecer una alternativa moderna, intuitiva y ligera que permita a desarrolladores y entusiastas integrar la **estadistica descriptiva, inferencial y computacional (En desarrollo)** sin complicaciones, con multiples funcionalidades y utilidades pensadas para el futuro.
|
|
31
39
|
|
|
40
|
+
Pagina Web: [StatsLibX](https://ghostanalyst30.github.io/StatsLibX/Documentation_Page/index.html)
|
|
41
|
+
|
|
32
42
|
GitHub del Proyecto: [https://github.com/GhostAnalyst30/StatsLibX](https://github.com/GhostAnalyst30/StatsLibX)
|
|
33
43
|
|
|
34
44
|
## ✨ Características principales
|
|
@@ -63,6 +73,7 @@ pip install statslibx
|
|
|
63
73
|
|
|
64
74
|
## 👩💻 ¡Usalo en la terminal! (De forma preliminar)
|
|
65
75
|
```bash
|
|
76
|
+
statslibx # Informacion general de la libreria
|
|
66
77
|
statslibx describe .\archive.csv # Devuelve una descripcion de la data
|
|
67
78
|
statslibx quality .\archive.csv # Devuelve la calidad de los datos
|
|
68
79
|
statslibx preview .\archive.csv # Devuelve una visualizacion de los datos
|
|
@@ -4,6 +4,8 @@ StatsLibX es un paquete de Python diseñado para proporcionar una solución senc
|
|
|
4
4
|
|
|
5
5
|
Este proyecto surge con la idea de ofrecer una alternativa moderna, intuitiva y ligera que permita a desarrolladores y entusiastas integrar la **estadistica descriptiva, inferencial y computacional (En desarrollo)** sin complicaciones, con multiples funcionalidades y utilidades pensadas para el futuro.
|
|
6
6
|
|
|
7
|
+
Pagina Web: [StatsLibX](https://ghostanalyst30.github.io/StatsLibX/Documentation_Page/index.html)
|
|
8
|
+
|
|
7
9
|
GitHub del Proyecto: [https://github.com/GhostAnalyst30/StatsLibX](https://github.com/GhostAnalyst30/StatsLibX)
|
|
8
10
|
|
|
9
11
|
## ✨ Características principales
|
|
@@ -38,6 +40,7 @@ pip install statslibx
|
|
|
38
40
|
|
|
39
41
|
## 👩💻 ¡Usalo en la terminal! (De forma preliminar)
|
|
40
42
|
```bash
|
|
43
|
+
statslibx # Informacion general de la libreria
|
|
41
44
|
statslibx describe .\archive.csv # Devuelve una descripcion de la data
|
|
42
45
|
statslibx quality .\archive.csv # Devuelve la calidad de los datos
|
|
43
46
|
statslibx preview .\archive.csv # Devuelve una visualizacion de los datos
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "statslibx"
|
|
7
|
-
version = "0.2.2"
|
|
7
|
+
version = "0.2.4"
|
|
8
8
|
description = "StatsLibx - Librería de estadística descriptiva, inferencial y computacional"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -27,7 +27,15 @@ classifiers = [
|
|
|
27
27
|
]
|
|
28
28
|
|
|
29
29
|
dependencies = [
|
|
30
|
-
"pandas>=1.5"
|
|
30
|
+
"pandas>=1.5",
|
|
31
|
+
"matplotlib>=3.5",
|
|
32
|
+
"numpy>=1.23",
|
|
33
|
+
"scipy>=1.9",
|
|
34
|
+
"polars>=0.16",
|
|
35
|
+
"scikit-learn>=1.0",
|
|
36
|
+
"statsmodels>=0.13",
|
|
37
|
+
"seaborn>=0.11",
|
|
38
|
+
"plotly>=5.0"
|
|
31
39
|
]
|
|
32
40
|
|
|
33
41
|
[project.optional-dependencies]
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import argparse
|
|
2
|
-
|
|
2
|
+
import statslibx as slx
|
|
3
|
+
from statslibx.datasets import load_dataset
|
|
3
4
|
from statslibx.preprocessing import Preprocessing
|
|
4
5
|
|
|
5
6
|
|
|
@@ -27,10 +28,10 @@ def main():
|
|
|
27
28
|
args = parser.parse_args()
|
|
28
29
|
|
|
29
30
|
if not args.command:
|
|
30
|
-
|
|
31
|
+
print(slx.welcome())
|
|
31
32
|
return
|
|
32
33
|
|
|
33
|
-
df =
|
|
34
|
+
df = load_dataset(args.file)
|
|
34
35
|
pp = Preprocessing(df)
|
|
35
36
|
|
|
36
37
|
if args.command == "describe":
|
|
@@ -3,7 +3,6 @@ import io
|
|
|
3
3
|
import pkgutil
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
import pandas as pd
|
|
6
|
-
import polars as pl
|
|
7
6
|
import numpy as np
|
|
8
7
|
from numpy.typing import NDArray
|
|
9
8
|
|
|
@@ -12,10 +11,10 @@ _SUPPORTED_BACKENDS = ("pandas", "polars")
|
|
|
12
11
|
|
|
13
12
|
|
|
14
13
|
def _validate_columns(
|
|
15
|
-
df:
|
|
16
|
-
X_columns: List[str],
|
|
17
|
-
y_column: str
|
|
18
|
-
) -> None:
|
|
14
|
+
df: pd.DataFrame, # 输入的数据框,可以是pandas或polars DataFrame
|
|
15
|
+
X_columns: List[str], # 特征列名列表
|
|
16
|
+
y_column: str # 目标列名
|
|
17
|
+
) -> None: # 无返回值,函数仅用于验证
|
|
19
18
|
columns = set(df.columns)
|
|
20
19
|
missing = set(X_columns + [y_column]) - columns
|
|
21
20
|
if missing:
|
|
@@ -23,7 +22,7 @@ def _validate_columns(
|
|
|
23
22
|
|
|
24
23
|
|
|
25
24
|
def _X_y(
|
|
26
|
-
df:
|
|
25
|
+
df: pd.DataFrame,
|
|
27
26
|
X_columns: List[str],
|
|
28
27
|
y_column: str
|
|
29
28
|
) -> Tuple[NDArray, NDArray]:
|
|
@@ -37,25 +36,19 @@ def _X_y(
|
|
|
37
36
|
y = df[y_column].to_numpy().ravel()
|
|
38
37
|
return X, y
|
|
39
38
|
|
|
40
|
-
elif isinstance(df, pl.DataFrame):
|
|
41
|
-
X = df.select(X_columns).to_numpy()
|
|
42
|
-
y = df.select(y_column).to_numpy().ravel()
|
|
43
|
-
return X, y
|
|
44
|
-
|
|
45
39
|
else:
|
|
46
40
|
raise TypeError(
|
|
47
|
-
"Backend no soportado. Use pandas.DataFrame
|
|
41
|
+
"Backend no soportado. Use pandas.DataFrame"
|
|
48
42
|
)
|
|
49
43
|
|
|
50
44
|
|
|
51
45
|
import io
|
|
52
46
|
import pkgutil
|
|
53
47
|
import pandas as pd
|
|
54
|
-
import polars as pl
|
|
55
48
|
from typing import Literal, Optional, Tuple, List, Union
|
|
56
49
|
from numpy.typing import NDArray
|
|
57
50
|
|
|
58
|
-
_SUPPORTED_BACKENDS = {"pandas", "polars"}
|
|
51
|
+
_SUPPORTED_BACKENDS = {"pandas"}
|
|
59
52
|
_SUPPORTED_EXTENSIONS = {".csv", ".parquet", ".xlsx", ".xls", ".json"}
|
|
60
53
|
|
|
61
54
|
def _read_file(
|
|
@@ -73,23 +66,15 @@ def _read_file(
|
|
|
73
66
|
return pd.read_excel(buffer_or_path)
|
|
74
67
|
if ext == ".json":
|
|
75
68
|
return pd.read_json(buffer_or_path)
|
|
76
|
-
else: # polars
|
|
77
|
-
if ext == ".csv":
|
|
78
|
-
return pl.read_csv(buffer_or_path)
|
|
79
|
-
if ext == ".parquet":
|
|
80
|
-
return pl.read_parquet(buffer_or_path)
|
|
81
|
-
if ext == ".json":
|
|
82
|
-
return pl.read_json(buffer_or_path)
|
|
83
69
|
|
|
84
70
|
raise ValueError(f"Extensión '{ext}' no soportada para backend '{backend}'.")
|
|
85
71
|
|
|
86
|
-
|
|
87
72
|
def load_dataset(
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
) -> Union[pd.DataFrame,
|
|
73
|
+
name: str,
|
|
74
|
+
backend: str = "pandas",
|
|
75
|
+
return_X_y: Optional[Tuple[List[str], str]] = None,
|
|
76
|
+
sep: str = ","
|
|
77
|
+
) -> Union[pd.DataFrame, Tuple[NDArray, NDArray]]:
|
|
93
78
|
"""
|
|
94
79
|
Carga un dataset interno del paquete.
|
|
95
80
|
|
|
@@ -99,6 +84,7 @@ def load_dataset(
|
|
|
99
84
|
- sp500_companies.csv
|
|
100
85
|
- titanic.csv
|
|
101
86
|
- course_completion.csv
|
|
87
|
+
- Cocoa_Bubbles_Investment_Nigeria_Ghana_1980_2023.xlsx
|
|
102
88
|
|
|
103
89
|
Parámetros
|
|
104
90
|
----------
|
|
@@ -120,7 +106,10 @@ def load_dataset(
|
|
|
120
106
|
f"Use uno de {_SUPPORTED_BACKENDS}."
|
|
121
107
|
)
|
|
122
108
|
|
|
123
|
-
|
|
109
|
+
path = Path(name)
|
|
110
|
+
resource_name = path.name
|
|
111
|
+
ext = path.suffix.lower()
|
|
112
|
+
|
|
124
113
|
|
|
125
114
|
if ext not in _SUPPORTED_EXTENSIONS:
|
|
126
115
|
raise ValueError(
|
|
@@ -130,26 +119,26 @@ def load_dataset(
|
|
|
130
119
|
|
|
131
120
|
df = None
|
|
132
121
|
|
|
133
|
-
#
|
|
122
|
+
# 1️⃣ Intentar cargar desde el paquete
|
|
134
123
|
try:
|
|
135
|
-
data_bytes = pkgutil.get_data("statslibx.datasets",
|
|
124
|
+
data_bytes = pkgutil.get_data("statslibx.datasets", resource_name)
|
|
125
|
+
|
|
136
126
|
if data_bytes is not None:
|
|
137
127
|
buffer = io.BytesIO(data_bytes)
|
|
138
128
|
df = _read_file(buffer, ext, backend, sep)
|
|
139
129
|
except FileNotFoundError:
|
|
140
130
|
pass
|
|
141
131
|
|
|
142
|
-
#
|
|
132
|
+
# 2️⃣ Intentar cargar desde ruta local
|
|
143
133
|
if df is None:
|
|
144
|
-
|
|
145
|
-
df = _read_file(name, ext, backend, sep)
|
|
146
|
-
except FileNotFoundError:
|
|
134
|
+
if not path.exists():
|
|
147
135
|
raise FileNotFoundError(
|
|
148
136
|
f"Dataset '{name}' no encontrado "
|
|
149
|
-
f"ni en statslibx.datasets ni en la ruta
|
|
137
|
+
f"ni en statslibx.datasets ni en la ruta local."
|
|
150
138
|
)
|
|
139
|
+
df = _read_file(path, ext, backend, sep)
|
|
151
140
|
|
|
152
|
-
#
|
|
141
|
+
# 3️⃣ Devolver X, y si se solicita
|
|
153
142
|
if return_X_y is not None:
|
|
154
143
|
X_columns, y_column = return_X_y
|
|
155
144
|
return _X_y(df, X_columns, y_column)
|
|
@@ -157,12 +146,13 @@ def load_dataset(
|
|
|
157
146
|
return df
|
|
158
147
|
|
|
159
148
|
|
|
149
|
+
|
|
160
150
|
# =========================
|
|
161
151
|
# Datasets específicos
|
|
162
152
|
# =========================
|
|
163
153
|
|
|
164
154
|
def load_iris(
|
|
165
|
-
backend:
|
|
155
|
+
backend: str = "pandas",
|
|
166
156
|
return_X_y: Optional[Tuple[List[str], str]] = None
|
|
167
157
|
):
|
|
168
158
|
return load_dataset(
|
|
@@ -173,7 +163,7 @@ def load_iris(
|
|
|
173
163
|
|
|
174
164
|
|
|
175
165
|
def load_penguins(
|
|
176
|
-
backend:
|
|
166
|
+
backend: str = "pandas",
|
|
177
167
|
return_X_y: Optional[Tuple[List[str], str]] = None
|
|
178
168
|
):
|
|
179
169
|
return load_dataset(
|
|
@@ -1,13 +1,9 @@
|
|
|
1
1
|
import numpy as np
|
|
2
2
|
import pandas as pd
|
|
3
|
-
import polars as pl
|
|
4
3
|
from typing import Optional, Union, Literal, List
|
|
5
4
|
from datetime import datetime
|
|
6
|
-
import os
|
|
7
5
|
import matplotlib.pyplot as plt
|
|
8
6
|
import seaborn as sns
|
|
9
|
-
import io
|
|
10
|
-
import base64
|
|
11
7
|
import plotly.express as px
|
|
12
8
|
|
|
13
9
|
class DescriptiveStats:
|
|
@@ -104,16 +100,9 @@ class DescriptiveStats:
|
|
|
104
100
|
raise TypeError(
|
|
105
101
|
"Data must be a pandas.DataFrame or numpy.ndarray."
|
|
106
102
|
)
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
data = pd.DataFrame({'var': data})
|
|
111
|
-
else:
|
|
112
|
-
data = pd.DataFrame(data, columns=[f'var_{i}' for i in range(data.shape[1])],
|
|
113
|
-
sep=self.sep) \
|
|
114
|
-
if isinstance(data, pd.DataFrame) else pl.DataFrame(data, )
|
|
115
|
-
|
|
116
|
-
self._numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
|
|
103
|
+
|
|
104
|
+
self._numeric_cols = self.data.select_dtypes(include=["number"]).columns.tolist()
|
|
105
|
+
self._categorical_cols = self.data.select_dtypes(include=["object", "category"]).columns.tolist()
|
|
117
106
|
self.lang = lang
|
|
118
107
|
|
|
119
108
|
|
|
@@ -1,11 +1,9 @@
|
|
|
1
1
|
from dataclasses import dataclass
|
|
2
2
|
import numpy as np
|
|
3
3
|
import pandas as pd
|
|
4
|
-
import
|
|
5
|
-
from typing import Optional, Union, Literal, List, Dict, Any, Tuple
|
|
4
|
+
from typing import Union, Literal, Dict, Any, Tuple
|
|
6
5
|
from datetime import datetime
|
|
7
6
|
from scipy import stats
|
|
8
|
-
import os
|
|
9
7
|
|
|
10
8
|
class InferentialStats:
|
|
11
9
|
"""
|
|
@@ -94,8 +92,8 @@ class InferentialStats:
|
|
|
94
92
|
else:
|
|
95
93
|
data = pd.DataFrame(data, columns=[f'var_{i}' for i in range(data.shape[1])])
|
|
96
94
|
|
|
97
|
-
self.
|
|
98
|
-
self.
|
|
95
|
+
self._numeric_cols = data.select_dtypes(include=["number"]).columns.tolist()
|
|
96
|
+
self._categorical_cols = self.data.select_dtypes(include=["object", "category"]).columns.tolist()
|
|
99
97
|
self.lang = lang
|
|
100
98
|
|
|
101
99
|
# ============= INTERVALOS DE CONFIANZA =============
|
|
@@ -10,6 +10,7 @@ class Preprocessing:
|
|
|
10
10
|
if not isinstance(data, (pd.DataFrame, pl.DataFrame)):
|
|
11
11
|
raise TypeError("data must be a pandas or polars DataFrame")
|
|
12
12
|
self.data = data
|
|
13
|
+
self.columns = list(self.data.columns)
|
|
13
14
|
|
|
14
15
|
# ------------------------------------------------------------------
|
|
15
16
|
# Internal helpers
|
|
@@ -27,11 +28,11 @@ class Preprocessing:
|
|
|
27
28
|
return int(self.data[column].null_count())
|
|
28
29
|
|
|
29
30
|
def _get_columns(self, columns):
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
31
|
+
if columns is None:
|
|
32
|
+
return list(self.data.columns)
|
|
33
|
+
if isinstance(columns, str):
|
|
34
|
+
return [columns]
|
|
35
|
+
return columns
|
|
35
36
|
|
|
36
37
|
# ------------------------------------------------------------------
|
|
37
38
|
# Inspection
|
|
@@ -226,3 +227,103 @@ class Preprocessing:
|
|
|
226
227
|
|
|
227
228
|
return pd.DataFrame(rows)
|
|
228
229
|
|
|
230
|
+
def change_dtypes(
|
|
231
|
+
self,
|
|
232
|
+
columns: Union[List[str], str, None] = None,
|
|
233
|
+
from_type: Optional[str] = None,
|
|
234
|
+
to_type: Optional[str] = None
|
|
235
|
+
) -> pd.DataFrame:
|
|
236
|
+
|
|
237
|
+
data = self.data
|
|
238
|
+
|
|
239
|
+
TYPE_MAP = {
|
|
240
|
+
"string": "string",
|
|
241
|
+
"object": "object",
|
|
242
|
+
"int": "int64",
|
|
243
|
+
"float": "float64",
|
|
244
|
+
"int64": "int64",
|
|
245
|
+
"float64": "float64",
|
|
246
|
+
"number": "float64"
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
if columns is None:
|
|
250
|
+
columns = list(data.columns)
|
|
251
|
+
elif isinstance(columns, str):
|
|
252
|
+
columns = [columns]
|
|
253
|
+
|
|
254
|
+
if to_type and to_type not in TYPE_MAP:
|
|
255
|
+
raise ValueError(f"Unsupported to_type: {to_type}")
|
|
256
|
+
|
|
257
|
+
if self._is_pandas():
|
|
258
|
+
|
|
259
|
+
for col in columns:
|
|
260
|
+
|
|
261
|
+
if col not in data.columns:
|
|
262
|
+
print(f"Column '{col}' does not exist in the DataFrame")
|
|
263
|
+
return
|
|
264
|
+
|
|
265
|
+
if from_type is not None:
|
|
266
|
+
current_type = str(data[col].dtype)
|
|
267
|
+
|
|
268
|
+
if from_type not in current_type:
|
|
269
|
+
continue
|
|
270
|
+
|
|
271
|
+
if to_type is not None:
|
|
272
|
+
try:
|
|
273
|
+
|
|
274
|
+
if to_type in ["int", "float", "number"]:
|
|
275
|
+
data[col] = pd.to_numeric(data[col], errors="raise")
|
|
276
|
+
|
|
277
|
+
if to_type == "int":
|
|
278
|
+
data[col] = data[col].astype("int64")
|
|
279
|
+
|
|
280
|
+
elif to_type == "string":
|
|
281
|
+
data[col] = data[col].astype("string")
|
|
282
|
+
|
|
283
|
+
elif to_type == "object":
|
|
284
|
+
data[col] = data[col].astype("object")
|
|
285
|
+
|
|
286
|
+
else:
|
|
287
|
+
data[col] = data[col].astype(TYPE_MAP[to_type])
|
|
288
|
+
|
|
289
|
+
except Exception:
|
|
290
|
+
print(f"Cannot convert column '{col}' to {to_type}")
|
|
291
|
+
|
|
292
|
+
return data
|
|
293
|
+
|
|
294
|
+
def clean_data(
|
|
295
|
+
self,
|
|
296
|
+
# 🔍 Missing values
|
|
297
|
+
handle_missing: bool = False,
|
|
298
|
+
missing_strategy: str = "mean", # mean, median, mode, drop, constant
|
|
299
|
+
fill_value=None,
|
|
300
|
+
|
|
301
|
+
# 🧹 Duplicados
|
|
302
|
+
remove_duplicates: bool = False,
|
|
303
|
+
|
|
304
|
+
# 📊 Tipos de datos
|
|
305
|
+
convert_dtypes: bool = False,
|
|
306
|
+
|
|
307
|
+
# 🚨 Outliers
|
|
308
|
+
detect_outliers: bool = False,
|
|
309
|
+
remove_outliers: bool = False,
|
|
310
|
+
outlier_method: str = "iqr", # iqr, zscore
|
|
311
|
+
z_thresh: float = 3.0,
|
|
312
|
+
|
|
313
|
+
# 📏 Escalado / Normalización
|
|
314
|
+
scale: bool = False,
|
|
315
|
+
scaling_method: str = "standard", # standard, minmax, robust
|
|
316
|
+
|
|
317
|
+
# 🔢 Transformaciones
|
|
318
|
+
log_transform: bool = False,
|
|
319
|
+
sqrt_transform: bool = False,
|
|
320
|
+
|
|
321
|
+
# 🧱 Columnas
|
|
322
|
+
drop_columns: list = None,
|
|
323
|
+
keep_columns: list = None,
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
# 🧪 Analisis
|
|
327
|
+
analizer: bool = True,
|
|
328
|
+
text_analizer: bool = False) -> pd.DataFrame | str:
|
|
329
|
+
pass
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: statslibx
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: StatsLibx - Librería de estadística descriptiva, inferencial y computacional
|
|
5
5
|
Author-email: Emmanuel Ascendra Perez <ascendraemmanuel@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -16,6 +16,14 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
16
16
|
Requires-Python: >=3.8
|
|
17
17
|
Description-Content-Type: text/markdown
|
|
18
18
|
Requires-Dist: pandas>=1.5
|
|
19
|
+
Requires-Dist: matplotlib>=3.5
|
|
20
|
+
Requires-Dist: numpy>=1.23
|
|
21
|
+
Requires-Dist: scipy>=1.9
|
|
22
|
+
Requires-Dist: polars>=0.16
|
|
23
|
+
Requires-Dist: scikit-learn>=1.0
|
|
24
|
+
Requires-Dist: statsmodels>=0.13
|
|
25
|
+
Requires-Dist: seaborn>=0.11
|
|
26
|
+
Requires-Dist: plotly>=5.0
|
|
19
27
|
Provides-Extra: viz
|
|
20
28
|
Requires-Dist: seaborn>=0.11; extra == "viz"
|
|
21
29
|
Requires-Dist: plotly>=5.0; extra == "viz"
|
|
@@ -29,6 +37,8 @@ StatsLibX es un paquete de Python diseñado para proporcionar una solución senc
|
|
|
29
37
|
|
|
30
38
|
Este proyecto surge con la idea de ofrecer una alternativa moderna, intuitiva y ligera que permita a desarrolladores y entusiastas integrar la **estadistica descriptiva, inferencial y computacional (En desarrollo)** sin complicaciones, con multiples funcionalidades y utilidades pensadas para el futuro.
|
|
31
39
|
|
|
40
|
+
Pagina Web: [StatsLibX](https://ghostanalyst30.github.io/StatsLibX/Documentation_Page/index.html)
|
|
41
|
+
|
|
32
42
|
GitHub del Proyecto: [https://github.com/GhostAnalyst30/StatsLibX](https://github.com/GhostAnalyst30/StatsLibX)
|
|
33
43
|
|
|
34
44
|
## ✨ Características principales
|
|
@@ -63,6 +73,7 @@ pip install statslibx
|
|
|
63
73
|
|
|
64
74
|
## 👩💻 ¡Usalo en la terminal! (De forma preliminar)
|
|
65
75
|
```bash
|
|
76
|
+
statslibx # Informacion general de la libreria
|
|
66
77
|
statslibx describe .\archive.csv # Devuelve una descripcion de la data
|
|
67
78
|
statslibx quality .\archive.csv # Devuelve la calidad de los datos
|
|
68
79
|
statslibx preview .\archive.csv # Devuelve una visualizacion de los datos
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from statslibx import load_dataset, DescriptiveStats, InferentialStats
|
|
2
|
+
import pandas as pd
|
|
3
|
+
# df = pd.read_csv(r"tests\bank (1).csv", sep=";")
|
|
4
|
+
|
|
5
|
+
# df = load_dataset(r"tests\bank (1).csv", sep=";")
|
|
6
|
+
# stats = DescriptiveStats(df)
|
|
7
|
+
# print(stats.data)
|
|
8
|
+
|
|
9
|
+
# infer = InferentialStats(df)
|
|
10
|
+
# print(infer.data)
|
|
11
|
+
|
|
12
|
+
# df = load_dataset(r"statslibx\datasets\Cocoa_Bubbles_Investment_Nigeria_Ghana_1980_2023.xlsx")
|
|
13
|
+
|
|
14
|
+
# ds = DescriptiveStats(df)
|
|
15
|
+
|
|
16
|
+
# print(ds.data)
|
|
17
|
+
|
|
18
|
+
import statslibx as slx
|
|
19
|
+
|
|
20
|
+
df = slx.datasets.load_penguins()
|
|
21
|
+
infer = slx.InferentialStats(df)
|
|
22
|
+
|
|
23
|
+
# Confidence Interval + Point Estimate
|
|
24
|
+
print(infer.confidence_interval(
|
|
25
|
+
column="bill_length_mm",
|
|
26
|
+
statistic="mean"
|
|
27
|
+
))
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
|
statslibx-0.2.2/tests/test1.py
DELETED
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
from statslibx import load_dataset, DescriptiveStats, InferentialStats
|
|
2
|
-
import pandas as pd
|
|
3
|
-
# df = pd.read_csv(r"tests\bank (1).csv", sep=";")
|
|
4
|
-
|
|
5
|
-
# df = load_dataset(r"tests\bank (1).csv", sep=";")
|
|
6
|
-
# stats = DescriptiveStats(df)
|
|
7
|
-
# print(stats.data)
|
|
8
|
-
|
|
9
|
-
# infer = InferentialStats(df)
|
|
10
|
-
# print(infer.data)
|
|
11
|
-
|
|
12
|
-
df = load_dataset(r"statslibx\datasets\WHR25_Data_Figure_2.1.xlsx")
|
|
13
|
-
|
|
14
|
-
ds = DescriptiveStats(df)
|
|
15
|
-
|
|
16
|
-
print(ds.data)
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|