dscience-tools 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dscience_tools-1.0.0/LICENSE +21 -0
- dscience_tools-1.0.0/MANIFEST.in +4 -0
- dscience_tools-1.0.0/PKG-INFO +195 -0
- dscience_tools-1.0.0/README.md +163 -0
- dscience_tools-1.0.0/pyproject.toml +52 -0
- dscience_tools-1.0.0/setup.cfg +46 -0
- dscience_tools-1.0.0/src/dscience_tools.egg-info/PKG-INFO +195 -0
- dscience_tools-1.0.0/src/dscience_tools.egg-info/SOURCES.txt +34 -0
- dscience_tools-1.0.0/src/dscience_tools.egg-info/dependency_links.txt +1 -0
- dscience_tools-1.0.0/src/dscience_tools.egg-info/requires.txt +2 -0
- dscience_tools-1.0.0/src/dscience_tools.egg-info/top_level.txt +1 -0
- dscience_tools-1.0.0/tests/test_add_missing.py +57 -0
- dscience_tools-1.0.0/tests/test_alphanum.py +41 -0
- dscience_tools-1.0.0/tests/test_category_stats.py +38 -0
- dscience_tools-1.0.0/tests/test_chatterjee.py +47 -0
- dscience_tools-1.0.0/tests/test_check_ninf.py +29 -0
- dscience_tools-1.0.0/tests/test_compute_metrics.py +34 -0
- dscience_tools-1.0.0/tests/test_corr_matrix.py +43 -0
- dscience_tools-1.0.0/tests/test_describe_cat.py +69 -0
- dscience_tools-1.0.0/tests/test_describe_num.py +63 -0
- dscience_tools-1.0.0/tests/test_df_stats.py +65 -0
- dscience_tools-1.0.0/tests/test_entropy.py +43 -0
- dscience_tools-1.0.0/tests/test_evaluate_cls.py +38 -0
- dscience_tools-1.0.0/tests/test_generate_dist.py +52 -0
- dscience_tools-1.0.0/tests/test_generate_from_metrics.py +87 -0
- dscience_tools-1.0.0/tests/test_grubbs.py +52 -0
- dscience_tools-1.0.0/tests/test_kl_divergence.py +44 -0
- dscience_tools-1.0.0/tests/test_labeling.py +59 -0
- dscience_tools-1.0.0/tests/test_min_max.py +93 -0
- dscience_tools-1.0.0/tests/test_normality.py +56 -0
- dscience_tools-1.0.0/tests/test_outliers.py +99 -0
- dscience_tools-1.0.0/tests/test_plot_cm.py +63 -0
- dscience_tools-1.0.0/tests/test_stationarity.py +65 -0
- dscience_tools-1.0.0/tests/test_trials_res_df.py +65 -0
- dscience_tools-1.0.0/tests/test_zip_io.py +65 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Sergii Kavun
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dscience_tools
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: DSTools: Data Science Tools Library
|
|
5
|
+
Home-page: https://github.com/s-kav/ds_tools
|
|
6
|
+
Author: Sergii Kavun
|
|
7
|
+
Author-email: Sergii Kavun <kavserg@gmail.com>
|
|
8
|
+
Maintainer-email: Sergii Kavun <kavserg@gmail.com>
|
|
9
|
+
License: MIT License
|
|
10
|
+
Project-URL: Homepage, https://github.com/s-kav/ds_tools
|
|
11
|
+
Project-URL: Documentation, https://github.com/s-kav/ds_tools
|
|
12
|
+
Project-URL: Repository, https://github.com/s-kav/ds_tools.git
|
|
13
|
+
Project-URL: Source Code, https://github.com/s-kav/ds_tools/src
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Information Technology
|
|
16
|
+
Classifier: Intended Audience :: Science/Research
|
|
17
|
+
Classifier: Natural Language :: English
|
|
18
|
+
Classifier: Operating System :: OS Independent
|
|
19
|
+
Classifier: Programming Language :: Python
|
|
20
|
+
Classifier: Programming Language :: Python :: 3
|
|
21
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.6
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Mathematics
|
|
26
|
+
Requires-Python: >=3.6.0
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
License-File: LICENSE
|
|
29
|
+
Requires-Dist: pandas<=2.2.3,>=0.25.0
|
|
30
|
+
Requires-Dist: numpy<=2.2.0,>=1.22.0
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# DSTools: Data Science Tools Library
|
|
34
|
+
|
|
35
|
+
[](https://badge.fury.io/py/dscience_tools)
|
|
36
|
+
[](https://opensource.org/licenses/MIT)
|
|
37
|
+
|
|
38
|
+
# Authors
|
|
39
|
+
|
|
40
|
+
- [@sergiikavun](https://www.linkedin.com/in/sergii-kavun/)
|
|
41
|
+
|
|
42
|
+
**DSTools** is a Python library designed to assist data scientists and researchers by providing a collection of helpful functions for various stages of a data science project, from data exploration and preprocessing to model evaluation and synthetic data generation.
|
|
43
|
+
|
|
44
|
+
# Table of Contents
|
|
45
|
+
|
|
46
|
+
* [Features](#features)
|
|
47
|
+
* [Installation](#installation)
|
|
48
|
+
* [Usage](#usage)
|
|
49
|
+
* [Function Overview](#function-overview)
|
|
50
|
+
* [Example](#example)
|
|
51
|
+
* [Contributing](#contributing)
|
|
52
|
+
* [References](#references)
|
|
53
|
+
* [License](#license)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Features
|
|
57
|
+
|
|
58
|
+
- **Data Exploration:** Quickly get statistics for numerical and categorical features (`describe_numeric`, `describe_categorical`), check for missing values (`check_NINF`), and visualize correlations (`corr_matrix`).
|
|
59
|
+
- **Model Evaluation:** Comprehensive classification model evaluation (`evaluate_classification`, `compute_metrics`) with clear visualizations (`plot_confusion_matrix`).
|
|
60
|
+
- **Data Preprocessing:** Encode categorical variables (`labeling`), handle outliers (`remove_outliers_iqr`), and scale features (`min_max_scale`).
|
|
61
|
+
- **Time Series Analysis:** Test for stationarity using the Dickey-Fuller test (`test_stationarity`).
|
|
62
|
+
- **Synthetic Data Generation:** Create complex numerical distributions matching specific statistical moments (`generate_distribution`, `generate_distribution_from_metrics`).
|
|
63
|
+
- **Advanced Statistics:** Calculate non-parametric correlation (`chatterjee_correlation`), entropy, and KL-divergence.
|
|
64
|
+
- **Utilities:** Save/load DataFrames to/from ZIP archives, generate random alphanumeric codes, and more.
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# Installation
|
|
68
|
+
|
|
69
|
+
## Clone the Repository
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
git clone https://github.com/s-kav/ds_tools.git
|
|
73
|
+
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Navigate to the Project Directory
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
cd ds_tools
|
|
80
|
+
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Install Dependencies
|
|
84
|
+
|
|
85
|
+
Ensure you have Python version 3.8 or higher and install the required packages:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
pip install -r requirements.txt
|
|
89
|
+
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
# Usage
|
|
93
|
+
|
|
94
|
+
Here's a simple example of how to use the library to evaluate a classification model.
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
|
|
99
|
+
import numpy as np
|
|
100
|
+
from ds_tools import DSTools
|
|
101
|
+
|
|
102
|
+
# 1. Initialize the toolkit
|
|
103
|
+
tools = DSTools()
|
|
104
|
+
|
|
105
|
+
# 2. Generate some dummy data
|
|
106
|
+
y_true = np.array([0, 1, 1, 0, 1, 0, 0, 1])
|
|
107
|
+
y_probs = np.array([0.1, 0.8, 0.6, 0.3, 0.9, 0.2, 0.4, 0.7])
|
|
108
|
+
|
|
109
|
+
# 3. Get a comprehensive evaluation report
|
|
110
|
+
# This will print metrics and show plots for ROC and Precision-Recall curves.
|
|
111
|
+
results = tools.evaluate_classification(true_labels=y_true, pred_probs=y_probs)
|
|
112
|
+
|
|
113
|
+
# The results are also returned as a dictionary
|
|
114
|
+
print(f"\nROC AUC Score: {results['roc_auc']:.4f}")
|
|
115
|
+
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
The full code for testing the other functions is available [here](https://github.com/s-kav/ds_tools/blob/main/tests/code_checking_dstool.py).
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# Function Overview
|
|
123
|
+
|
|
124
|
+
The library provides a wide range of functions. To see a full, formatted list of available tools, you can use the `function_list` method:
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
|
|
128
|
+
from ds_tools import DSTools
|
|
129
|
+
|
|
130
|
+
tools = DSTools()
|
|
131
|
+
tools.function_list()
|
|
132
|
+
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
# Example
|
|
136
|
+
|
|
137
|
+
Generating a Synthetic Distribution: need to create a dataset with specific statistical properties?
|
|
138
|
+
`generate_distribution_from_metrics` can do that.
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
|
|
143
|
+
from ds_tools import DSTools, DistributionConfig
|
|
144
|
+
|
|
145
|
+
tools = DSTools()
|
|
146
|
+
|
|
147
|
+
# Define the desired metrics
|
|
148
|
+
metrics_config = DistributionConfig(
|
|
149
|
+
mean=1042,
|
|
150
|
+
median=330,
|
|
151
|
+
std=1500,
|
|
152
|
+
min_val=1,
|
|
153
|
+
max_val=120000,
|
|
154
|
+
skewness=13.2,
|
|
155
|
+
kurtosis=245, # Excess kurtosis
|
|
156
|
+
n=10000
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
# Generate the data
|
|
160
|
+
generated_data = tools.generate_distribution_from_metrics(n=10000, metrics=metrics_config)
|
|
161
|
+
|
|
162
|
+
print(f"Generated Mean: {np.mean(generated_data):.2f}")
|
|
163
|
+
print(f"Generated Std: {np.std(generated_data):.2f}")
|
|
164
|
+
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
The full code for testing the other functions is available [here](https://github.com/s-kav/ds_tools/blob/main/tests/code_checking_dstool.py).
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
# Contributing
|
|
172
|
+
|
|
173
|
+
Contributions are welcome! Please feel free to submit a pull request or open an issue on the GitHub repository.
|
|
174
|
+
|
|
175
|
+
To contribute:
|
|
176
|
+
|
|
177
|
+
Fork the repository.
|
|
178
|
+
Create a new branch for your feature or bugfix.
|
|
179
|
+
Commit your changes with clear messages.
|
|
180
|
+
Push to your fork and submit a pull request.
|
|
181
|
+
Please ensure your code adheres to PEP8 standards and includes appropriate docstrings and comments.
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
# References
|
|
185
|
+
|
|
186
|
+
To cite this library, please use:
|
|
187
|
+
|
|
188
|
+
Sergii Kavun. (2025). s-kav/ds_tools: Version 0.9.1 (v0.9.1). Zenodo. https://doi.org/10.5281/zenodo.15864146
|
|
189
|
+
|
|
190
|
+
[](https://doi.org/10.5281/zenodo.15864146)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
# License
|
|
194
|
+
|
|
195
|
+
This project is licensed under the MIT License - see the [LICENSE](https://github.com/s-kav/ds_tools/blob/main/LICENSE) file for details.
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
# DSTools: Data Science Tools Library
|
|
2
|
+
|
|
3
|
+
[](https://badge.fury.io/py/dscience_tools)
|
|
4
|
+
[](https://opensource.org/licenses/MIT)
|
|
5
|
+
|
|
6
|
+
# Authors
|
|
7
|
+
|
|
8
|
+
- [@sergiikavun](https://www.linkedin.com/in/sergii-kavun/)
|
|
9
|
+
|
|
10
|
+
**DSTools** is a Python library designed to assist data scientists and researchers by providing a collection of helpful functions for various stages of a data science project, from data exploration and preprocessing to model evaluation and synthetic data generation.
|
|
11
|
+
|
|
12
|
+
# Table of Contents
|
|
13
|
+
|
|
14
|
+
* [Features](#features)
|
|
15
|
+
* [Installation](#installation)
|
|
16
|
+
* [Usage](#usage)
|
|
17
|
+
* [Function Overview](#function-overview)
|
|
18
|
+
* [Example](#example)
|
|
19
|
+
* [Contributing](#contributing)
|
|
20
|
+
* [References](#references)
|
|
21
|
+
* [License](#license)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Features
|
|
25
|
+
|
|
26
|
+
- **Data Exploration:** Quickly get statistics for numerical and categorical features (`describe_numeric`, `describe_categorical`), check for missing values (`check_NINF`), and visualize correlations (`corr_matrix`).
|
|
27
|
+
- **Model Evaluation:** Comprehensive classification model evaluation (`evaluate_classification`, `compute_metrics`) with clear visualizations (`plot_confusion_matrix`).
|
|
28
|
+
- **Data Preprocessing:** Encode categorical variables (`labeling`), handle outliers (`remove_outliers_iqr`), and scale features (`min_max_scale`).
|
|
29
|
+
- **Time Series Analysis:** Test for stationarity using the Dickey-Fuller test (`test_stationarity`).
|
|
30
|
+
- **Synthetic Data Generation:** Create complex numerical distributions matching specific statistical moments (`generate_distribution`, `generate_distribution_from_metrics`).
|
|
31
|
+
- **Advanced Statistics:** Calculate non-parametric correlation (`chatterjee_correlation`), entropy, and KL-divergence.
|
|
32
|
+
- **Utilities:** Save/load DataFrames to/from ZIP archives, generate random alphanumeric codes, and more.
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# Installation
|
|
36
|
+
|
|
37
|
+
## Clone the Repository
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
git clone https://github.com/s-kav/ds_tools.git
|
|
41
|
+
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Navigate to the Project Directory
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
cd ds_tools
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Install Dependencies
|
|
52
|
+
|
|
53
|
+
Ensure you have Python version 3.8 or higher and install the required packages:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pip install -r requirements.txt
|
|
57
|
+
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
# Usage
|
|
61
|
+
|
|
62
|
+
Here's a simple example of how to use the library to evaluate a classification model.
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
|
|
67
|
+
import numpy as np
|
|
68
|
+
from ds_tools import DSTools
|
|
69
|
+
|
|
70
|
+
# 1. Initialize the toolkit
|
|
71
|
+
tools = DSTools()
|
|
72
|
+
|
|
73
|
+
# 2. Generate some dummy data
|
|
74
|
+
y_true = np.array([0, 1, 1, 0, 1, 0, 0, 1])
|
|
75
|
+
y_probs = np.array([0.1, 0.8, 0.6, 0.3, 0.9, 0.2, 0.4, 0.7])
|
|
76
|
+
|
|
77
|
+
# 3. Get a comprehensive evaluation report
|
|
78
|
+
# This will print metrics and show plots for ROC and Precision-Recall curves.
|
|
79
|
+
results = tools.evaluate_classification(true_labels=y_true, pred_probs=y_probs)
|
|
80
|
+
|
|
81
|
+
# The results are also returned as a dictionary
|
|
82
|
+
print(f"\nROC AUC Score: {results['roc_auc']:.4f}")
|
|
83
|
+
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
The full code for testing the other functions is available [here](https://github.com/s-kav/ds_tools/blob/main/tests/code_checking_dstool.py).
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# Function Overview
|
|
91
|
+
|
|
92
|
+
The library provides a wide range of functions. To see a full, formatted list of available tools, you can use the `function_list` method:
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
|
|
96
|
+
from ds_tools import DSTools
|
|
97
|
+
|
|
98
|
+
tools = DSTools()
|
|
99
|
+
tools.function_list()
|
|
100
|
+
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
# Example
|
|
104
|
+
|
|
105
|
+
Generating a Synthetic Distribution: need to create a dataset with specific statistical properties?
|
|
106
|
+
`generate_distribution_from_metrics` can do that.
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
|
|
111
|
+
from ds_tools import DSTools, DistributionConfig
|
|
112
|
+
|
|
113
|
+
tools = DSTools()
|
|
114
|
+
|
|
115
|
+
# Define the desired metrics
|
|
116
|
+
metrics_config = DistributionConfig(
|
|
117
|
+
mean=1042,
|
|
118
|
+
median=330,
|
|
119
|
+
std=1500,
|
|
120
|
+
min_val=1,
|
|
121
|
+
max_val=120000,
|
|
122
|
+
skewness=13.2,
|
|
123
|
+
kurtosis=245, # Excess kurtosis
|
|
124
|
+
n=10000
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
# Generate the data
|
|
128
|
+
generated_data = tools.generate_distribution_from_metrics(n=10000, metrics=metrics_config)
|
|
129
|
+
|
|
130
|
+
print(f"Generated Mean: {np.mean(generated_data):.2f}")
|
|
131
|
+
print(f"Generated Std: {np.std(generated_data):.2f}")
|
|
132
|
+
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
The full code for testing the other functions is available [here](https://github.com/s-kav/ds_tools/blob/main/tests/code_checking_dstool.py).
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
# Contributing
|
|
140
|
+
|
|
141
|
+
Contributions are welcome! Please feel free to submit a pull request or open an issue on the GitHub repository.
|
|
142
|
+
|
|
143
|
+
To contribute:
|
|
144
|
+
|
|
145
|
+
Fork the repository.
|
|
146
|
+
Create a new branch for your feature or bugfix.
|
|
147
|
+
Commit your changes with clear messages.
|
|
148
|
+
Push to your fork and submit a pull request.
|
|
149
|
+
Please ensure your code adheres to PEP8 standards and includes appropriate docstrings and comments.
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# References
|
|
153
|
+
|
|
154
|
+
To cite this library, please use:
|
|
155
|
+
|
|
156
|
+
Sergii Kavun. (2025). s-kav/ds_tools: Version 0.9.1 (v0.9.1). Zenodo. https://doi.org/10.5281/zenodo.15864146
|
|
157
|
+
|
|
158
|
+
[](https://doi.org/10.5281/zenodo.15864146)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# License
|
|
162
|
+
|
|
163
|
+
This project is licensed under the MIT License - see the [LICENSE](https://github.com/s-kav/ds_tools/blob/main/LICENSE) file for details.
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "dscience_tools"
|
|
3
|
+
version = "1.0.0"
|
|
4
|
+
description = "DSTools: Data Science Tools Library"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name="Sergii Kavun", email="kavserg@gmail.com" }
|
|
8
|
+
]
|
|
9
|
+
maintainers = [
|
|
10
|
+
{ name="Sergii Kavun", email="kavserg@gmail.com" }
|
|
11
|
+
]
|
|
12
|
+
license = {text = "MIT License"}
|
|
13
|
+
dependencies = ["pandas<=2.2.3,>=0.25.0", "numpy<=2.2.0,>=1.22.0"]
|
|
14
|
+
requires-python = ">=3.6.0"
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"Intended Audience :: Information Technology",
|
|
18
|
+
"Intended Audience :: Science/Research",
|
|
19
|
+
"Natural Language :: English",
|
|
20
|
+
"Operating System :: OS Independent",
|
|
21
|
+
"Programming Language :: Python",
|
|
22
|
+
"Programming Language :: Python :: 3",
|
|
23
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
24
|
+
"Programming Language :: Python :: 3.6",
|
|
25
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
26
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
27
|
+
"Topic :: Scientific/Engineering :: Mathematics",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
dynamic = []
|
|
31
|
+
|
|
32
|
+
[project.urls]
|
|
33
|
+
Homepage = "https://github.com/s-kav/ds_tools"
|
|
34
|
+
Documentation = "https://github.com/s-kav/ds_tools"
|
|
35
|
+
Repository = "https://github.com/s-kav/ds_tools.git"
|
|
36
|
+
"Source Code" = "https://github.com/s-kav/ds_tools/src"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
[build-system]
|
|
40
|
+
requires = ["setuptools>=61", "wheel"]
|
|
41
|
+
build-backend = "setuptools.build_meta"
|
|
42
|
+
|
|
43
|
+
[tool.pytest.ini_options]
|
|
44
|
+
addopts = [
|
|
45
|
+
"--import-mode=importlib",
|
|
46
|
+
]
|
|
47
|
+
filterwarnings = [
|
|
48
|
+
"ignore::pydantic.warnings.PydanticDeprecatedSince20",
|
|
49
|
+
"ignore::DeprecationWarning",
|
|
50
|
+
"error",
|
|
51
|
+
]
|
|
52
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
[metadata]
|
|
2
|
+
name = dscience_tools
|
|
3
|
+
version = 0.9.1
|
|
4
|
+
author = Sergii Kavun
|
|
5
|
+
author_email = kavserg@gmail.com
|
|
6
|
+
description = A library of helpful functions for various data science research stages.
|
|
7
|
+
long_description = file: README.md
|
|
8
|
+
long_description_content_type = text/markdown
|
|
9
|
+
url = https://github.com/s-kav/ds_tools
|
|
10
|
+
license = MIT
|
|
11
|
+
classifiers =
|
|
12
|
+
Programming Language :: Python :: 3
|
|
13
|
+
Programming Language :: Python :: 3.8
|
|
14
|
+
Programming Language :: Python :: 3.9
|
|
15
|
+
Programming Language :: Python :: 3.10
|
|
16
|
+
Programming Language :: Python :: 3.11
|
|
17
|
+
License :: OSI Approved :: MIT License
|
|
18
|
+
Operating System :: OS Independent
|
|
19
|
+
Intended Audience :: Developers
|
|
20
|
+
Intended Audience :: Science/Research
|
|
21
|
+
Topic :: Software Development :: Libraries :: Python Modules
|
|
22
|
+
Topic :: Scientific/Engineering
|
|
23
|
+
|
|
24
|
+
[options]
|
|
25
|
+
package_dir =
|
|
26
|
+
= src
|
|
27
|
+
packages = find:
|
|
28
|
+
python_requires = >=3.8
|
|
29
|
+
install_requires =
|
|
30
|
+
numpy
|
|
31
|
+
pandas
|
|
32
|
+
polars
|
|
33
|
+
matplotlib
|
|
34
|
+
seaborn
|
|
35
|
+
scipy
|
|
36
|
+
scikit-learn
|
|
37
|
+
statsmodels
|
|
38
|
+
pydantic>=2.0
|
|
39
|
+
|
|
40
|
+
[options.packages.find]
|
|
41
|
+
where = src
|
|
42
|
+
|
|
43
|
+
[egg_info]
|
|
44
|
+
tag_build =
|
|
45
|
+
tag_date = 0
|
|
46
|
+
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dscience_tools
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: DSTools: Data Science Tools Library
|
|
5
|
+
Home-page: https://github.com/s-kav/ds_tools
|
|
6
|
+
Author: Sergii Kavun
|
|
7
|
+
Author-email: Sergii Kavun <kavserg@gmail.com>
|
|
8
|
+
Maintainer-email: Sergii Kavun <kavserg@gmail.com>
|
|
9
|
+
License: MIT License
|
|
10
|
+
Project-URL: Homepage, https://github.com/s-kav/ds_tools
|
|
11
|
+
Project-URL: Documentation, https://github.com/s-kav/ds_tools
|
|
12
|
+
Project-URL: Repository, https://github.com/s-kav/ds_tools.git
|
|
13
|
+
Project-URL: Source Code, https://github.com/s-kav/ds_tools/src
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Information Technology
|
|
16
|
+
Classifier: Intended Audience :: Science/Research
|
|
17
|
+
Classifier: Natural Language :: English
|
|
18
|
+
Classifier: Operating System :: OS Independent
|
|
19
|
+
Classifier: Programming Language :: Python
|
|
20
|
+
Classifier: Programming Language :: Python :: 3
|
|
21
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.6
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Mathematics
|
|
26
|
+
Requires-Python: >=3.6.0
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
License-File: LICENSE
|
|
29
|
+
Requires-Dist: pandas<=2.2.3,>=0.25.0
|
|
30
|
+
Requires-Dist: numpy<=2.2.0,>=1.22.0
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# DSTools: Data Science Tools Library
|
|
34
|
+
|
|
35
|
+
[](https://badge.fury.io/py/dscience_tools)
|
|
36
|
+
[](https://opensource.org/licenses/MIT)
|
|
37
|
+
|
|
38
|
+
# Authors
|
|
39
|
+
|
|
40
|
+
- [@sergiikavun](https://www.linkedin.com/in/sergii-kavun/)
|
|
41
|
+
|
|
42
|
+
**DSTools** is a Python library designed to assist data scientists and researchers by providing a collection of helpful functions for various stages of a data science project, from data exploration and preprocessing to model evaluation and synthetic data generation.
|
|
43
|
+
|
|
44
|
+
# Table of Contents
|
|
45
|
+
|
|
46
|
+
* [Features](#features)
|
|
47
|
+
* [Installation](#installation)
|
|
48
|
+
* [Usage](#usage)
|
|
49
|
+
* [Function Overview](#function-overview)
|
|
50
|
+
* [Example](#example)
|
|
51
|
+
* [Contributing](#contributing)
|
|
52
|
+
* [References](#references)
|
|
53
|
+
* [License](#license)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Features
|
|
57
|
+
|
|
58
|
+
- **Data Exploration:** Quickly get statistics for numerical and categorical features (`describe_numeric`, `describe_categorical`), check for missing values (`check_NINF`), and visualize correlations (`corr_matrix`).
|
|
59
|
+
- **Model Evaluation:** Comprehensive classification model evaluation (`evaluate_classification`, `compute_metrics`) with clear visualizations (`plot_confusion_matrix`).
|
|
60
|
+
- **Data Preprocessing:** Encode categorical variables (`labeling`), handle outliers (`remove_outliers_iqr`), and scale features (`min_max_scale`).
|
|
61
|
+
- **Time Series Analysis:** Test for stationarity using the Dickey-Fuller test (`test_stationarity`).
|
|
62
|
+
- **Synthetic Data Generation:** Create complex numerical distributions matching specific statistical moments (`generate_distribution`, `generate_distribution_from_metrics`).
|
|
63
|
+
- **Advanced Statistics:** Calculate non-parametric correlation (`chatterjee_correlation`), entropy, and KL-divergence.
|
|
64
|
+
- **Utilities:** Save/load DataFrames to/from ZIP archives, generate random alphanumeric codes, and more.
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# Installation
|
|
68
|
+
|
|
69
|
+
## Clone the Repository
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
git clone https://github.com/s-kav/ds_tools.git
|
|
73
|
+
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Navigate to the Project Directory
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
cd ds_tools
|
|
80
|
+
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Install Dependencies
|
|
84
|
+
|
|
85
|
+
Ensure you have Python version 3.8 or higher and install the required packages:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
pip install -r requirements.txt
|
|
89
|
+
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
# Usage
|
|
93
|
+
|
|
94
|
+
Here's a simple example of how to use the library to evaluate a classification model.
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
|
|
99
|
+
import numpy as np
|
|
100
|
+
from ds_tools import DSTools
|
|
101
|
+
|
|
102
|
+
# 1. Initialize the toolkit
|
|
103
|
+
tools = DSTools()
|
|
104
|
+
|
|
105
|
+
# 2. Generate some dummy data
|
|
106
|
+
y_true = np.array([0, 1, 1, 0, 1, 0, 0, 1])
|
|
107
|
+
y_probs = np.array([0.1, 0.8, 0.6, 0.3, 0.9, 0.2, 0.4, 0.7])
|
|
108
|
+
|
|
109
|
+
# 3. Get a comprehensive evaluation report
|
|
110
|
+
# This will print metrics and show plots for ROC and Precision-Recall curves.
|
|
111
|
+
results = tools.evaluate_classification(true_labels=y_true, pred_probs=y_probs)
|
|
112
|
+
|
|
113
|
+
# The results are also returned as a dictionary
|
|
114
|
+
print(f"\nROC AUC Score: {results['roc_auc']:.4f}")
|
|
115
|
+
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
The full code for testing the other functions is available [here](https://github.com/s-kav/ds_tools/blob/main/tests/code_checking_dstool.py).
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# Function Overview
|
|
123
|
+
|
|
124
|
+
The library provides a wide range of functions. To see a full, formatted list of available tools, you can use the `function_list` method:
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
|
|
128
|
+
from ds_tools import DSTools
|
|
129
|
+
|
|
130
|
+
tools = DSTools()
|
|
131
|
+
tools.function_list()
|
|
132
|
+
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
# Example
|
|
136
|
+
|
|
137
|
+
Generating a Synthetic Distribution: need to create a dataset with specific statistical properties?
|
|
138
|
+
`generate_distribution_from_metrics` can do that.
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
|
|
143
|
+
from ds_tools import DSTools, DistributionConfig
|
|
144
|
+
|
|
145
|
+
tools = DSTools()
|
|
146
|
+
|
|
147
|
+
# Define the desired metrics
|
|
148
|
+
metrics_config = DistributionConfig(
|
|
149
|
+
mean=1042,
|
|
150
|
+
median=330,
|
|
151
|
+
std=1500,
|
|
152
|
+
min_val=1,
|
|
153
|
+
max_val=120000,
|
|
154
|
+
skewness=13.2,
|
|
155
|
+
kurtosis=245, # Excess kurtosis
|
|
156
|
+
n=10000
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
# Generate the data
|
|
160
|
+
generated_data = tools.generate_distribution_from_metrics(n=10000, metrics=metrics_config)
|
|
161
|
+
|
|
162
|
+
print(f"Generated Mean: {np.mean(generated_data):.2f}")
|
|
163
|
+
print(f"Generated Std: {np.std(generated_data):.2f}")
|
|
164
|
+
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
The full code for testing the other functions is available [here](https://github.com/s-kav/ds_tools/blob/main/tests/code_checking_dstool.py).
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
# Contributing
|
|
172
|
+
|
|
173
|
+
Contributions are welcome! Please feel free to submit a pull request or open an issue on the GitHub repository.
|
|
174
|
+
|
|
175
|
+
To contribute:
|
|
176
|
+
|
|
177
|
+
Fork the repository.
|
|
178
|
+
Create a new branch for your feature or bugfix.
|
|
179
|
+
Commit your changes with clear messages.
|
|
180
|
+
Push to your fork and submit a pull request.
|
|
181
|
+
Please ensure your code adheres to PEP8 standards and includes appropriate docstrings and comments.
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
# References
|
|
185
|
+
|
|
186
|
+
To cite this library, please use:
|
|
187
|
+
|
|
188
|
+
Sergii Kavun. (2025). s-kav/ds_tools: Version 0.9.1 (v0.9.1). Zenodo. https://doi.org/10.5281/zenodo.15864146
|
|
189
|
+
|
|
190
|
+
[](https://doi.org/10.5281/zenodo.15864146)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
# License
|
|
194
|
+
|
|
195
|
+
This project is licensed under the MIT License - see the [LICENSE](https://github.com/s-kav/ds_tools/blob/main/LICENSE) file for details.
|