dscience-tools 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. dscience_tools-1.0.0/LICENSE +21 -0
  2. dscience_tools-1.0.0/MANIFEST.in +4 -0
  3. dscience_tools-1.0.0/PKG-INFO +195 -0
  4. dscience_tools-1.0.0/README.md +163 -0
  5. dscience_tools-1.0.0/pyproject.toml +52 -0
  6. dscience_tools-1.0.0/setup.cfg +46 -0
  7. dscience_tools-1.0.0/src/dscience_tools.egg-info/PKG-INFO +195 -0
  8. dscience_tools-1.0.0/src/dscience_tools.egg-info/SOURCES.txt +34 -0
  9. dscience_tools-1.0.0/src/dscience_tools.egg-info/dependency_links.txt +1 -0
  10. dscience_tools-1.0.0/src/dscience_tools.egg-info/requires.txt +2 -0
  11. dscience_tools-1.0.0/src/dscience_tools.egg-info/top_level.txt +1 -0
  12. dscience_tools-1.0.0/tests/test_add_missing.py +57 -0
  13. dscience_tools-1.0.0/tests/test_alphanum.py +41 -0
  14. dscience_tools-1.0.0/tests/test_category_stats.py +38 -0
  15. dscience_tools-1.0.0/tests/test_chatterjee.py +47 -0
  16. dscience_tools-1.0.0/tests/test_check_ninf.py +29 -0
  17. dscience_tools-1.0.0/tests/test_compute_metrics.py +34 -0
  18. dscience_tools-1.0.0/tests/test_corr_matrix.py +43 -0
  19. dscience_tools-1.0.0/tests/test_describe_cat.py +69 -0
  20. dscience_tools-1.0.0/tests/test_describe_num.py +63 -0
  21. dscience_tools-1.0.0/tests/test_df_stats.py +65 -0
  22. dscience_tools-1.0.0/tests/test_entropy.py +43 -0
  23. dscience_tools-1.0.0/tests/test_evaluate_cls.py +38 -0
  24. dscience_tools-1.0.0/tests/test_generate_dist.py +52 -0
  25. dscience_tools-1.0.0/tests/test_generate_from_metrics.py +87 -0
  26. dscience_tools-1.0.0/tests/test_grubbs.py +52 -0
  27. dscience_tools-1.0.0/tests/test_kl_divergence.py +44 -0
  28. dscience_tools-1.0.0/tests/test_labeling.py +59 -0
  29. dscience_tools-1.0.0/tests/test_min_max.py +93 -0
  30. dscience_tools-1.0.0/tests/test_normality.py +56 -0
  31. dscience_tools-1.0.0/tests/test_outliers.py +99 -0
  32. dscience_tools-1.0.0/tests/test_plot_cm.py +63 -0
  33. dscience_tools-1.0.0/tests/test_stationarity.py +65 -0
  34. dscience_tools-1.0.0/tests/test_trials_res_df.py +65 -0
  35. dscience_tools-1.0.0/tests/test_zip_io.py +65 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Sergii Kavun
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,4 @@
1
+ include LICENSE
2
+ include README.md
3
+ prune issues
4
+ prune results
@@ -0,0 +1,195 @@
1
+ Metadata-Version: 2.4
2
+ Name: dscience_tools
3
+ Version: 1.0.0
4
+ Summary: DSTools: Data Science Tools Library
5
+ Home-page: https://github.com/s-kav/ds_tools
6
+ Author: Sergii Kavun
7
+ Author-email: Sergii Kavun <kavserg@gmail.com>
8
+ Maintainer-email: Sergii Kavun <kavserg@gmail.com>
9
+ License: MIT License
10
+ Project-URL: Homepage, https://github.com/s-kav/ds_tools
11
+ Project-URL: Documentation, https://github.com/s-kav/ds_tools
12
+ Project-URL: Repository, https://github.com/s-kav/ds_tools.git
13
+ Project-URL: Source Code, https://github.com/s-kav/ds_tools/src
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Information Technology
16
+ Classifier: Intended Audience :: Science/Research
17
+ Classifier: Natural Language :: English
18
+ Classifier: Operating System :: OS Independent
19
+ Classifier: Programming Language :: Python
20
+ Classifier: Programming Language :: Python :: 3
21
+ Classifier: Programming Language :: Python :: 3 :: Only
22
+ Classifier: Programming Language :: Python :: 3.6
23
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
24
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
25
+ Classifier: Topic :: Scientific/Engineering :: Mathematics
26
+ Requires-Python: >=3.6.0
27
+ Description-Content-Type: text/markdown
28
+ License-File: LICENSE
29
+ Requires-Dist: pandas<=2.2.3,>=0.25.0
30
+ Requires-Dist: numpy<=2.2.0,>=1.22.0
31
+ Dynamic: license-file
32
+
33
+ # DSTools: Data Science Tools Library
34
+
35
+ [![PyPI version](https://badge.fury.io/py/dscience_tools.svg)](https://badge.fury.io/py/dscience_tools)
36
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
37
+
38
+ # Authors
39
+
40
+ - [@sergiikavun](https://www.linkedin.com/in/sergii-kavun/)
41
+
42
+ **DSTools** is a Python library designed to assist data scientists and researchers by providing a collection of helpful functions for various stages of a data science project, from data exploration and preprocessing to model evaluation and synthetic data generation.
43
+
44
+ # Table of Contents
45
+
46
+ * [Features](#features)
47
+ * [Installation](#installation)
48
+ * [Usage](#usage)
49
+ * [Function Overview](#function-overview)
50
+ * [Example](#example)
51
+ * [Contributing](#contributing)
52
+ * [References](#references)
53
+ * [License](#license)
54
+
55
+
56
+ # Features
57
+
58
+ - **Data Exploration:** Quickly get statistics for numerical and categorical features (`describe_numeric`, `describe_categorical`), check for missing values (`check_NINF`), and visualize correlations (`corr_matrix`).
59
+ - **Model Evaluation:** Comprehensive classification model evaluation (`evaluate_classification`, `compute_metrics`) with clear visualizations (`plot_confusion_matrix`).
60
+ - **Data Preprocessing:** Encode categorical variables (`labeling`), handle outliers (`remove_outliers_iqr`), and scale features (`min_max_scale`).
61
+ - **Time Series Analysis:** Test for stationarity using the Dickey-Fuller test (`test_stationarity`).
62
+ - **Synthetic Data Generation:** Create complex numerical distributions matching specific statistical moments (`generate_distribution`, `generate_distribution_from_metrics`).
63
+ - **Advanced Statistics:** Calculate non-parametric correlation (`chatterjee_correlation`), entropy, and KL-divergence.
64
+ - **Utilities:** Save/load DataFrames to/from ZIP archives, generate random alphanumeric codes, and more.
65
+
66
+
67
+ # Installation
68
+
69
+ ## Clone the Repository
70
+
71
+ ```bash
72
+ git clone https://github.com/s-kav/ds_tools.git
73
+
74
+ ```
75
+
76
+ ## Navigate to the Project Directory
77
+
78
+ ```bash
79
+ cd ds_tools
80
+
81
+ ```
82
+
83
+ ## Install Dependencies
84
+
85
+ Ensure you have Python version 3.8 or higher and install the required packages:
86
+
87
+ ```bash
88
+ pip install -r requirements.txt
89
+
90
+ ```
91
+
92
+ # Usage
93
+
94
+ Here's a simple example of how to use the library to evaluate a classification model.
95
+
96
+
97
+ ```python
98
+
99
+ import numpy as np
100
+ from ds_tools import DSTools
101
+
102
+ # 1. Initialize the toolkit
103
+ tools = DSTools()
104
+
105
+ # 2. Generate some dummy data
106
+ y_true = np.array([0, 1, 1, 0, 1, 0, 0, 1])
107
+ y_probs = np.array([0.1, 0.8, 0.6, 0.3, 0.9, 0.2, 0.4, 0.7])
108
+
109
+ # 3. Get a comprehensive evaluation report
110
+ # This will print metrics and show plots for ROC and Precision-Recall curves.
111
+ results = tools.evaluate_classification(true_labels=y_true, pred_probs=y_probs)
112
+
113
+ # The results are also returned as a dictionary
114
+ print(f"\nROC AUC Score: {results['roc_auc']:.4f}")
115
+
116
+ ```
117
+
118
+
119
+ You can find the full code base for testing the other functions [here](https://github.com/s-kav/ds_tools/blob/main/tests/code_checking_dstool.py).
120
+
121
+
122
+ # Function Overview
123
+
124
+ The library provides a wide range of functions. To see a full, formatted list of available tools, you can use the `function_list` method:
125
+
126
+ ```python
127
+
128
+ from ds_tools import DSTools
129
+
130
+ tools = DSTools()
131
+ tools.function_list()
132
+
133
+ ```
134
+
135
+ # Example
136
+
137
+ **Generating a synthetic distribution:** Need to create a dataset with specific statistical properties?
138
+ `generate_distribution_from_metrics` can do that.
139
+
140
+
141
+ ```python
142
+ import numpy as np
143
+ from ds_tools import DSTools, DistributionConfig
144
+
145
+ tools = DSTools()
146
+
147
+ # Define the desired metrics
148
+ metrics_config = DistributionConfig(
149
+ mean=1042,
150
+ median=330,
151
+ std=1500,
152
+ min_val=1,
153
+ max_val=120000,
154
+ skewness=13.2,
155
+ kurtosis=245, # Excess kurtosis
156
+ n=10000
157
+ )
158
+
159
+ # Generate the data
160
+ generated_data = tools.generate_distribution_from_metrics(n=10000, metrics=metrics_config)
161
+
162
+ print(f"Generated Mean: {np.mean(generated_data):.2f}")
163
+ print(f"Generated Std: {np.std(generated_data):.2f}")
164
+
165
+ ```
166
+
167
+
168
+ You can find the full code base for testing the other functions [here](https://github.com/s-kav/ds_tools/blob/main/tests/code_checking_dstool.py).
169
+
170
+
171
+ # Contributing
172
+
173
+ Contributions are welcome! Please feel free to submit a pull request or open an issue on the GitHub repository.
174
+
175
+ To contribute:
176
+
177
+ 1. Fork the repository.
178
+ 2. Create a new branch for your feature or bugfix.
179
+ 3. Commit your changes with clear messages.
180
+ 4. Push to your fork and submit a pull request.
181
+ Please ensure your code adheres to PEP8 standards and includes appropriate docstrings and comments.
182
+
183
+
184
+ # References
185
+
186
+ To cite this project, please use:
187
+
188
+ Sergii Kavun. (2025). s-kav/ds_tools: Version 0.9.1 (v0.9.1). Zenodo. https://doi.org/10.5281/zenodo.15864146
189
+
190
+ [![DOI](https://zenodo.org/badge/1001952407.svg)](https://doi.org/10.5281/zenodo.15864146)
191
+
192
+
193
+ # License
194
+
195
+ This project is licensed under the MIT License - see the [LICENSE](https://github.com/s-kav/ds_tools/blob/main/LICENSE) file for details.
@@ -0,0 +1,163 @@
1
+ # DSTools: Data Science Tools Library
2
+
3
+ [![PyPI version](https://badge.fury.io/py/dscience_tools.svg)](https://badge.fury.io/py/dscience_tools)
4
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
5
+
6
+ # Authors
7
+
8
+ - [@sergiikavun](https://www.linkedin.com/in/sergii-kavun/)
9
+
10
+ **DSTools** is a Python library designed to assist data scientists and researchers by providing a collection of helpful functions for various stages of a data science project, from data exploration and preprocessing to model evaluation and synthetic data generation.
11
+
12
+ # Table of Contents
13
+
14
+ * [Features](#features)
15
+ * [Installation](#installation)
16
+ * [Usage](#usage)
17
+ * [Function Overview](#function-overview)
18
+ * [Example](#example)
19
+ * [Contributing](#contributing)
20
+ * [References](#references)
21
+ * [License](#license)
22
+
23
+
24
+ # Features
25
+
26
+ - **Data Exploration:** Quickly get statistics for numerical and categorical features (`describe_numeric`, `describe_categorical`), check for missing values (`check_NINF`), and visualize correlations (`corr_matrix`).
27
+ - **Model Evaluation:** Comprehensive classification model evaluation (`evaluate_classification`, `compute_metrics`) with clear visualizations (`plot_confusion_matrix`).
28
+ - **Data Preprocessing:** Encode categorical variables (`labeling`), handle outliers (`remove_outliers_iqr`), and scale features (`min_max_scale`).
29
+ - **Time Series Analysis:** Test for stationarity using the Dickey-Fuller test (`test_stationarity`).
30
+ - **Synthetic Data Generation:** Create complex numerical distributions matching specific statistical moments (`generate_distribution`, `generate_distribution_from_metrics`).
31
+ - **Advanced Statistics:** Calculate non-parametric correlation (`chatterjee_correlation`), entropy, and KL-divergence.
32
+ - **Utilities:** Save/load DataFrames to/from ZIP archives, generate random alphanumeric codes, and more.
33
+
34
+
35
+ # Installation
36
+
37
+ ## Clone the Repository
38
+
39
+ ```bash
40
+ git clone https://github.com/s-kav/ds_tools.git
41
+
42
+ ```
43
+
44
+ ## Navigate to the Project Directory
45
+
46
+ ```bash
47
+ cd ds_tools
48
+
49
+ ```
50
+
51
+ ## Install Dependencies
52
+
53
+ Ensure you have Python version 3.8 or higher and install the required packages:
54
+
55
+ ```bash
56
+ pip install -r requirements.txt
57
+
58
+ ```
59
+
60
+ # Usage
61
+
62
+ Here's a simple example of how to use the library to evaluate a classification model.
63
+
64
+
65
+ ```python
66
+
67
+ import numpy as np
68
+ from ds_tools import DSTools
69
+
70
+ # 1. Initialize the toolkit
71
+ tools = DSTools()
72
+
73
+ # 2. Generate some dummy data
74
+ y_true = np.array([0, 1, 1, 0, 1, 0, 0, 1])
75
+ y_probs = np.array([0.1, 0.8, 0.6, 0.3, 0.9, 0.2, 0.4, 0.7])
76
+
77
+ # 3. Get a comprehensive evaluation report
78
+ # This will print metrics and show plots for ROC and Precision-Recall curves.
79
+ results = tools.evaluate_classification(true_labels=y_true, pred_probs=y_probs)
80
+
81
+ # The results are also returned as a dictionary
82
+ print(f"\nROC AUC Score: {results['roc_auc']:.4f}")
83
+
84
+ ```
85
+
86
+
87
+ You can find the full code base for testing the other functions [here](https://github.com/s-kav/ds_tools/blob/main/tests/code_checking_dstool.py).
88
+
89
+
90
+ # Function Overview
91
+
92
+ The library provides a wide range of functions. To see a full, formatted list of available tools, you can use the `function_list` method:
93
+
94
+ ```python
95
+
96
+ from ds_tools import DSTools
97
+
98
+ tools = DSTools()
99
+ tools.function_list()
100
+
101
+ ```
102
+
103
+ # Example
104
+
105
+ **Generating a synthetic distribution:** Need to create a dataset with specific statistical properties?
106
+ `generate_distribution_from_metrics` can do that.
107
+
108
+
109
+ ```python
110
+ import numpy as np
111
+ from ds_tools import DSTools, DistributionConfig
112
+
113
+ tools = DSTools()
114
+
115
+ # Define the desired metrics
116
+ metrics_config = DistributionConfig(
117
+ mean=1042,
118
+ median=330,
119
+ std=1500,
120
+ min_val=1,
121
+ max_val=120000,
122
+ skewness=13.2,
123
+ kurtosis=245, # Excess kurtosis
124
+ n=10000
125
+ )
126
+
127
+ # Generate the data
128
+ generated_data = tools.generate_distribution_from_metrics(n=10000, metrics=metrics_config)
129
+
130
+ print(f"Generated Mean: {np.mean(generated_data):.2f}")
131
+ print(f"Generated Std: {np.std(generated_data):.2f}")
132
+
133
+ ```
134
+
135
+
136
+ You can find the full code base for testing the other functions [here](https://github.com/s-kav/ds_tools/blob/main/tests/code_checking_dstool.py).
137
+
138
+
139
+ # Contributing
140
+
141
+ Contributions are welcome! Please feel free to submit a pull request or open an issue on the GitHub repository.
142
+
143
+ To contribute:
144
+
145
+ 1. Fork the repository.
146
+ 2. Create a new branch for your feature or bugfix.
147
+ 3. Commit your changes with clear messages.
148
+ 4. Push to your fork and submit a pull request.
149
+ Please ensure your code adheres to PEP8 standards and includes appropriate docstrings and comments.
150
+
151
+
152
+ # References
153
+
154
+ To cite this project, please use:
155
+
156
+ Sergii Kavun. (2025). s-kav/ds_tools: Version 0.9.1 (v0.9.1). Zenodo. https://doi.org/10.5281/zenodo.15864146
157
+
158
+ [![DOI](https://zenodo.org/badge/1001952407.svg)](https://doi.org/10.5281/zenodo.15864146)
159
+
160
+
161
+ # License
162
+
163
+ This project is licensed under the MIT License - see the [LICENSE](https://github.com/s-kav/ds_tools/blob/main/LICENSE) file for details.
@@ -0,0 +1,52 @@
1
+ [project]
2
+ name = "dscience_tools"
3
+ version = "1.0.0"
4
+ description = "DSTools: Data Science Tools Library"
5
+ readme = "README.md"
6
+ authors = [
7
+ { name="Sergii Kavun", email="kavserg@gmail.com" }
8
+ ]
9
+ maintainers = [
10
+ { name="Sergii Kavun", email="kavserg@gmail.com" }
11
+ ]
12
+ license = {text = "MIT License"}
13
+ dependencies = ["pandas<=2.2.3,>=0.25.0", "numpy<=2.2.0,>=1.22.0"]
14
+ requires-python = ">=3.6.0"
15
+ classifiers = [
16
+ "Intended Audience :: Developers",
17
+ "Intended Audience :: Information Technology",
18
+ "Intended Audience :: Science/Research",
19
+ "Natural Language :: English",
20
+ "Operating System :: OS Independent",
21
+ "Programming Language :: Python",
22
+ "Programming Language :: Python :: 3",
23
+ "Programming Language :: Python :: 3 :: Only",
24
+ "Programming Language :: Python :: 3.6",
25
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
26
+ "Topic :: Scientific/Engineering :: Information Analysis",
27
+ "Topic :: Scientific/Engineering :: Mathematics",
28
+ ]
29
+
30
+ dynamic = []
31
+
32
+ [project.urls]
33
+ Homepage = "https://github.com/s-kav/ds_tools"
34
+ Documentation = "https://github.com/s-kav/ds_tools"
35
+ Repository = "https://github.com/s-kav/ds_tools.git"
36
+ "Source Code" = "https://github.com/s-kav/ds_tools/src"
37
+
38
+
39
+ [build-system]
40
+ requires = ["setuptools>=61", "wheel"]
41
+ build-backend = "setuptools.build_meta"
42
+
43
+ [tool.pytest.ini_options]
44
+ addopts = [
45
+ "--import-mode=importlib",
46
+ ]
47
+ filterwarnings = [
48
+ "ignore::pydantic.warnings.PydanticDeprecatedSince20",
49
+ "ignore::DeprecationWarning",
50
+ "error",
51
+ ]
52
+ testpaths = ["tests"]
@@ -0,0 +1,46 @@
1
+ [metadata]
2
+ name = dscience_tools
3
+ version = 0.9.1
4
+ author = Sergii Kavun
5
+ author_email = kavserg@gmail.com
6
+ description = A library of helpful functions for various data science research stages.
7
+ long_description = file: README.md
8
+ long_description_content_type = text/markdown
9
+ url = https://github.com/s-kav/ds_tools
10
+ license = MIT
11
+ classifiers =
12
+ Programming Language :: Python :: 3
13
+ Programming Language :: Python :: 3.8
14
+ Programming Language :: Python :: 3.9
15
+ Programming Language :: Python :: 3.10
16
+ Programming Language :: Python :: 3.11
17
+ License :: OSI Approved :: MIT License
18
+ Operating System :: OS Independent
19
+ Intended Audience :: Developers
20
+ Intended Audience :: Science/Research
21
+ Topic :: Software Development :: Libraries :: Python Modules
22
+ Topic :: Scientific/Engineering
23
+
24
+ [options]
25
+ package_dir =
26
+ = src
27
+ packages = find:
28
+ python_requires = >=3.8
29
+ install_requires =
30
+ numpy
31
+ pandas
32
+ polars
33
+ matplotlib
34
+ seaborn
35
+ scipy
36
+ scikit-learn
37
+ statsmodels
38
+ pydantic>=2.0
39
+
40
+ [options.packages.find]
41
+ where = src
42
+
43
+ [egg_info]
44
+ tag_build =
45
+ tag_date = 0
46
+
@@ -0,0 +1,195 @@
1
+ Metadata-Version: 2.4
2
+ Name: dscience_tools
3
+ Version: 1.0.0
4
+ Summary: DSTools: Data Science Tools Library
5
+ Home-page: https://github.com/s-kav/ds_tools
6
+ Author: Sergii Kavun
7
+ Author-email: Sergii Kavun <kavserg@gmail.com>
8
+ Maintainer-email: Sergii Kavun <kavserg@gmail.com>
9
+ License: MIT License
10
+ Project-URL: Homepage, https://github.com/s-kav/ds_tools
11
+ Project-URL: Documentation, https://github.com/s-kav/ds_tools
12
+ Project-URL: Repository, https://github.com/s-kav/ds_tools.git
13
+ Project-URL: Source Code, https://github.com/s-kav/ds_tools/src
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Information Technology
16
+ Classifier: Intended Audience :: Science/Research
17
+ Classifier: Natural Language :: English
18
+ Classifier: Operating System :: OS Independent
19
+ Classifier: Programming Language :: Python
20
+ Classifier: Programming Language :: Python :: 3
21
+ Classifier: Programming Language :: Python :: 3 :: Only
22
+ Classifier: Programming Language :: Python :: 3.6
23
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
24
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
25
+ Classifier: Topic :: Scientific/Engineering :: Mathematics
26
+ Requires-Python: >=3.6.0
27
+ Description-Content-Type: text/markdown
28
+ License-File: LICENSE
29
+ Requires-Dist: pandas<=2.2.3,>=0.25.0
30
+ Requires-Dist: numpy<=2.2.0,>=1.22.0
31
+ Dynamic: license-file
32
+
33
+ # DSTools: Data Science Tools Library
34
+
35
+ [![PyPI version](https://badge.fury.io/py/dscience_tools.svg)](https://badge.fury.io/py/dscience_tools)
36
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
37
+
38
+ # Authors
39
+
40
+ - [@sergiikavun](https://www.linkedin.com/in/sergii-kavun/)
41
+
42
+ **DSTools** is a Python library designed to assist data scientists and researchers by providing a collection of helpful functions for various stages of a data science project, from data exploration and preprocessing to model evaluation and synthetic data generation.
43
+
44
+ # Table of Contents
45
+
46
+ * [Features](#features)
47
+ * [Installation](#installation)
48
+ * [Usage](#usage)
49
+ * [Function Overview](#function-overview)
50
+ * [Example](#example)
51
+ * [Contributing](#contributing)
52
+ * [References](#references)
53
+ * [License](#license)
54
+
55
+
56
+ # Features
57
+
58
+ - **Data Exploration:** Quickly get statistics for numerical and categorical features (`describe_numeric`, `describe_categorical`), check for missing values (`check_NINF`), and visualize correlations (`corr_matrix`).
59
+ - **Model Evaluation:** Comprehensive classification model evaluation (`evaluate_classification`, `compute_metrics`) with clear visualizations (`plot_confusion_matrix`).
60
+ - **Data Preprocessing:** Encode categorical variables (`labeling`), handle outliers (`remove_outliers_iqr`), and scale features (`min_max_scale`).
61
+ - **Time Series Analysis:** Test for stationarity using the Dickey-Fuller test (`test_stationarity`).
62
+ - **Synthetic Data Generation:** Create complex numerical distributions matching specific statistical moments (`generate_distribution`, `generate_distribution_from_metrics`).
63
+ - **Advanced Statistics:** Calculate non-parametric correlation (`chatterjee_correlation`), entropy, and KL-divergence.
64
+ - **Utilities:** Save/load DataFrames to/from ZIP archives, generate random alphanumeric codes, and more.
65
+
66
+
67
+ # Installation
68
+
69
+ ## Clone the Repository
70
+
71
+ ```bash
72
+ git clone https://github.com/s-kav/ds_tools.git
73
+
74
+ ```
75
+
76
+ ## Navigate to the Project Directory
77
+
78
+ ```bash
79
+ cd ds_tools
80
+
81
+ ```
82
+
83
+ ## Install Dependencies
84
+
85
+ Ensure you have Python version 3.8 or higher and install the required packages:
86
+
87
+ ```bash
88
+ pip install -r requirements.txt
89
+
90
+ ```
91
+
92
+ # Usage
93
+
94
+ Here's a simple example of how to use the library to evaluate a classification model.
95
+
96
+
97
+ ```python
98
+
99
+ import numpy as np
100
+ from ds_tools import DSTools
101
+
102
+ # 1. Initialize the toolkit
103
+ tools = DSTools()
104
+
105
+ # 2. Generate some dummy data
106
+ y_true = np.array([0, 1, 1, 0, 1, 0, 0, 1])
107
+ y_probs = np.array([0.1, 0.8, 0.6, 0.3, 0.9, 0.2, 0.4, 0.7])
108
+
109
+ # 3. Get a comprehensive evaluation report
110
+ # This will print metrics and show plots for ROC and Precision-Recall curves.
111
+ results = tools.evaluate_classification(true_labels=y_true, pred_probs=y_probs)
112
+
113
+ # The results are also returned as a dictionary
114
+ print(f"\nROC AUC Score: {results['roc_auc']:.4f}")
115
+
116
+ ```
117
+
118
+
119
+ You can find the full code base for testing the other functions [here](https://github.com/s-kav/ds_tools/blob/main/tests/code_checking_dstool.py).
120
+
121
+
122
+ # Function Overview
123
+
124
+ The library provides a wide range of functions. To see a full, formatted list of available tools, you can use the `function_list` method:
125
+
126
+ ```python
127
+
128
+ from ds_tools import DSTools
129
+
130
+ tools = DSTools()
131
+ tools.function_list()
132
+
133
+ ```
134
+
135
+ # Example
136
+
137
+ **Generating a synthetic distribution:** Need to create a dataset with specific statistical properties?
138
+ `generate_distribution_from_metrics` can do that.
139
+
140
+
141
+ ```python
142
+ import numpy as np
143
+ from ds_tools import DSTools, DistributionConfig
144
+
145
+ tools = DSTools()
146
+
147
+ # Define the desired metrics
148
+ metrics_config = DistributionConfig(
149
+ mean=1042,
150
+ median=330,
151
+ std=1500,
152
+ min_val=1,
153
+ max_val=120000,
154
+ skewness=13.2,
155
+ kurtosis=245, # Excess kurtosis
156
+ n=10000
157
+ )
158
+
159
+ # Generate the data
160
+ generated_data = tools.generate_distribution_from_metrics(n=10000, metrics=metrics_config)
161
+
162
+ print(f"Generated Mean: {np.mean(generated_data):.2f}")
163
+ print(f"Generated Std: {np.std(generated_data):.2f}")
164
+
165
+ ```
166
+
167
+
168
+ You can find the full code base for testing the other functions [here](https://github.com/s-kav/ds_tools/blob/main/tests/code_checking_dstool.py).
169
+
170
+
171
+ # Contributing
172
+
173
+ Contributions are welcome! Please feel free to submit a pull request or open an issue on the GitHub repository.
174
+
175
+ To contribute:
176
+
177
+ 1. Fork the repository.
178
+ 2. Create a new branch for your feature or bugfix.
179
+ 3. Commit your changes with clear messages.
180
+ 4. Push to your fork and submit a pull request.
181
+ Please ensure your code adheres to PEP8 standards and includes appropriate docstrings and comments.
182
+
183
+
184
+ # References
185
+
186
+ To cite this project, please use:
187
+
188
+ Sergii Kavun. (2025). s-kav/ds_tools: Version 0.9.1 (v0.9.1). Zenodo. https://doi.org/10.5281/zenodo.15864146
189
+
190
+ [![DOI](https://zenodo.org/badge/1001952407.svg)](https://doi.org/10.5281/zenodo.15864146)
191
+
192
+
193
+ # License
194
+
195
+ This project is licensed under the MIT License - see the [LICENSE](https://github.com/s-kav/ds_tools/blob/main/LICENSE) file for details.