chelo 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chelo-0.0.1/LICENSE +21 -0
- chelo-0.0.1/MANIFEST.in +4 -0
- chelo-0.0.1/PKG-INFO +249 -0
- chelo-0.0.1/README.md +212 -0
- chelo-0.0.1/chelo.egg-info/PKG-INFO +249 -0
- chelo-0.0.1/chelo.egg-info/SOURCES.txt +17 -0
- chelo-0.0.1/chelo.egg-info/dependency_links.txt +1 -0
- chelo-0.0.1/chelo.egg-info/requires.txt +4 -0
- chelo-0.0.1/chelo.egg-info/top_level.txt +1 -0
- chelo-0.0.1/pyproject.toml +27 -0
- chelo-0.0.1/setup.cfg +4 -0
- chelo-0.0.1/src/__init__.py +0 -0
- chelo-0.0.1/src/base.py +166 -0
- chelo-0.0.1/src/datasets/__init__.py +17 -0
- chelo-0.0.1/src/datasets/wine_quality.py +71 -0
- chelo-0.0.1/src/registry.py +45 -0
- chelo-0.0.1/src/utils/__init__.py +0 -0
- chelo-0.0.1/src/utils/downloader.py +118 -0
- chelo-0.0.1/tests/test_base.py +50 -0
chelo-0.0.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Nikolaos Passalis
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
chelo-0.0.1/MANIFEST.in
ADDED
chelo-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: chelo
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Chemical Engineering Dataset Loader Library
|
|
5
|
+
Author-email: Nikolaos Passalis <passalis@auth.gr>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2025 Nikolaos Passalis
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Classifier: Programming Language :: Python :: 3
|
|
29
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
30
|
+
Classifier: Operating System :: OS Independent
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
License-File: LICENSE
|
|
33
|
+
Requires-Dist: numpy
|
|
34
|
+
Requires-Dist: pandas
|
|
35
|
+
Requires-Dist: tqdm
|
|
36
|
+
Requires-Dist: requests
|
|
37
|
+
|
|
38
|
+
# CheLo (Chemical Engineering Dataset Loader) Library
|
|
39
|
+
|
|
40
|
+
## Overview
|
|
41
|
+
The CheLo Library is a Python library designed to simplify the exploration of data-driven modeling for students studying chemical engineering and researchers working on related projects.
|
|
42
|
+
This library provides a standardized framework for accessing, managing, and utilizing chemical engineering datasets for machine learning and statistical analysis.
|
|
43
|
+
|
|
44
|
+
## Key Features
|
|
45
|
+
- **Dataset Standardization**: Unified API for accessing and exploring datasets.
|
|
46
|
+
- **Multiple Data Formats**: Provides ready to use loaders for numpy, PyTorch, and Keras.
|
|
47
|
+
- **Preprocessing Tools**: Methods for feature/target selection, statistics, and previewing datasets.
|
|
48
|
+
- **Dataset Management**: Automated downloading, caching, and registry of datasets.
|
|
49
|
+
- **Extensibility**: Abstract base class for easy addition of new datasets.
|
|
50
|
+
|
|
51
|
+
## Installation
|
|
52
|
+
|
|
53
|
+
To install the library, run the following command:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pip install chelo
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
To install the library in editable mode for development purposes:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
git clone https://github.com/your-repo/chelo.git
|
|
63
|
+
cd chelo
|
|
64
|
+
pip install -e .
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Package Structure
|
|
68
|
+
|
|
69
|
+
```plaintext
|
|
70
|
+
chelo/ # Root package
|
|
71
|
+
├── __init__.py # Exposes core components
|
|
72
|
+
├── base.py # Abstract base class and shared utilities
|
|
73
|
+
├── datasets/ # Dataset-specific implementations
|
|
74
|
+
│ └── ... # Dataset implementations
|
|
75
|
+
├── utils/ # Utility functions and helpers
|
|
76
|
+
│ ├── __init__.py # Utility imports
|
|
77
|
+
│ └── download.py # Dataset downloader and caching
|
|
78
|
+
├── registry.py # Dataset registry
|
|
79
|
+
└── tests/ # Unit and integration tests
|
|
80
|
+
├── __init__.py # Makes this directory a package
|
|
81
|
+
├── test_base.py # Tests for the base class
|
|
82
|
+
└── test_X.py # Tests for X dataset
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Usage Guide
|
|
86
|
+
|
|
87
|
+
### Loading a Dataset
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from chelo.datasets.wine_quality import WineQualityDataset
|
|
91
|
+
|
|
92
|
+
# Instantiate the dataset
|
|
93
|
+
dataset = WineQualityDataset(wine_type="red", selected_features=["alcohol", "pH"], selected_targets=["quality"])
|
|
94
|
+
|
|
95
|
+
# Load data (downloads if not cached)
|
|
96
|
+
dataset.load_data()
|
|
97
|
+
|
|
98
|
+
# Access dataset information
|
|
99
|
+
info = dataset.get_dataset_info()
|
|
100
|
+
print("Dataset Info:", info)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Accessing Data
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
# Convert to numpy arrays
|
|
107
|
+
features, targets = dataset.to_numpy()
|
|
108
|
+
print("Features shape:", features.shape)
|
|
109
|
+
print("Targets shape:", targets.shape)
|
|
110
|
+
|
|
111
|
+
# Convert to PyTorch Dataset
|
|
112
|
+
pytorch_dataset = dataset.to_pytorch()
|
|
113
|
+
print("Number of samples in PyTorch Dataset:", len(pytorch_dataset))
|
|
114
|
+
|
|
115
|
+
# Convert to Keras Sequence
|
|
116
|
+
keras_sequence = dataset.to_keras(batch_size=32)
|
|
117
|
+
for batch_features, batch_targets in keras_sequence:
|
|
118
|
+
print("Batch Features:", batch_features.shape)
|
|
119
|
+
print("Batch Targets:", batch_targets.shape)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Dataset Statistics and Preview
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
# Get basic statistics
|
|
126
|
+
stats = dataset.statistics()
|
|
127
|
+
print("Statistics:", stats)
|
|
128
|
+
|
|
129
|
+
# Preview the dataset
|
|
130
|
+
preview = dataset.preview(n=5)
|
|
131
|
+
print("Preview:", preview)
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### Registering and Accessing Datasets
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from chelo.registry import DatasetRegistry
|
|
138
|
+
|
|
139
|
+
# List available datasets
|
|
140
|
+
print("Available Datasets:", DatasetRegistry.list_datasets())
|
|
141
|
+
|
|
142
|
+
# Retrieve and load a dataset by name
|
|
143
|
+
dataset = DatasetRegistry.get_dataset("WineQualityDataset", wine_type="red")
|
|
144
|
+
dataset.load_data()
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## Extending the Library
|
|
148
|
+
|
|
149
|
+
To add a new dataset, create a new class that inherits from `ChemicalEngineeringDataset` and implement the required methods:
|
|
150
|
+
|
|
151
|
+
1. **Create a new dataset module:**
|
|
152
|
+
|
|
153
|
+
```plaintext
|
|
154
|
+
chelo/datasets/my_new_dataset.py
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
2. **Implement the dataset class:**
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
from ..base import ChemicalEngineeringDataset
|
|
161
|
+
|
|
162
|
+
class MyNewDataset(ChemicalEngineeringDataset):
|
|
163
|
+
def __init__(self, selected_features=None, selected_targets=None):
|
|
164
|
+
super().__init__(selected_features, selected_targets)
|
|
165
|
+
self.dataset_name = "My New Dataset"
|
|
166
|
+
|
|
167
|
+
def load_data(self):
|
|
168
|
+
# Load dataset into self.raw_features and self.raw_targets
|
|
169
|
+
pass
|
|
170
|
+
|
|
171
|
+
def list_features(self):
|
|
172
|
+
return list(self.raw_features.keys())
|
|
173
|
+
|
|
174
|
+
def list_targets(self):
|
|
175
|
+
return list(self.raw_targets.keys())
|
|
176
|
+
|
|
177
|
+
def get_dataset_info(self):
|
|
178
|
+
return {"name": self.dataset_name, "description": "Description of the dataset."}
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
3. **Register the dataset:**
|
|
182
|
+
|
|
183
|
+
Add the following line to `chelo/datasets/__init__.py`:
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
from .my_new_dataset import MyNewDataset
|
|
187
|
+
DatasetRegistry.register(MyNewDataset)
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
## Advanced Features
|
|
191
|
+
|
|
192
|
+
### Downloader Utility
|
|
193
|
+
|
|
194
|
+
The library includes a downloader utility for downloading and caching datasets. Files are stored in a structured cache directory (default: `~/.chelo`).
|
|
195
|
+
|
|
196
|
+
#### Example Usage
|
|
197
|
+
|
|
198
|
+
```python
|
|
199
|
+
from chelo.utils.downloader import DatasetDownloader
|
|
200
|
+
|
|
201
|
+
downloader = DatasetDownloader()
|
|
202
|
+
|
|
203
|
+
# Download a dataset file
|
|
204
|
+
url = "https://example.com/dataset.csv"
|
|
205
|
+
file_path = downloader.download(url, dataset_name="example_dataset", filename="example.csv")
|
|
206
|
+
print("Downloaded file path:", file_path)
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
### Dataset Registry
|
|
210
|
+
|
|
211
|
+
The registry dynamically manages available datasets, allowing users to list and retrieve datasets by name.
|
|
212
|
+
|
|
213
|
+
#### Example Usage
|
|
214
|
+
|
|
215
|
+
```python
|
|
216
|
+
from chelo.registry import DatasetRegistry
|
|
217
|
+
|
|
218
|
+
# List all registered datasets
|
|
219
|
+
print("Available Datasets:", DatasetRegistry.list_datasets())
|
|
220
|
+
|
|
221
|
+
# Retrieve a dataset by name
|
|
222
|
+
dataset = DatasetRegistry.get_dataset("WineQualityDataset", wine_type="white")
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
## Testing
|
|
226
|
+
|
|
227
|
+
The library includes comprehensive unit tests to ensure correctness and reliability. Run tests using `pytest`:
|
|
228
|
+
|
|
229
|
+
```bash
|
|
230
|
+
pytest tests/
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
## Contributing
|
|
234
|
+
|
|
235
|
+
Contributions are welcome! To contribute:
|
|
236
|
+
|
|
237
|
+
1. Fork the repository.
|
|
238
|
+
2. Create a feature branch.
|
|
239
|
+
3. Implement your changes and add tests.
|
|
240
|
+
4. Submit a pull request with a detailed description of your changes.
|
|
241
|
+
|
|
242
|
+
## License
|
|
243
|
+
|
|
244
|
+
This library is licensed under the MIT License. See the `LICENSE` file for more details.
|
|
245
|
+
|
|
246
|
+
## Contact
|
|
247
|
+
|
|
248
|
+
For questions or feedback, please contact Nikolaos Passalis <passalis@auth.gr>.
|
|
249
|
+
|
chelo-0.0.1/README.md
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
# CheLo (Chemical Engineering Dataset Loader) Library
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
The CheLo Library is a Python library designed to simplify the exploration of data-driven modeling for students studying chemical engineering and researchers working on related projects.
|
|
5
|
+
This library provides a standardized framework for accessing, managing, and utilizing chemical engineering datasets for machine learning and statistical analysis.
|
|
6
|
+
|
|
7
|
+
## Key Features
|
|
8
|
+
- **Dataset Standardization**: Unified API for accessing and exploring datasets.
|
|
9
|
+
- **Multiple Data Formats**: Provides ready to use loaders for numpy, PyTorch, and Keras.
|
|
10
|
+
- **Preprocessing Tools**: Methods for feature/target selection, statistics, and previewing datasets.
|
|
11
|
+
- **Dataset Management**: Automated downloading, caching, and registry of datasets.
|
|
12
|
+
- **Extensibility**: Abstract base class for easy addition of new datasets.
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
To install the library, run the following command:
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install chelo
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
To install the library in editable mode for development purposes:
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
git clone https://github.com/your-repo/chelo.git
|
|
26
|
+
cd chelo
|
|
27
|
+
pip install -e .
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Package Structure
|
|
31
|
+
|
|
32
|
+
```plaintext
|
|
33
|
+
chelo/ # Root package
|
|
34
|
+
├── __init__.py # Exposes core components
|
|
35
|
+
├── base.py # Abstract base class and shared utilities
|
|
36
|
+
├── datasets/ # Dataset-specific implementations
|
|
37
|
+
│ └── ... # Dataset implementations
|
|
38
|
+
├── utils/ # Utility functions and helpers
|
|
39
|
+
│ ├── __init__.py # Utility imports
|
|
40
|
+
│   └── downloader.py   # Dataset downloader and caching
|
|
41
|
+
├── registry.py # Dataset registry
|
|
42
|
+
└── tests/ # Unit and integration tests
|
|
43
|
+
├── __init__.py # Makes this directory a package
|
|
44
|
+
├── test_base.py # Tests for the base class
|
|
45
|
+
└── test_X.py # Tests for X dataset
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Usage Guide
|
|
49
|
+
|
|
50
|
+
### Loading a Dataset
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
from chelo.datasets.wine_quality import WineQualityDataset
|
|
54
|
+
|
|
55
|
+
# Instantiate the dataset
|
|
56
|
+
dataset = WineQualityDataset(wine_type="red", selected_features=["alcohol", "pH"], selected_targets=["quality"])
|
|
57
|
+
|
|
58
|
+
# Load data (downloads if not cached)
|
|
59
|
+
dataset.load_data()
|
|
60
|
+
|
|
61
|
+
# Access dataset information
|
|
62
|
+
info = dataset.get_dataset_info()
|
|
63
|
+
print("Dataset Info:", info)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Accessing Data
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
# Convert to numpy arrays
|
|
70
|
+
features, targets = dataset.to_numpy()
|
|
71
|
+
print("Features shape:", features.shape)
|
|
72
|
+
print("Targets shape:", targets.shape)
|
|
73
|
+
|
|
74
|
+
# Convert to PyTorch Dataset
|
|
75
|
+
pytorch_dataset = dataset.to_pytorch()
|
|
76
|
+
print("Number of samples in PyTorch Dataset:", len(pytorch_dataset))
|
|
77
|
+
|
|
78
|
+
# Convert to Keras Sequence
|
|
79
|
+
keras_sequence = dataset.to_keras(batch_size=32)
|
|
80
|
+
for batch_features, batch_targets in keras_sequence:
|
|
81
|
+
print("Batch Features:", batch_features.shape)
|
|
82
|
+
print("Batch Targets:", batch_targets.shape)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Dataset Statistics and Preview
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
# Get basic statistics
|
|
89
|
+
stats = dataset.statistics()
|
|
90
|
+
print("Statistics:", stats)
|
|
91
|
+
|
|
92
|
+
# Preview the dataset
|
|
93
|
+
preview = dataset.preview(n=5)
|
|
94
|
+
print("Preview:", preview)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Registering and Accessing Datasets
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
from chelo.registry import DatasetRegistry
|
|
101
|
+
|
|
102
|
+
# List available datasets
|
|
103
|
+
print("Available Datasets:", DatasetRegistry.list_datasets())
|
|
104
|
+
|
|
105
|
+
# Retrieve and load a dataset by name
|
|
106
|
+
dataset = DatasetRegistry.get_dataset("WineQualityDataset", wine_type="red")
|
|
107
|
+
dataset.load_data()
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## Extending the Library
|
|
111
|
+
|
|
112
|
+
To add a new dataset, create a new class that inherits from `CheLoDataset` (defined in `base.py`) and implement the required methods:
|
|
113
|
+
|
|
114
|
+
1. **Create a new dataset module:**
|
|
115
|
+
|
|
116
|
+
```plaintext
|
|
117
|
+
chelo/datasets/my_new_dataset.py
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
2. **Implement the dataset class:**
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
from ..base import CheLoDataset
|
|
124
|
+
|
|
125
|
+
class MyNewDataset(CheLoDataset):
|
|
126
|
+
def __init__(self, selected_features=None, selected_targets=None):
|
|
127
|
+
super().__init__(selected_features, selected_targets)
|
|
128
|
+
self.dataset_name = "My New Dataset"
|
|
129
|
+
|
|
130
|
+
def load_data(self):
|
|
131
|
+
# Load dataset into self.raw_features and self.raw_targets
|
|
132
|
+
pass
|
|
133
|
+
|
|
134
|
+
def list_features(self):
|
|
135
|
+
return list(self.raw_features.keys())
|
|
136
|
+
|
|
137
|
+
def list_targets(self):
|
|
138
|
+
return list(self.raw_targets.keys())
|
|
139
|
+
|
|
140
|
+
def get_dataset_info(self):
|
|
141
|
+
return {"name": self.dataset_name, "description": "Description of the dataset."}
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
3. **Register the dataset:**
|
|
145
|
+
|
|
146
|
+
Add the following line to `chelo/datasets/__init__.py`:
|
|
147
|
+
|
|
148
|
+
```python
|
|
149
|
+
from .my_new_dataset import MyNewDataset
|
|
150
|
+
DatasetRegistry.register(MyNewDataset)
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## Advanced Features
|
|
154
|
+
|
|
155
|
+
### Downloader Utility
|
|
156
|
+
|
|
157
|
+
The library includes a downloader utility for downloading and caching datasets. Files are stored in a structured cache directory (default: `~/.chelo`).
|
|
158
|
+
|
|
159
|
+
#### Example Usage
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
from chelo.utils.downloader import DatasetDownloader
|
|
163
|
+
|
|
164
|
+
downloader = DatasetDownloader()
|
|
165
|
+
|
|
166
|
+
# Download a dataset file
|
|
167
|
+
url = "https://example.com/dataset.csv"
|
|
168
|
+
file_path = downloader.download(url, dataset_name="example_dataset", filename="example.csv")
|
|
169
|
+
print("Downloaded file path:", file_path)
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
### Dataset Registry
|
|
173
|
+
|
|
174
|
+
The registry dynamically manages available datasets, allowing users to list and retrieve datasets by name.
|
|
175
|
+
|
|
176
|
+
#### Example Usage
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
from chelo.registry import DatasetRegistry
|
|
180
|
+
|
|
181
|
+
# List all registered datasets
|
|
182
|
+
print("Available Datasets:", DatasetRegistry.list_datasets())
|
|
183
|
+
|
|
184
|
+
# Retrieve a dataset by name
|
|
185
|
+
dataset = DatasetRegistry.get_dataset("WineQualityDataset", wine_type="white")
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
## Testing
|
|
189
|
+
|
|
190
|
+
The library includes comprehensive unit tests to ensure correctness and reliability. Run tests using `pytest`:
|
|
191
|
+
|
|
192
|
+
```bash
|
|
193
|
+
pytest tests/
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
## Contributing
|
|
197
|
+
|
|
198
|
+
Contributions are welcome! To contribute:
|
|
199
|
+
|
|
200
|
+
1. Fork the repository.
|
|
201
|
+
2. Create a feature branch.
|
|
202
|
+
3. Implement your changes and add tests.
|
|
203
|
+
4. Submit a pull request with a detailed description of your changes.
|
|
204
|
+
|
|
205
|
+
## License
|
|
206
|
+
|
|
207
|
+
This library is licensed under the MIT License. See the `LICENSE` file for more details.
|
|
208
|
+
|
|
209
|
+
## Contact
|
|
210
|
+
|
|
211
|
+
For questions or feedback, please contact Nikolaos Passalis <passalis@auth.gr>.
|
|
212
|
+
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: chelo
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Chemical Engineering Dataset Loader Library
|
|
5
|
+
Author-email: Nikolaos Passalis <passalis@auth.gr>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2025 Nikolaos Passalis
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Classifier: Programming Language :: Python :: 3
|
|
29
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
30
|
+
Classifier: Operating System :: OS Independent
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
License-File: LICENSE
|
|
33
|
+
Requires-Dist: numpy
|
|
34
|
+
Requires-Dist: pandas
|
|
35
|
+
Requires-Dist: tqdm
|
|
36
|
+
Requires-Dist: requests
|
|
37
|
+
|
|
38
|
+
# CheLo (Chemical Engineering Dataset Loader) Library
|
|
39
|
+
|
|
40
|
+
## Overview
|
|
41
|
+
The CheLo Library is a Python library designed to simplify the exploration of data-driven modeling for students studying chemical engineering and researchers working on related projects.
|
|
42
|
+
This library provides a standardized framework for accessing, managing, and utilizing chemical engineering datasets for machine learning and statistical analysis.
|
|
43
|
+
|
|
44
|
+
## Key Features
|
|
45
|
+
- **Dataset Standardization**: Unified API for accessing and exploring datasets.
|
|
46
|
+
- **Multiple Data Formats**: Provides ready to use loaders for numpy, PyTorch, and Keras.
|
|
47
|
+
- **Preprocessing Tools**: Methods for feature/target selection, statistics, and previewing datasets.
|
|
48
|
+
- **Dataset Management**: Automated downloading, caching, and registry of datasets.
|
|
49
|
+
- **Extensibility**: Abstract base class for easy addition of new datasets.
|
|
50
|
+
|
|
51
|
+
## Installation
|
|
52
|
+
|
|
53
|
+
To install the library, run the following command:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pip install chelo
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
To install the library in editable mode for development purposes:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
git clone https://github.com/your-repo/chelo.git
|
|
63
|
+
cd chelo
|
|
64
|
+
pip install -e .
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Package Structure
|
|
68
|
+
|
|
69
|
+
```plaintext
|
|
70
|
+
chelo/ # Root package
|
|
71
|
+
├── __init__.py # Exposes core components
|
|
72
|
+
├── base.py # Abstract base class and shared utilities
|
|
73
|
+
├── datasets/ # Dataset-specific implementations
|
|
74
|
+
│ └── ... # Dataset implementations
|
|
75
|
+
├── utils/ # Utility functions and helpers
|
|
76
|
+
│ ├── __init__.py # Utility imports
|
|
77
|
+
│ └── download.py # Dataset downloader and caching
|
|
78
|
+
├── registry.py # Dataset registry
|
|
79
|
+
└── tests/ # Unit and integration tests
|
|
80
|
+
├── __init__.py # Makes this directory a package
|
|
81
|
+
├── test_base.py # Tests for the base class
|
|
82
|
+
└── test_X.py # Tests for X dataset
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Usage Guide
|
|
86
|
+
|
|
87
|
+
### Loading a Dataset
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from chelo.datasets.wine_quality import WineQualityDataset
|
|
91
|
+
|
|
92
|
+
# Instantiate the dataset
|
|
93
|
+
dataset = WineQualityDataset(wine_type="red", selected_features=["alcohol", "pH"], selected_targets=["quality"])
|
|
94
|
+
|
|
95
|
+
# Load data (downloads if not cached)
|
|
96
|
+
dataset.load_data()
|
|
97
|
+
|
|
98
|
+
# Access dataset information
|
|
99
|
+
info = dataset.get_dataset_info()
|
|
100
|
+
print("Dataset Info:", info)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Accessing Data
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
# Convert to numpy arrays
|
|
107
|
+
features, targets = dataset.to_numpy()
|
|
108
|
+
print("Features shape:", features.shape)
|
|
109
|
+
print("Targets shape:", targets.shape)
|
|
110
|
+
|
|
111
|
+
# Convert to PyTorch Dataset
|
|
112
|
+
pytorch_dataset = dataset.to_pytorch()
|
|
113
|
+
print("Number of samples in PyTorch Dataset:", len(pytorch_dataset))
|
|
114
|
+
|
|
115
|
+
# Convert to Keras Sequence
|
|
116
|
+
keras_sequence = dataset.to_keras(batch_size=32)
|
|
117
|
+
for batch_features, batch_targets in keras_sequence:
|
|
118
|
+
print("Batch Features:", batch_features.shape)
|
|
119
|
+
print("Batch Targets:", batch_targets.shape)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Dataset Statistics and Preview
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
# Get basic statistics
|
|
126
|
+
stats = dataset.statistics()
|
|
127
|
+
print("Statistics:", stats)
|
|
128
|
+
|
|
129
|
+
# Preview the dataset
|
|
130
|
+
preview = dataset.preview(n=5)
|
|
131
|
+
print("Preview:", preview)
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### Registering and Accessing Datasets
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from chelo.registry import DatasetRegistry
|
|
138
|
+
|
|
139
|
+
# List available datasets
|
|
140
|
+
print("Available Datasets:", DatasetRegistry.list_datasets())
|
|
141
|
+
|
|
142
|
+
# Retrieve and load a dataset by name
|
|
143
|
+
dataset = DatasetRegistry.get_dataset("WineQualityDataset", wine_type="red")
|
|
144
|
+
dataset.load_data()
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## Extending the Library
|
|
148
|
+
|
|
149
|
+
To add a new dataset, create a new class that inherits from `ChemicalEngineeringDataset` and implement the required methods:
|
|
150
|
+
|
|
151
|
+
1. **Create a new dataset module:**
|
|
152
|
+
|
|
153
|
+
```plaintext
|
|
154
|
+
chelo/datasets/my_new_dataset.py
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
2. **Implement the dataset class:**
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
from ..base import ChemicalEngineeringDataset
|
|
161
|
+
|
|
162
|
+
class MyNewDataset(ChemicalEngineeringDataset):
|
|
163
|
+
def __init__(self, selected_features=None, selected_targets=None):
|
|
164
|
+
super().__init__(selected_features, selected_targets)
|
|
165
|
+
self.dataset_name = "My New Dataset"
|
|
166
|
+
|
|
167
|
+
def load_data(self):
|
|
168
|
+
# Load dataset into self.raw_features and self.raw_targets
|
|
169
|
+
pass
|
|
170
|
+
|
|
171
|
+
def list_features(self):
|
|
172
|
+
return list(self.raw_features.keys())
|
|
173
|
+
|
|
174
|
+
def list_targets(self):
|
|
175
|
+
return list(self.raw_targets.keys())
|
|
176
|
+
|
|
177
|
+
def get_dataset_info(self):
|
|
178
|
+
return {"name": self.dataset_name, "description": "Description of the dataset."}
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
3. **Register the dataset:**
|
|
182
|
+
|
|
183
|
+
Add the following line to `chelo/datasets/__init__.py`:
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
from .my_new_dataset import MyNewDataset
|
|
187
|
+
DatasetRegistry.register(MyNewDataset)
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
## Advanced Features
|
|
191
|
+
|
|
192
|
+
### Downloader Utility
|
|
193
|
+
|
|
194
|
+
The library includes a downloader utility for downloading and caching datasets. Files are stored in a structured cache directory (default: `~/.chelo`).
|
|
195
|
+
|
|
196
|
+
#### Example Usage
|
|
197
|
+
|
|
198
|
+
```python
|
|
199
|
+
from chelo.utils.downloader import DatasetDownloader
|
|
200
|
+
|
|
201
|
+
downloader = DatasetDownloader()
|
|
202
|
+
|
|
203
|
+
# Download a dataset file
|
|
204
|
+
url = "https://example.com/dataset.csv"
|
|
205
|
+
file_path = downloader.download(url, dataset_name="example_dataset", filename="example.csv")
|
|
206
|
+
print("Downloaded file path:", file_path)
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
### Dataset Registry
|
|
210
|
+
|
|
211
|
+
The registry dynamically manages available datasets, allowing users to list and retrieve datasets by name.
|
|
212
|
+
|
|
213
|
+
#### Example Usage
|
|
214
|
+
|
|
215
|
+
```python
|
|
216
|
+
from chelo.registry import DatasetRegistry
|
|
217
|
+
|
|
218
|
+
# List all registered datasets
|
|
219
|
+
print("Available Datasets:", DatasetRegistry.list_datasets())
|
|
220
|
+
|
|
221
|
+
# Retrieve a dataset by name
|
|
222
|
+
dataset = DatasetRegistry.get_dataset("WineQualityDataset", wine_type="white")
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
## Testing
|
|
226
|
+
|
|
227
|
+
The library includes comprehensive unit tests to ensure correctness and reliability. Run tests using `pytest`:
|
|
228
|
+
|
|
229
|
+
```bash
|
|
230
|
+
pytest tests/
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
## Contributing
|
|
234
|
+
|
|
235
|
+
Contributions are welcome! To contribute:
|
|
236
|
+
|
|
237
|
+
1. Fork the repository.
|
|
238
|
+
2. Create a feature branch.
|
|
239
|
+
3. Implement your changes and add tests.
|
|
240
|
+
4. Submit a pull request with a detailed description of your changes.
|
|
241
|
+
|
|
242
|
+
## License
|
|
243
|
+
|
|
244
|
+
This library is licensed under the MIT License. See the `LICENSE` file for more details.
|
|
245
|
+
|
|
246
|
+
## Contact
|
|
247
|
+
|
|
248
|
+
For questions or feedback, please contact Nikolaos Passalis <passalis@auth.gr>.
|
|
249
|
+
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
MANIFEST.in
|
|
3
|
+
README.md
|
|
4
|
+
pyproject.toml
|
|
5
|
+
chelo.egg-info/PKG-INFO
|
|
6
|
+
chelo.egg-info/SOURCES.txt
|
|
7
|
+
chelo.egg-info/dependency_links.txt
|
|
8
|
+
chelo.egg-info/requires.txt
|
|
9
|
+
chelo.egg-info/top_level.txt
|
|
10
|
+
src/__init__.py
|
|
11
|
+
src/base.py
|
|
12
|
+
src/registry.py
|
|
13
|
+
src/datasets/__init__.py
|
|
14
|
+
src/datasets/wine_quality.py
|
|
15
|
+
src/utils/__init__.py
|
|
16
|
+
src/utils/downloader.py
|
|
17
|
+
tests/test_base.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
chelo
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[tool.setuptools]
|
|
6
|
+
package-dir = {"chelo" = "src"}
|
|
7
|
+
|
|
8
|
+
[project]
|
|
9
|
+
name = "chelo"
|
|
10
|
+
version = "0.0.1"
|
|
11
|
+
description = "Chemical Engineering Dataset Loader Library"
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
license = {file = "LICENSE"}
|
|
14
|
+
authors = [
|
|
15
|
+
{ name = "Nikolaos Passalis", email = "passalis@auth.gr" }
|
|
16
|
+
]
|
|
17
|
+
dependencies = [
|
|
18
|
+
"numpy",
|
|
19
|
+
"pandas",
|
|
20
|
+
"tqdm",
|
|
21
|
+
"requests",
|
|
22
|
+
]
|
|
23
|
+
classifiers = [
|
|
24
|
+
"Programming Language :: Python :: 3",
|
|
25
|
+
"License :: OSI Approved :: MIT License",
|
|
26
|
+
"Operating System :: OS Independent",
|
|
27
|
+
]
|
chelo-0.0.1/setup.cfg
ADDED
|
File without changes
|
chelo-0.0.1/src/base.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class CheLoDataset(ABC):
    """
    Abstract base class for CheLo datasets.

    Subclasses populate ``self.raw_features`` and ``self.raw_targets``
    (dicts mapping column name -> list of sample values) in ``load_data``
    and then call ``_apply_initial_selections`` to honor any feature or
    target selection requested at construction time.
    """

    def __init__(self, selected_features=None, selected_targets=None):
        """
        Initialize the dataset with optional selected features and targets.

        :param selected_features: List of feature names to select (default: all).
        :param selected_targets: List of target names to select (default: all).
        """
        self.raw_features = None  # Immutable raw feature data (dict: name -> values)
        self.raw_targets = None   # Immutable raw target data (dict: name -> values)
        self.features = None      # Subset of features currently in use
        self.targets = None       # Subset of targets currently in use
        self.dataset_name = None  # Human-readable name of the dataset

        self._selected_features = selected_features
        self._selected_targets = selected_targets

    @abstractmethod
    def load_data(self):
        """
        Load the dataset and populate self.raw_features and self.raw_targets.
        """

    @abstractmethod
    def list_features(self):
        """
        List available features in the dataset.
        """

    @abstractmethod
    def list_targets(self):
        """
        List available targets in the dataset.
        """

    @abstractmethod
    def get_dataset_info(self):
        """
        Provide metadata about the dataset (e.g., source, size, description).
        """

    def select_features(self, feature_names):
        """
        Dynamically select features from the dataset.

        :param feature_names: List of feature names to select.
        :raises ValueError: If the dataset is not loaded yet or a requested
            feature does not exist.
        """
        # `is None` rather than truthiness: an empty-but-loaded dict is a
        # valid state and must not be reported as "not loaded".
        if self.raw_features is None:
            raise ValueError(f"Dataset {self.dataset_name} not loaded yet!")
        missing = [name for name in feature_names if name not in self.raw_features]
        if missing:
            raise ValueError(
                f"Unknown feature(s) {missing}. Available: {list(self.raw_features)}"
            )
        self.features = {name: self.raw_features[name] for name in feature_names}

    def select_targets(self, target_names):
        """
        Dynamically select targets from the dataset.

        :param target_names: List of target names to select.
        :raises ValueError: If the dataset is not loaded yet or a requested
            target does not exist.
        """
        if self.raw_targets is None:
            raise ValueError(f"Dataset {self.dataset_name} not loaded yet!")
        missing = [name for name in target_names if name not in self.raw_targets]
        if missing:
            raise ValueError(
                f"Unknown target(s) {missing}. Available: {list(self.raw_targets)}"
            )
        self.targets = {name: self.raw_targets[name] for name in target_names}

    def _apply_initial_selections(self):
        """
        Apply initial selections if specified during initialization.

        Called by subclasses at the end of ``load_data``.
        """
        if self._selected_features:
            self.select_features(self._selected_features)
        else:
            self.features = self.raw_features

        if self._selected_targets:
            self.select_targets(self._selected_targets)
        else:
            self.targets = self.raw_targets

    def size(self):
        """
        Get the size of the dataset (number of samples).

        :return: Number of samples, or 0 if no features are selected.
        """
        # Guard against an empty selection: next() on an empty iterator
        # would otherwise raise StopIteration.
        if not self.features:
            return 0
        return len(next(iter(self.features.values())))

    def statistics(self):
        """
        Compute basic statistics for the features and targets.

        :return: A dictionary of statistics (mean, std, min, max) for each
            feature and target column.
        """
        stats = {}
        for key, values in {**self.features, **self.targets}.items():
            stats[key] = {
                "mean": np.mean(values),
                "std": np.std(values),
                "min": np.min(values),
                "max": np.max(values),
            }
        return stats

    def to_numpy(self):
        """
        Convert the dataset to numpy arrays.

        :return: Tuple of (features, targets), each of shape (n_samples, n_columns).
        """
        return np.array(list(self.features.values())).T, np.array(list(self.targets.values())).T

    def to_pytorch(self):
        """
        Provide a PyTorch Dataset object.

        :return: A PyTorch Dataset yielding (feature_row, target_row) pairs.
        """
        # Imported lazily so torch is only required when this method is used.
        from torch.utils.data import Dataset

        class PyTorchDataset(Dataset):
            def __init__(self, features, targets):
                self.features = features
                self.targets = targets

            def __len__(self):
                return len(self.features)

            def __getitem__(self, idx):
                return self.features[idx], self.targets[idx]

        return PyTorchDataset(np.array(list(self.features.values())).T,
                              np.array(list(self.targets.values())).T)

    def to_keras(self, batch_size=32):
        """
        Provide a Keras Sequence object for training.

        :param batch_size: Number of samples per batch.
        :return: A Keras Sequence yielding (features, targets) batches.
        """
        # Imported lazily so tensorflow is only required when this method is used.
        from tensorflow.keras.utils import Sequence

        class KerasSequence(Sequence):
            def __init__(self, features, targets, batch_size):
                self.features = np.array(list(features.values())).T
                self.targets = np.array(list(targets.values())).T
                self.batch_size = batch_size

            def __len__(self):
                # Number of batches; the final batch may be smaller.
                return int(np.ceil(len(self.features) / self.batch_size))

            def __getitem__(self, idx):
                start_idx = idx * self.batch_size
                end_idx = start_idx + self.batch_size
                return self.features[start_idx:end_idx], self.targets[start_idx:end_idx]

        return KerasSequence(self.features, self.targets, batch_size)

    def preview(self, n=5):
        """
        Preview the first n rows of the dataset.

        :param n: Number of rows to include per column.
        :return: Dict with "features" and "targets" slices.
        """
        preview_data = {
            "features": {key: values[:n] for key, values in self.features.items()},
            "targets": {key: values[:n] for key, values in self.targets.items()},
        }
        return preview_data
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# import pkgutil
|
|
2
|
+
# import importlib
|
|
3
|
+
# from ..registry import DatasetRegistry
|
|
4
|
+
# from ..base import CheLoDataset
|
|
5
|
+
|
|
6
|
+
# def auto_register_datasets():
|
|
7
|
+
# """
|
|
8
|
+
# Automatically discover and register datasets in the `datasets` package.
|
|
9
|
+
# """
|
|
10
|
+
# for _, module_name, _ in pkgutil.iter_modules(__path__):
|
|
11
|
+
# module = importlib.import_module(f"{__name__}.{module_name}")
|
|
12
|
+
# for attr_name in dir(module):
|
|
13
|
+
# attr = getattr(module, attr_name)
|
|
14
|
+
# if isinstance(attr, type) and issubclass(attr, CheLoDataset) and attr is not CheLoDataset:
|
|
15
|
+
# DatasetRegistry.register(attr)
|
|
16
|
+
#
|
|
17
|
+
# auto_register_datasets()
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from ..base import CheLoDataset
|
|
2
|
+
from ..registry import register_dataset
|
|
3
|
+
from ..utils.downloader import DatasetDownloader
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
@register_dataset
class WineQualityDataset(CheLoDataset):
    """
    UCI Wine Quality dataset (red or white variant).

    Features are the physicochemical measurements from the CSV; the single
    target is the integer ``quality`` rating.
    """

    # Base location of the UCI "wine quality" CSV files.
    BASE_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/"
    FILES = {
        "red": "winequality-red.csv",
        "white": "winequality-white.csv",
    }
    # MD5 checksums used to validate cached/downloaded files.
    CHECKSUMS = {
        "red": "2daeecee174368f8a33b82c8cccae3a5",
        "white": "5d9ff0f7f716dace19e3ab4578775fd7",
    }

    def __init__(self, wine_type="red", selected_features=None, selected_targets=None):
        """
        Initialize the Wine Quality Dataset.

        :param wine_type: Type of wine ('red' or 'white').
        :param selected_features: Features to select (default: all).
        :param selected_targets: Targets to select (default: all).
        :raises ValueError: If wine_type is not 'red' or 'white'.
        """
        super().__init__(selected_features, selected_targets)
        if wine_type not in self.FILES:
            raise ValueError(f"Invalid wine_type '{wine_type}'. Must be 'red' or 'white'.")
        self.wine_type = wine_type
        self.dataset_name = f"Wine Quality ({wine_type.capitalize()})"

    def load_data(self):
        """
        Load the dataset from the UCI repository or cache.
        """
        downloader = DatasetDownloader()
        file_url = self.BASE_URL + self.FILES[self.wine_type]
        file_path = downloader.download(file_url, dataset_name="wine_quality", filename=self.FILES[self.wine_type],
                                        checksum=self.CHECKSUMS[self.wine_type])

        # The UCI wine-quality CSVs are semicolon-delimited.
        data = pd.read_csv(file_path, sep=";")
        self.raw_features = data.drop(columns=["quality"]).to_dict(orient="list")
        self.raw_targets = {"quality": data["quality"].tolist()}
        self._apply_initial_selections()

    def list_features(self):
        """
        List the available features in the dataset.

        :return: List of feature names.
        :raises ValueError: If the dataset has not been loaded yet.
        """
        # Guard with a clear error instead of an AttributeError on None.
        if self.raw_features is None:
            raise ValueError(f"Dataset {self.dataset_name} not loaded yet!")
        return list(self.raw_features.keys())

    def list_targets(self):
        """
        List the available targets in the dataset.

        :return: List of target names.
        :raises ValueError: If the dataset has not been loaded yet.
        """
        if self.raw_targets is None:
            raise ValueError(f"Dataset {self.dataset_name} not loaded yet!")
        return list(self.raw_targets.keys())

    def get_dataset_info(self):
        """
        Get metadata about the dataset.

        :return: A dictionary containing dataset metadata.
        """
        return {
            "name": self.dataset_name,
            "description": "Dataset containing physicochemical attributes and quality ratings of wines.",
            "wine_type": self.wine_type,
            "features": self.list_features(),
            "targets": self.list_targets(),
        }
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
|
|
2
|
+
class DatasetRegistry:
    """Registry of the dataset classes available in CheLo, keyed by class name."""

    _datasets = {}

    @classmethod
    def register(cls, dataset_cls):
        """
        Register *dataset_cls* under its class name.

        :param dataset_cls: The dataset class to register.
        :raises ValueError: If a class with the same name is already registered.
        """
        key = dataset_cls.__name__
        if key in cls._datasets:
            raise ValueError(f"Dataset {key} is already registered.")
        cls._datasets[key] = dataset_cls

    @classmethod
    def list_datasets(cls):
        """
        List all registered datasets.

        :return: A list of names of registered datasets.
        """
        return [*cls._datasets]

    @classmethod
    def get_dataset(cls, name, **kwargs):
        """
        Build an instance of the dataset registered under *name*.

        :param name: Name of the dataset to retrieve.
        :param kwargs: Additional arguments forwarded to the dataset constructor.
        :return: An instance of the dataset.
        :raises ValueError: If no dataset of that name is registered.
        """
        dataset_cls = cls._datasets.get(name)
        if dataset_cls is None:
            raise ValueError(f"Dataset {name} not found! Available datasets: {cls.list_datasets()}")
        return dataset_cls(**kwargs)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def register_dataset(cls):
    """Class decorator: add *cls* to the DatasetRegistry and return it unchanged."""
    DatasetRegistry.register(cls)
    return cls
|
|
File without changes
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import hashlib
|
|
3
|
+
import requests
|
|
4
|
+
from tqdm import tqdm
|
|
5
|
+
import tarfile
|
|
6
|
+
import zipfile
|
|
7
|
+
|
|
8
|
+
class DatasetDownloader:
    """
    Utility class for downloading and caching datasets.

    Files are cached under ``cache_dir/<dataset_name>/<filename>`` and are
    re-downloaded only when missing or failing checksum validation.
    """

    def __init__(self, cache_dir=None):
        """
        Initialize the downloader with an optional cache directory.

        :param cache_dir: Directory to store downloaded datasets. Falls back
            to the ``CHELO_DATASET_CACHE`` environment variable, then to
            ``~/.chelo_datasets``.
        """
        self.cache_dir = cache_dir or os.getenv("CHELO_DATASET_CACHE", os.path.expanduser("~/.chelo_datasets"))
        os.makedirs(self.cache_dir, exist_ok=True)

    def _get_dataset_dir(self, dataset_name):
        """
        Get (and create if needed) the directory path for a specific dataset.

        :param dataset_name: Name of the dataset.
        :return: Path to the dataset's directory.
        """
        dataset_dir = os.path.join(self.cache_dir, dataset_name)
        os.makedirs(dataset_dir, exist_ok=True)
        return dataset_dir

    def _get_file_path(self, dataset_name, filename):
        """
        Get the full path for a file in the dataset's directory.

        :param dataset_name: Name of the dataset.
        :param filename: Name of the file.
        :return: Full path to the file.
        """
        dataset_dir = self._get_dataset_dir(dataset_name)
        return os.path.join(dataset_dir, filename)

    def download(self, url, dataset_name, filename=None, checksum=None):
        """
        Download a file for a specific dataset and save it in the dataset's folder.

        :param url: URL of the file to download.
        :param dataset_name: Name of the dataset.
        :param filename: Local filename (default: inferred from the URL).
        :param checksum: Expected checksum (MD5 or SHA256) to validate the file (optional).
        :return: Path to the downloaded file.
        :raises ValueError: If the downloaded file fails checksum verification.
        """
        filename = filename or os.path.basename(url)
        file_path = self._get_file_path(dataset_name, filename)

        # Reuse the cached copy unless its checksum no longer matches.
        if os.path.exists(file_path):
            if checksum and not self._verify_checksum(file_path, checksum):
                print("Checksum mismatch! Redownloading the file.")
            else:
                return file_path

        # Download the file. A timeout bounds the wait on a dead server, and
        # the `with` block guarantees the connection is released.
        print(f"Downloading '{filename}' for dataset '{dataset_name}' from {url}...")
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()

            # Save the file with progress bar.
            with open(file_path, "wb") as file, tqdm(
                total=int(response.headers.get("content-length", 0)),
                unit="B",
                unit_scale=True,
                desc=f"Downloading {filename}",
            ) as progress:
                for chunk in response.iter_content(chunk_size=1024):
                    file.write(chunk)
                    progress.update(len(chunk))

        # Verify checksum of the freshly downloaded file.
        if checksum and not self._verify_checksum(file_path, checksum):
            raise ValueError(f"Checksum verification failed for '{filename}'.")

        print(f"File downloaded and saved at '{file_path}'.")
        return file_path

    def _verify_checksum(self, file_path, checksum):
        """
        Verify the checksum of a file.

        :param file_path: Path to the file.
        :param checksum: Expected checksum; a 64-character hex digest is
            treated as SHA256, anything else as MD5.
        :return: True if the checksum matches, False otherwise.
        """
        hasher = hashlib.sha256() if len(checksum) == 64 else hashlib.md5()
        # Hash in fixed-size chunks so large files need not fit in memory.
        with open(file_path, "rb") as file:
            for block in iter(lambda: file.read(1 << 20), b""):
                hasher.update(block)
        return hasher.hexdigest() == checksum

    def extract_file(self, file_path, extract_to=None):
        """
        Extract a compressed file (.zip or .tar.gz).

        :param file_path: Path to the compressed file.
        :param extract_to: Directory to extract to (default: same as file location).
        :return: Path to the extracted directory.
        :raises ValueError: If the file format is not supported.
        """
        extract_to = extract_to or os.path.splitext(file_path)[0]
        os.makedirs(extract_to, exist_ok=True)

        if file_path.endswith(".zip"):
            with zipfile.ZipFile(file_path, "r") as zip_ref:
                zip_ref.extractall(extract_to)
        elif file_path.endswith(".tar.gz"):
            with tarfile.open(file_path, "r:gz") as tar_ref:
                tar_ref.extractall(extract_to)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")

        print(f"File extracted to '{extract_to}'.")
        return extract_to
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
import os
|
|
3
|
+
import numpy as np
|
|
4
|
+
from chelo.base import CheLoDataset
|
|
5
|
+
from chelo.utils.downloader import DatasetDownloader
|
|
6
|
+
|
|
7
|
+
# Mock Base Class Implementation for Testing
|
|
8
|
+
class MockDataset(CheLoDataset):
    """Minimal in-memory CheLoDataset used to exercise the base-class API."""

    def __init__(self, selected_features=None, selected_targets=None):
        super().__init__(selected_features, selected_targets)
        self.dataset_name = "Mock Dataset"

    def load_data(self):
        # Fixed toy data: three samples, two features, one target.
        self.raw_features = {
            "Temperature": [300, 310, 320],
            "Pressure": [101.3, 98.6, 102.5],
        }
        self.raw_targets = {"Reaction Rate": [1.2, 1.5, 1.3]}
        self._apply_initial_selections()

    def list_features(self):
        return [*self.raw_features]

    def list_targets(self):
        return [*self.raw_targets]

    def get_dataset_info(self):
        return {
            "name": self.dataset_name,
            "description": "A mock dataset for testing.",
            "size": self.size(),
        }
|
|
33
|
+
|
|
34
|
+
# Test Cases for Base Class
|
|
35
|
+
def test_mock_dataset():
    """End-to-end check of selection, metadata, and numpy conversion."""
    dataset = MockDataset(selected_features=["Temperature"], selected_targets=["Reaction Rate"])
    dataset.load_data()

    # Metadata reflects the full raw schema, not the narrowed selection.
    info = dataset.get_dataset_info()
    assert info["name"] == "Mock Dataset"
    assert dataset.list_features() == ["Temperature", "Pressure"]
    assert dataset.list_targets() == ["Reaction Rate"]

    # Only the selected columns survive into the numpy view.
    features, targets = dataset.to_numpy()
    assert features.shape == (3, 1)
    assert targets.shape == (3, 1)
|
|
48
|
+
|
|
49
|
+
# Allow running this test module directly (outside a pytest invocation).
if __name__ == "__main__":
    pytest.main()
|