chelo 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
chelo-0.0.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Nikolaos Passalis
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,4 @@
1
+ include README.md
2
+ include LICENSE
3
+ include requirements.txt
4
+ recursive-include src *
chelo-0.0.1/PKG-INFO ADDED
@@ -0,0 +1,249 @@
1
+ Metadata-Version: 2.2
2
+ Name: chelo
3
+ Version: 0.0.1
4
+ Summary: Chemical Engineering Dataset Loader Library
5
+ Author-email: Nikolaos Passalis <passalis@auth.gr>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2025 Nikolaos Passalis
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Classifier: Programming Language :: Python :: 3
29
+ Classifier: License :: OSI Approved :: MIT License
30
+ Classifier: Operating System :: OS Independent
31
+ Description-Content-Type: text/markdown
32
+ License-File: LICENSE
33
+ Requires-Dist: numpy
34
+ Requires-Dist: pandas
35
+ Requires-Dist: tqdm
36
+ Requires-Dist: requests
37
+
38
+ # CheLo (Chemical Engineering Dataset Loader) Library
39
+
40
+ ## Overview
41
+ The CheLo Library is a Python library designed to simplify the exploration of data-driven modeling for students studying chemical engineering and researchers working on related projects.
42
+ This library provides a standardized framework for accessing, managing, and utilizing chemical engineering datasets for machine learning and statistical analysis.
43
+
44
+ ## Key Features
45
+ - **Dataset Standardization**: Unified API for accessing and exploring datasets.
46
+ - **Multiple Data Formats**: Provides ready-to-use loaders for numpy, PyTorch, and Keras.
47
+ - **Preprocessing Tools**: Methods for feature/target selection, statistics, and previewing datasets.
48
+ - **Dataset Management**: Automated downloading, caching, and registry of datasets.
49
+ - **Extensibility**: Abstract base class for easy addition of new datasets.
50
+
51
+ ## Installation
52
+
53
+ To install the library, run the following command:
54
+
55
+ ```bash
56
+ pip install chelo
57
+ ```
58
+
59
+ To install the library in editable mode for development purposes:
60
+
61
+ ```bash
62
+ git clone https://github.com/your-repo/chelo.git
63
+ cd chelo
64
+ pip install -e .
65
+ ```
66
+
67
+ ## Package Structure
68
+
69
+ ```plaintext
70
+ chelo/ # Root package
71
+ ├── __init__.py # Exposes core components
72
+ ├── base.py # Abstract base class and shared utilities
73
+ ├── datasets/ # Dataset-specific implementations
74
+ │ └── ... # Dataset implementations
75
+ ├── utils/ # Utility functions and helpers
76
+ │ ├── __init__.py # Utility imports
77
+ │ └── downloader.py # Dataset downloader and caching
78
+ ├── registry.py # Dataset registry
79
+ └── tests/ # Unit and integration tests
80
+ ├── __init__.py # Makes this directory a package
81
+ ├── test_base.py # Tests for the base class
82
+ └── test_X.py # Tests for X dataset
83
+ ```
84
+
85
+ ## Usage Guide
86
+
87
+ ### Loading a Dataset
88
+
89
+ ```python
90
+ from chelo.datasets.wine_quality import WineQualityDataset
91
+
92
+ # Instantiate the dataset
93
+ dataset = WineQualityDataset(wine_type="red", selected_features=["alcohol", "pH"], selected_targets=["quality"])
94
+
95
+ # Load data (downloads if not cached)
96
+ dataset.load_data()
97
+
98
+ # Access dataset information
99
+ info = dataset.get_dataset_info()
100
+ print("Dataset Info:", info)
101
+ ```
102
+
103
+ ### Accessing Data
104
+
105
+ ```python
106
+ # Convert to numpy arrays
107
+ features, targets = dataset.to_numpy()
108
+ print("Features shape:", features.shape)
109
+ print("Targets shape:", targets.shape)
110
+
111
+ # Convert to PyTorch Dataset
112
+ pytorch_dataset = dataset.to_pytorch()
113
+ print("Number of samples in PyTorch Dataset:", len(pytorch_dataset))
114
+
115
+ # Convert to Keras Sequence
116
+ keras_sequence = dataset.to_keras(batch_size=32)
117
+ for batch_features, batch_targets in keras_sequence:
118
+ print("Batch Features:", batch_features.shape)
119
+ print("Batch Targets:", batch_targets.shape)
120
+ ```
121
+
122
+ ### Dataset Statistics and Preview
123
+
124
+ ```python
125
+ # Get basic statistics
126
+ stats = dataset.statistics()
127
+ print("Statistics:", stats)
128
+
129
+ # Preview the dataset
130
+ preview = dataset.preview(n=5)
131
+ print("Preview:", preview)
132
+ ```
133
+
134
+ ### Registering and Accessing Datasets
135
+
136
+ ```python
137
+ from chelo.registry import DatasetRegistry
138
+
139
+ # List available datasets
140
+ print("Available Datasets:", DatasetRegistry.list_datasets())
141
+
142
+ # Retrieve and load a dataset by name
143
+ dataset = DatasetRegistry.get_dataset("WineQualityDataset", wine_type="red")
144
+ dataset.load_data()
145
+ ```
146
+
147
+ ## Extending the Library
148
+
149
+ To add a new dataset, create a new class that inherits from `CheLoDataset` and implement the required methods:
150
+
151
+ 1. **Create a new dataset module:**
152
+
153
+ ```plaintext
154
+ chelo/datasets/my_new_dataset.py
155
+ ```
156
+
157
+ 2. **Implement the dataset class:**
158
+
159
+ ```python
160
+ from ..base import CheLoDataset
161
+
162
+ class MyNewDataset(CheLoDataset):
163
+ def __init__(self, selected_features=None, selected_targets=None):
164
+ super().__init__(selected_features, selected_targets)
165
+ self.dataset_name = "My New Dataset"
166
+
167
+ def load_data(self):
168
+ # Load dataset into self.raw_features and self.raw_targets
169
+ pass
170
+
171
+ def list_features(self):
172
+ return list(self.raw_features.keys())
173
+
174
+ def list_targets(self):
175
+ return list(self.raw_targets.keys())
176
+
177
+ def get_dataset_info(self):
178
+ return {"name": self.dataset_name, "description": "Description of the dataset."}
179
+ ```
180
+
181
+ 3. **Register the dataset:**
182
+
183
+ Add the following line to `chelo/datasets/__init__.py`:
184
+
185
+ ```python
186
+ from .my_new_dataset import MyNewDataset
187
+ DatasetRegistry.register(MyNewDataset)
188
+ ```
189
+
190
+ ## Advanced Features
191
+
192
+ ### Downloader Utility
193
+
194
+ The library includes a downloader utility for downloading and caching datasets. Files are stored in a structured cache directory (default: `~/.chelo`).
195
+
196
+ #### Example Usage
197
+
198
+ ```python
199
+ from chelo.utils.downloader import DatasetDownloader
200
+
201
+ downloader = DatasetDownloader()
202
+
203
+ # Download a dataset file
204
+ url = "https://example.com/dataset.csv"
205
+ file_path = downloader.download(url, dataset_name="example_dataset", filename="example.csv")
206
+ print("Downloaded file path:", file_path)
207
+ ```
208
+
209
+ ### Dataset Registry
210
+
211
+ The registry dynamically manages available datasets, allowing users to list and retrieve datasets by name.
212
+
213
+ #### Example Usage
214
+
215
+ ```python
216
+ from chelo.registry import DatasetRegistry
217
+
218
+ # List all registered datasets
219
+ print("Available Datasets:", DatasetRegistry.list_datasets())
220
+
221
+ # Retrieve a dataset by name
222
+ dataset = DatasetRegistry.get_dataset("WineQualityDataset", wine_type="white")
223
+ ```
224
+
225
+ ## Testing
226
+
227
+ The library includes comprehensive unit tests to ensure correctness and reliability. Run tests using `pytest`:
228
+
229
+ ```bash
230
+ pytest tests/
231
+ ```
232
+
233
+ ## Contributing
234
+
235
+ Contributions are welcome! To contribute:
236
+
237
+ 1. Fork the repository.
238
+ 2. Create a feature branch.
239
+ 3. Implement your changes and add tests.
240
+ 4. Submit a pull request with a detailed description of your changes.
241
+
242
+ ## License
243
+
244
+ This library is licensed under the MIT License. See the `LICENSE` file for more details.
245
+
246
+ ## Contact
247
+
248
+ For questions or feedback, please contact Nikolaos Passalis (passalis@auth.gr).
249
+
chelo-0.0.1/README.md ADDED
@@ -0,0 +1,212 @@
1
+ # CheLo (Chemical Engineering Dataset Loader) Library
2
+
3
+ ## Overview
4
+ The CheLo Library is a Python library designed to simplify the exploration of data-driven modeling for students studying chemical engineering and researchers working on related projects.
5
+ This library provides a standardized framework for accessing, managing, and utilizing chemical engineering datasets for machine learning and statistical analysis.
6
+
7
+ ## Key Features
8
+ - **Dataset Standardization**: Unified API for accessing and exploring datasets.
9
+ - **Multiple Data Formats**: Provides ready-to-use loaders for numpy, PyTorch, and Keras.
10
+ - **Preprocessing Tools**: Methods for feature/target selection, statistics, and previewing datasets.
11
+ - **Dataset Management**: Automated downloading, caching, and registry of datasets.
12
+ - **Extensibility**: Abstract base class for easy addition of new datasets.
13
+
14
+ ## Installation
15
+
16
+ To install the library, run the following command:
17
+
18
+ ```bash
19
+ pip install chelo
20
+ ```
21
+
22
+ To install the library in editable mode for development purposes:
23
+
24
+ ```bash
25
+ git clone https://github.com/your-repo/chelo.git
26
+ cd chelo
27
+ pip install -e .
28
+ ```
29
+
30
+ ## Package Structure
31
+
32
+ ```plaintext
33
+ chelo/ # Root package
34
+ ├── __init__.py # Exposes core components
35
+ ├── base.py # Abstract base class and shared utilities
36
+ ├── datasets/ # Dataset-specific implementations
37
+ │ └── ... # Dataset implementations
38
+ ├── utils/ # Utility functions and helpers
39
+ │ ├── __init__.py # Utility imports
40
+ │ └── downloader.py # Dataset downloader and caching
41
+ ├── registry.py # Dataset registry
42
+ └── tests/ # Unit and integration tests
43
+ ├── __init__.py # Makes this directory a package
44
+ ├── test_base.py # Tests for the base class
45
+ └── test_X.py # Tests for X dataset
46
+ ```
47
+
48
+ ## Usage Guide
49
+
50
+ ### Loading a Dataset
51
+
52
+ ```python
53
+ from chelo.datasets.wine_quality import WineQualityDataset
54
+
55
+ # Instantiate the dataset
56
+ dataset = WineQualityDataset(wine_type="red", selected_features=["alcohol", "pH"], selected_targets=["quality"])
57
+
58
+ # Load data (downloads if not cached)
59
+ dataset.load_data()
60
+
61
+ # Access dataset information
62
+ info = dataset.get_dataset_info()
63
+ print("Dataset Info:", info)
64
+ ```
65
+
66
+ ### Accessing Data
67
+
68
+ ```python
69
+ # Convert to numpy arrays
70
+ features, targets = dataset.to_numpy()
71
+ print("Features shape:", features.shape)
72
+ print("Targets shape:", targets.shape)
73
+
74
+ # Convert to PyTorch Dataset
75
+ pytorch_dataset = dataset.to_pytorch()
76
+ print("Number of samples in PyTorch Dataset:", len(pytorch_dataset))
77
+
78
+ # Convert to Keras Sequence
79
+ keras_sequence = dataset.to_keras(batch_size=32)
80
+ for batch_features, batch_targets in keras_sequence:
81
+ print("Batch Features:", batch_features.shape)
82
+ print("Batch Targets:", batch_targets.shape)
83
+ ```
84
+
85
+ ### Dataset Statistics and Preview
86
+
87
+ ```python
88
+ # Get basic statistics
89
+ stats = dataset.statistics()
90
+ print("Statistics:", stats)
91
+
92
+ # Preview the dataset
93
+ preview = dataset.preview(n=5)
94
+ print("Preview:", preview)
95
+ ```
96
+
97
+ ### Registering and Accessing Datasets
98
+
99
+ ```python
100
+ from chelo.registry import DatasetRegistry
101
+
102
+ # List available datasets
103
+ print("Available Datasets:", DatasetRegistry.list_datasets())
104
+
105
+ # Retrieve and load a dataset by name
106
+ dataset = DatasetRegistry.get_dataset("WineQualityDataset", wine_type="red")
107
+ dataset.load_data()
108
+ ```
109
+
110
+ ## Extending the Library
111
+
112
+ To add a new dataset, create a new class that inherits from `CheLoDataset` and implement the required methods:
113
+
114
+ 1. **Create a new dataset module:**
115
+
116
+ ```plaintext
117
+ chelo/datasets/my_new_dataset.py
118
+ ```
119
+
120
+ 2. **Implement the dataset class:**
121
+
122
+ ```python
123
+ from ..base import CheLoDataset
124
+
125
+ class MyNewDataset(CheLoDataset):
126
+ def __init__(self, selected_features=None, selected_targets=None):
127
+ super().__init__(selected_features, selected_targets)
128
+ self.dataset_name = "My New Dataset"
129
+
130
+ def load_data(self):
131
+ # Load dataset into self.raw_features and self.raw_targets
132
+ pass
133
+
134
+ def list_features(self):
135
+ return list(self.raw_features.keys())
136
+
137
+ def list_targets(self):
138
+ return list(self.raw_targets.keys())
139
+
140
+ def get_dataset_info(self):
141
+ return {"name": self.dataset_name, "description": "Description of the dataset."}
142
+ ```
143
+
144
+ 3. **Register the dataset:**
145
+
146
+ Add the following line to `chelo/datasets/__init__.py`:
147
+
148
+ ```python
149
+ from .my_new_dataset import MyNewDataset
150
+ DatasetRegistry.register(MyNewDataset)
151
+ ```
152
+
153
+ ## Advanced Features
154
+
155
+ ### Downloader Utility
156
+
157
+ The library includes a downloader utility for downloading and caching datasets. Files are stored in a structured cache directory (default: `~/.chelo`).
158
+
159
+ #### Example Usage
160
+
161
+ ```python
162
+ from chelo.utils.downloader import DatasetDownloader
163
+
164
+ downloader = DatasetDownloader()
165
+
166
+ # Download a dataset file
167
+ url = "https://example.com/dataset.csv"
168
+ file_path = downloader.download(url, dataset_name="example_dataset", filename="example.csv")
169
+ print("Downloaded file path:", file_path)
170
+ ```
171
+
172
+ ### Dataset Registry
173
+
174
+ The registry dynamically manages available datasets, allowing users to list and retrieve datasets by name.
175
+
176
+ #### Example Usage
177
+
178
+ ```python
179
+ from chelo.registry import DatasetRegistry
180
+
181
+ # List all registered datasets
182
+ print("Available Datasets:", DatasetRegistry.list_datasets())
183
+
184
+ # Retrieve a dataset by name
185
+ dataset = DatasetRegistry.get_dataset("WineQualityDataset", wine_type="white")
186
+ ```
187
+
188
+ ## Testing
189
+
190
+ The library includes comprehensive unit tests to ensure correctness and reliability. Run tests using `pytest`:
191
+
192
+ ```bash
193
+ pytest tests/
194
+ ```
195
+
196
+ ## Contributing
197
+
198
+ Contributions are welcome! To contribute:
199
+
200
+ 1. Fork the repository.
201
+ 2. Create a feature branch.
202
+ 3. Implement your changes and add tests.
203
+ 4. Submit a pull request with a detailed description of your changes.
204
+
205
+ ## License
206
+
207
+ This library is licensed under the MIT License. See the `LICENSE` file for more details.
208
+
209
+ ## Contact
210
+
211
+ For questions or feedback, please contact Nikolaos Passalis (passalis@auth.gr).
212
+
@@ -0,0 +1,249 @@
1
+ Metadata-Version: 2.2
2
+ Name: chelo
3
+ Version: 0.0.1
4
+ Summary: Chemical Engineering Dataset Loader Library
5
+ Author-email: Nikolaos Passalis <passalis@auth.gr>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2025 Nikolaos Passalis
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Classifier: Programming Language :: Python :: 3
29
+ Classifier: License :: OSI Approved :: MIT License
30
+ Classifier: Operating System :: OS Independent
31
+ Description-Content-Type: text/markdown
32
+ License-File: LICENSE
33
+ Requires-Dist: numpy
34
+ Requires-Dist: pandas
35
+ Requires-Dist: tqdm
36
+ Requires-Dist: requests
37
+
38
+ # CheLo (Chemical Engineering Dataset Loader) Library
39
+
40
+ ## Overview
41
+ The CheLo Library is a Python library designed to simplify the exploration of data-driven modeling for students studying chemical engineering and researchers working on related projects.
42
+ This library provides a standardized framework for accessing, managing, and utilizing chemical engineering datasets for machine learning and statistical analysis.
43
+
44
+ ## Key Features
45
+ - **Dataset Standardization**: Unified API for accessing and exploring datasets.
46
+ - **Multiple Data Formats**: Provides ready-to-use loaders for numpy, PyTorch, and Keras.
47
+ - **Preprocessing Tools**: Methods for feature/target selection, statistics, and previewing datasets.
48
+ - **Dataset Management**: Automated downloading, caching, and registry of datasets.
49
+ - **Extensibility**: Abstract base class for easy addition of new datasets.
50
+
51
+ ## Installation
52
+
53
+ To install the library, run the following command:
54
+
55
+ ```bash
56
+ pip install chelo
57
+ ```
58
+
59
+ To install the library in editable mode for development purposes:
60
+
61
+ ```bash
62
+ git clone https://github.com/your-repo/chelo.git
63
+ cd chelo
64
+ pip install -e .
65
+ ```
66
+
67
+ ## Package Structure
68
+
69
+ ```plaintext
70
+ chelo/ # Root package
71
+ ├── __init__.py # Exposes core components
72
+ ├── base.py # Abstract base class and shared utilities
73
+ ├── datasets/ # Dataset-specific implementations
74
+ │ └── ... # Dataset implementations
75
+ ├── utils/ # Utility functions and helpers
76
+ │ ├── __init__.py # Utility imports
77
+ │ └── downloader.py # Dataset downloader and caching
78
+ ├── registry.py # Dataset registry
79
+ └── tests/ # Unit and integration tests
80
+ ├── __init__.py # Makes this directory a package
81
+ ├── test_base.py # Tests for the base class
82
+ └── test_X.py # Tests for X dataset
83
+ ```
84
+
85
+ ## Usage Guide
86
+
87
+ ### Loading a Dataset
88
+
89
+ ```python
90
+ from chelo.datasets.wine_quality import WineQualityDataset
91
+
92
+ # Instantiate the dataset
93
+ dataset = WineQualityDataset(wine_type="red", selected_features=["alcohol", "pH"], selected_targets=["quality"])
94
+
95
+ # Load data (downloads if not cached)
96
+ dataset.load_data()
97
+
98
+ # Access dataset information
99
+ info = dataset.get_dataset_info()
100
+ print("Dataset Info:", info)
101
+ ```
102
+
103
+ ### Accessing Data
104
+
105
+ ```python
106
+ # Convert to numpy arrays
107
+ features, targets = dataset.to_numpy()
108
+ print("Features shape:", features.shape)
109
+ print("Targets shape:", targets.shape)
110
+
111
+ # Convert to PyTorch Dataset
112
+ pytorch_dataset = dataset.to_pytorch()
113
+ print("Number of samples in PyTorch Dataset:", len(pytorch_dataset))
114
+
115
+ # Convert to Keras Sequence
116
+ keras_sequence = dataset.to_keras(batch_size=32)
117
+ for batch_features, batch_targets in keras_sequence:
118
+ print("Batch Features:", batch_features.shape)
119
+ print("Batch Targets:", batch_targets.shape)
120
+ ```
121
+
122
+ ### Dataset Statistics and Preview
123
+
124
+ ```python
125
+ # Get basic statistics
126
+ stats = dataset.statistics()
127
+ print("Statistics:", stats)
128
+
129
+ # Preview the dataset
130
+ preview = dataset.preview(n=5)
131
+ print("Preview:", preview)
132
+ ```
133
+
134
+ ### Registering and Accessing Datasets
135
+
136
+ ```python
137
+ from chelo.registry import DatasetRegistry
138
+
139
+ # List available datasets
140
+ print("Available Datasets:", DatasetRegistry.list_datasets())
141
+
142
+ # Retrieve and load a dataset by name
143
+ dataset = DatasetRegistry.get_dataset("WineQualityDataset", wine_type="red")
144
+ dataset.load_data()
145
+ ```
146
+
147
+ ## Extending the Library
148
+
149
+ To add a new dataset, create a new class that inherits from `CheLoDataset` and implement the required methods:
150
+
151
+ 1. **Create a new dataset module:**
152
+
153
+ ```plaintext
154
+ chelo/datasets/my_new_dataset.py
155
+ ```
156
+
157
+ 2. **Implement the dataset class:**
158
+
159
+ ```python
160
+ from ..base import CheLoDataset
161
+
162
+ class MyNewDataset(CheLoDataset):
163
+ def __init__(self, selected_features=None, selected_targets=None):
164
+ super().__init__(selected_features, selected_targets)
165
+ self.dataset_name = "My New Dataset"
166
+
167
+ def load_data(self):
168
+ # Load dataset into self.raw_features and self.raw_targets
169
+ pass
170
+
171
+ def list_features(self):
172
+ return list(self.raw_features.keys())
173
+
174
+ def list_targets(self):
175
+ return list(self.raw_targets.keys())
176
+
177
+ def get_dataset_info(self):
178
+ return {"name": self.dataset_name, "description": "Description of the dataset."}
179
+ ```
180
+
181
+ 3. **Register the dataset:**
182
+
183
+ Add the following line to `chelo/datasets/__init__.py`:
184
+
185
+ ```python
186
+ from .my_new_dataset import MyNewDataset
187
+ DatasetRegistry.register(MyNewDataset)
188
+ ```
189
+
190
+ ## Advanced Features
191
+
192
+ ### Downloader Utility
193
+
194
+ The library includes a downloader utility for downloading and caching datasets. Files are stored in a structured cache directory (default: `~/.chelo`).
195
+
196
+ #### Example Usage
197
+
198
+ ```python
199
+ from chelo.utils.downloader import DatasetDownloader
200
+
201
+ downloader = DatasetDownloader()
202
+
203
+ # Download a dataset file
204
+ url = "https://example.com/dataset.csv"
205
+ file_path = downloader.download(url, dataset_name="example_dataset", filename="example.csv")
206
+ print("Downloaded file path:", file_path)
207
+ ```
208
+
209
+ ### Dataset Registry
210
+
211
+ The registry dynamically manages available datasets, allowing users to list and retrieve datasets by name.
212
+
213
+ #### Example Usage
214
+
215
+ ```python
216
+ from chelo.registry import DatasetRegistry
217
+
218
+ # List all registered datasets
219
+ print("Available Datasets:", DatasetRegistry.list_datasets())
220
+
221
+ # Retrieve a dataset by name
222
+ dataset = DatasetRegistry.get_dataset("WineQualityDataset", wine_type="white")
223
+ ```
224
+
225
+ ## Testing
226
+
227
+ The library includes comprehensive unit tests to ensure correctness and reliability. Run tests using `pytest`:
228
+
229
+ ```bash
230
+ pytest tests/
231
+ ```
232
+
233
+ ## Contributing
234
+
235
+ Contributions are welcome! To contribute:
236
+
237
+ 1. Fork the repository.
238
+ 2. Create a feature branch.
239
+ 3. Implement your changes and add tests.
240
+ 4. Submit a pull request with a detailed description of your changes.
241
+
242
+ ## License
243
+
244
+ This library is licensed under the MIT License. See the `LICENSE` file for more details.
245
+
246
+ ## Contact
247
+
248
+ For questions or feedback, please contact Nikolaos Passalis (passalis@auth.gr).
249
+
@@ -0,0 +1,17 @@
1
+ LICENSE
2
+ MANIFEST.in
3
+ README.md
4
+ pyproject.toml
5
+ chelo.egg-info/PKG-INFO
6
+ chelo.egg-info/SOURCES.txt
7
+ chelo.egg-info/dependency_links.txt
8
+ chelo.egg-info/requires.txt
9
+ chelo.egg-info/top_level.txt
10
+ src/__init__.py
11
+ src/base.py
12
+ src/registry.py
13
+ src/datasets/__init__.py
14
+ src/datasets/wine_quality.py
15
+ src/utils/__init__.py
16
+ src/utils/downloader.py
17
+ tests/test_base.py
@@ -0,0 +1,4 @@
1
+ numpy
2
+ pandas
3
+ tqdm
4
+ requests
@@ -0,0 +1 @@
1
+ chelo
@@ -0,0 +1,27 @@
1
+ [build-system]
2
+ requires = ["setuptools", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [tool.setuptools]
6
+ package-dir = {"chelo" = "src"}
7
+
8
+ [project]
9
+ name = "chelo"
10
+ version = "0.0.1"
11
+ description = "Chemical Engineering Dataset Loader Library"
12
+ readme = "README.md"
13
+ license = {file = "LICENSE"}
14
+ authors = [
15
+ { name = "Nikolaos Passalis", email = "passalis@auth.gr" }
16
+ ]
17
+ dependencies = [
18
+ "numpy",
19
+ "pandas",
20
+ "tqdm",
21
+ "requests",
22
+ ]
23
+ classifiers = [
24
+ "Programming Language :: Python :: 3",
25
+ "License :: OSI Approved :: MIT License",
26
+ "Operating System :: OS Independent",
27
+ ]
chelo-0.0.1/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,166 @@
1
+ from abc import ABC, abstractmethod
2
+ import numpy as np
3
+
4
+
5
+
6
+
7
class CheLoDataset(ABC):
    """
    Abstract base class for CheLo datasets.

    A dataset holds two ``name -> values`` mappings populated by
    :meth:`load_data` (``raw_features`` and ``raw_targets``) and two working
    mappings (``features`` and ``targets``) that every accessor below reads.
    The working mappings are filled by :meth:`select_features`,
    :meth:`select_targets` or :meth:`_apply_initial_selections`.
    """

    def __init__(self, selected_features=None, selected_targets=None):
        """
        Initialize the dataset with optional selected features and targets.

        No data is loaded here; every data attribute starts as ``None``.

        :param selected_features: List of features to select (default: all).
        :param selected_targets: List of targets to select (default: all).
        """
        self.raw_features = None  # Complete feature mapping; set by load_data()
        self.raw_targets = None  # Complete target mapping; set by load_data()
        # Working mappings. When no selection is requested they are the very
        # same objects as the raw mappings (no copy is made).
        self.features = None
        self.targets = None
        self.dataset_name = None  # Name of the dataset

        # Kept until _apply_initial_selections() is called.
        self._selected_features = selected_features
        self._selected_targets = selected_targets

    @abstractmethod
    def load_data(self):
        """
        Load the dataset and populate self.raw_features and self.raw_targets.
        """

    @abstractmethod
    def list_features(self):
        """
        List available features in the dataset.
        """

    @abstractmethod
    def list_targets(self):
        """
        List available targets in the dataset.
        """

    @abstractmethod
    def get_dataset_info(self):
        """
        Provide metadata about the dataset (e.g., source, size, description).
        """

    @staticmethod
    def _is_empty(data):
        """
        Return True when ``data`` is None or has zero length.

        An explicit length check is used instead of truthiness because the
        truth value of array-like containers can be ambiguous.
        """
        if data is None:
            return True
        try:
            return len(data) == 0
        except TypeError:
            # Objects without a length are treated as non-empty.
            return False

    @staticmethod
    def _as_matrix(columns):
        """
        Stack the values of a ``name -> values`` mapping as array columns.
        """
        return np.array(list(columns.values())).T

    def _not_loaded_error(self):
        """
        Build the error raised whenever data is accessed before loading.
        """
        return ValueError(f"Dataset {self.dataset_name} not loaded yet!")

    def _require_loaded(self, need_targets=True):
        """
        Raise ValueError if the working mappings have not been populated.

        :param need_targets: Also require ``self.targets`` (default: True).
        :raises ValueError: If the required mappings are still ``None``.
        """
        if self.features is None or (need_targets and self.targets is None):
            raise self._not_loaded_error()

    def _subset(self, source, names, kind):
        """
        Build a ``{name: source[name]}`` dict for the requested names.

        :param source: Mapping to pick entries from.
        :param names: Iterable of names to pick.
        :param kind: Word used in the error message ("feature" or "target").
        :raises KeyError: If any name is missing; all missing names are listed.
        """
        picked = {}
        unknown = []
        for name in names:
            try:
                picked[name] = source[name]
            except KeyError:
                unknown.append(name)
        if unknown:
            raise KeyError(
                f"Unknown {kind}(s) {unknown} for dataset {self.dataset_name}; "
                f"available: {list(source)}"
            )
        return picked

    def select_features(self, feature_names):
        """
        Dynamically select features from the dataset.

        ``self.features`` is left untouched when the selection fails.

        :param feature_names: List of feature names to select.
        :raises ValueError: If the raw features are missing or empty.
        :raises KeyError: If a requested feature does not exist.
        """
        if self._is_empty(self.raw_features):
            raise self._not_loaded_error()
        self.features = self._subset(self.raw_features, feature_names, "feature")

    def select_targets(self, target_names):
        """
        Dynamically select targets from the dataset.

        ``self.targets`` is left untouched when the selection fails.

        :param target_names: List of target names to select.
        :raises ValueError: If the raw targets are missing or empty.
        :raises KeyError: If a requested target does not exist.
        """
        if self._is_empty(self.raw_targets):
            raise self._not_loaded_error()
        self.targets = self._subset(self.raw_targets, target_names, "target")

    def _apply_initial_selections(self):
        """
        Apply initial selections if specified during initialization.

        Without a selection (``None`` or an empty list) the working mapping
        simply references the raw mapping.
        """
        if self._selected_features:
            self.select_features(self._selected_features)
        else:
            self.features = self.raw_features

        if self._selected_targets:
            self.select_targets(self._selected_targets)
        else:
            self.targets = self.raw_targets

    def size(self):
        """
        Get the size of the dataset (number of samples).

        The length of the first selected feature is used.

        :return: Number of samples, or 0 when no feature is selected.
        :raises ValueError: If the features have not been populated yet.
        """
        self._require_loaded(need_targets=False)
        first_column = next(iter(self.features.values()), None)
        return 0 if first_column is None else len(first_column)

    def statistics(self):
        """
        Compute basic statistics for the features and targets.

        If a feature and a target share a name, the target's entry wins.

        :return: A dictionary of statistics (mean, std, min, max) for each feature and target.
        :raises ValueError: If the dataset has not been populated yet.
        """
        self._require_loaded()
        return {
            key: {
                "mean": np.mean(values),
                "std": np.std(values),
                "min": np.min(values),
                "max": np.max(values),
            }
            for key, values in {**self.features, **self.targets}.items()
        }

    def to_numpy(self):
        """
        Convert the dataset to numpy arrays.

        :return: Tuple of (features, targets) in numpy format, one column per
            selected name.
        :raises ValueError: If the dataset has not been populated yet.
        """
        self._require_loaded()
        return self._as_matrix(self.features), self._as_matrix(self.targets)

    def to_pytorch(self):
        """
        Provide a PyTorch Dataset object.

        ``torch`` is imported lazily so it is only needed when this method is
        called. Items are rows of the numpy arrays from :meth:`to_numpy`.

        :return: A PyTorch Dataset containing features and targets.
        :raises ValueError: If the dataset has not been populated yet.
        """
        feature_matrix, target_matrix = self.to_numpy()

        from torch.utils.data import Dataset

        class PyTorchDataset(Dataset):
            def __init__(self, features, targets):
                self.features = features
                self.targets = targets

            def __len__(self):
                return len(self.features)

            def __getitem__(self, idx):
                return self.features[idx], self.targets[idx]

        return PyTorchDataset(feature_matrix, target_matrix)

    def to_keras(self, batch_size=32):
        """
        Provide a Keras Sequence object for training.

        ``tensorflow`` is imported lazily so it is only needed when this
        method is called.

        :param batch_size: Number of samples per batch (must be >= 1).
        :return: A Keras Sequence containing features and targets.
        :raises ValueError: If the dataset has not been populated yet or
            ``batch_size`` is smaller than 1.
        """
        self._require_loaded()
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        from tensorflow.keras.utils import Sequence

        class KerasSequence(Sequence):
            def __init__(self, features, targets, batch_size):
                # Let the Keras base class initialise its own state.
                super().__init__()
                self.features = np.array(list(features.values())).T
                self.targets = np.array(list(targets.values())).T
                self.batch_size = batch_size

            def __len__(self):
                # The last batch may be smaller than batch_size.
                return int(np.ceil(len(self.features) / self.batch_size))

            def __getitem__(self, idx):
                start_idx = idx * self.batch_size
                end_idx = start_idx + self.batch_size
                return self.features[start_idx:end_idx], self.targets[start_idx:end_idx]

        return KerasSequence(self.features, self.targets, batch_size)

    def preview(self, n=5):
        """
        Preview the first n rows of the dataset.

        :param n: Number of leading values to take from every column.
        :return: Dict with "features" and "targets" keys, each mapping a name
            to its first ``n`` values.
        :raises ValueError: If the dataset has not been populated yet.
        """
        self._require_loaded()
        return {
            "features": {key: values[:n] for key, values in self.features.items()},
            "targets": {key: values[:n] for key, values in self.targets.items()},
        }
@@ -0,0 +1,17 @@
1
+ # import pkgutil
2
+ # import importlib
3
+ # from ..registry import DatasetRegistry
4
+ # from ..base import CheLoDataset
5
+
6
+ # def auto_register_datasets():
7
+ # """
8
+ # Automatically discover and register datasets in the `datasets` package.
9
+ # """
10
+ # for _, module_name, _ in pkgutil.iter_modules(__path__):
11
+ # module = importlib.import_module(f"{__name__}.{module_name}")
12
+ # for attr_name in dir(module):
13
+ # attr = getattr(module, attr_name)
14
+ # if isinstance(attr, type) and issubclass(attr, CheLoDataset) and attr is not CheLoDataset:
15
+ # DatasetRegistry.register(attr)
16
+ #
17
+ # auto_register_datasets()
@@ -0,0 +1,71 @@
1
+ from ..base import CheLoDataset
2
+ from ..registry import register_dataset
3
+ from ..utils.downloader import DatasetDownloader
4
+ import pandas as pd
5
+
6
@register_dataset
class WineQualityDataset(CheLoDataset):
    """
    Wine Quality dataset (red or white variant) fetched from the UCI repository.
    """

    BASE_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/"
    # CSV file name for each supported wine type.
    FILES = {"red": "winequality-red.csv", "white": "winequality-white.csv"}
    # Expected hex digest of each CSV, passed to the downloader for validation.
    CHECKSUMS = {
        "red": "2daeecee174368f8a33b82c8cccae3a5",
        "white": "5d9ff0f7f716dace19e3ab4578775fd7",
    }

    def __init__(self, wine_type="red", selected_features=None, selected_targets=None):
        """
        Create the dataset wrapper; no data is loaded until load_data() runs.
        :param wine_type: Which variant to use ('red' or 'white').
        :param selected_features: Features to select (default: all).
        :param selected_targets: Targets to select (default: all).
        :raises ValueError: If wine_type is not a key of FILES.
        """
        super().__init__(selected_features, selected_targets)
        known_type = wine_type in self.FILES
        if not known_type:
            raise ValueError(f"Invalid wine_type '{wine_type}'. Must be 'red' or 'white'.")
        self.wine_type = wine_type
        self.dataset_name = "Wine Quality ({})".format(wine_type.capitalize())

    def load_data(self):
        """
        Fetch the CSV (via the downloader, which may reuse a cached copy) and
        populate raw_features / raw_targets from it.
        """
        downloader = DatasetDownloader()
        csv_name = self.FILES[self.wine_type]
        local_path = downloader.download(
            self.BASE_URL + csv_name,
            dataset_name="wine_quality",
            filename=csv_name,
            checksum=self.CHECKSUMS[self.wine_type],
        )

        # The file is read with ';' as the column separator.
        frame = pd.read_csv(local_path, sep=";")
        # Every column except "quality" is a feature; "quality" is the sole target.
        self.raw_features = frame.drop(columns=["quality"]).to_dict(orient="list")
        self.raw_targets = {"quality": frame["quality"].tolist()}
        self._apply_initial_selections()

    def list_features(self):
        """
        Names of all feature columns read by load_data().
        :return: List of feature names.
        """
        return list(self.raw_features)

    def list_targets(self):
        """
        Names of all target columns read by load_data().
        :return: List of target names.
        """
        return list(self.raw_targets)

    def get_dataset_info(self):
        """
        Summarise the dataset.
        :return: A dictionary with the name, description, wine type, and the
                 available feature and target names.
        """
        info = {
            "name": self.dataset_name,
            "description": "Dataset containing physicochemical attributes and quality ratings of wines.",
            "wine_type": self.wine_type,
        }
        info["features"] = self.list_features()
        info["targets"] = self.list_targets()
        return info
@@ -0,0 +1,45 @@
1
+
2
class DatasetRegistry:
    """
    Name-to-class lookup table for the datasets available in CheLo.
    """
    # Shared by the whole process: maps a class name to the dataset class.
    _datasets = {}

    @classmethod
    def register(cls, dataset_cls):
        """
        Add a dataset class to the registry under its class name.
        :param dataset_cls: The dataset class to register.
        :raises ValueError: If a class with the same name is already registered.
        """
        key = dataset_cls.__name__
        if key not in cls._datasets:
            cls._datasets[key] = dataset_cls
            return
        raise ValueError(f"Dataset {key} is already registered.")

    @classmethod
    def list_datasets(cls):
        """
        Names of every registered dataset, in registration order.
        :return: A list of names of registered datasets.
        """
        return list(cls._datasets)

    @classmethod
    def get_dataset(cls, name, **kwargs):
        """
        Instantiate a registered dataset.
        :param name: Name of the dataset to retrieve.
        :param kwargs: Additional arguments to pass to the dataset constructor.
        :return: An instance of the dataset.
        :raises ValueError: If no dataset is registered under that name.
        """
        if name in cls._datasets:
            dataset_cls = cls._datasets[name]
            return dataset_cls(**kwargs)
        raise ValueError(f"Dataset {name} not found! Available datasets: {cls.list_datasets()}")
38
+
39
+
40
def register_dataset(cls):
    """
    Class decorator that adds the decorated dataset class to DatasetRegistry.

    :param cls: The dataset class being decorated.
    :return: The same class, unchanged, so the decorated name stays usable.
    """
    # Registration happens once, when the decorated class is defined.
    DatasetRegistry.register(cls)
    return cls
File without changes
@@ -0,0 +1,118 @@
1
+ import os
2
+ import hashlib
3
+ import requests
4
+ from tqdm import tqdm
5
+ import tarfile
6
+ import zipfile
7
+
8
class DatasetDownloader:
    """
    Utility class for downloading and caching datasets.

    Files are stored as ``<cache_dir>/<dataset_name>/<filename>``.
    """

    # Block size used when streaming a download to disk and when hashing a file.
    _CHUNK_SIZE = 64 * 1024

    def __init__(self, cache_dir=None):
        """
        Initialize the downloader with an optional cache directory.
        :param cache_dir: Directory to store downloaded datasets. Falls back to
                          the CHELO_DATASET_CACHE environment variable, then to
                          ~/.chelo_datasets.
        """
        self.cache_dir = cache_dir or os.getenv("CHELO_DATASET_CACHE", os.path.expanduser("~/.chelo_datasets"))
        os.makedirs(self.cache_dir, exist_ok=True)

    def _get_dataset_dir(self, dataset_name):
        """
        Get the directory path for a specific dataset, creating it if needed.
        :param dataset_name: Name of the dataset.
        :return: Path to the dataset's directory.
        """
        dataset_dir = os.path.join(self.cache_dir, dataset_name)
        os.makedirs(dataset_dir, exist_ok=True)
        return dataset_dir

    def _get_file_path(self, dataset_name, filename):
        """
        Get the full path for a file in the dataset's directory.
        :param dataset_name: Name of the dataset.
        :param filename: Name of the file.
        :return: Full path to the file.
        """
        dataset_dir = self._get_dataset_dir(dataset_name)
        return os.path.join(dataset_dir, filename)

    def download(self, url, dataset_name, filename=None, checksum=None, timeout=30):
        """
        Download a file for a specific dataset and save it in the dataset's folder.

        A file that is already cached is returned without touching the network,
        unless a checksum is given and the cached copy does not match it.
        :param url: URL of the file to download.
        :param dataset_name: Name of the dataset.
        :param filename: Local filename (default: inferred from the URL).
        :param checksum: Expected checksum (MD5 or SHA256) to validate the file (optional).
        :param timeout: Seconds to wait for the server before giving up.
        :return: Path to the downloaded file.
        :raises ValueError: If the downloaded file does not match the checksum.
        """
        filename = filename or os.path.basename(url)
        file_path = self._get_file_path(dataset_name, filename)

        # Check if the file already exists
        if os.path.exists(file_path):
            if checksum and not self._verify_checksum(file_path, checksum):
                print("Checksum mismatch! Redownloading the file.")
            else:
                return file_path

        print(f"Downloading '{filename}' for dataset '{dataset_name}' from {url}...")

        # Stream into a temporary sibling file so that an interrupted or
        # corrupt download never ends up at the cache path, where a later
        # call without a checksum would treat it as a valid cache hit.
        partial_path = file_path + ".part"
        try:
            # The timeout prevents an unresponsive server from blocking forever;
            # the context manager releases the connection when done.
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()
                # A missing content-length header becomes None (unknown total).
                total_bytes = int(response.headers.get("content-length", 0)) or None
                with open(partial_path, "wb") as file, tqdm(
                    total=total_bytes,
                    unit="B",
                    unit_scale=True,
                    desc=f"Downloading {filename}",
                ) as progress:
                    for chunk in response.iter_content(chunk_size=self._CHUNK_SIZE):
                        file.write(chunk)
                        progress.update(len(chunk))

            # Verify before publishing the file into the cache.
            if checksum and not self._verify_checksum(partial_path, checksum):
                raise ValueError(f"Checksum verification failed for '{filename}'.")

            os.replace(partial_path, file_path)
        finally:
            # Only still present when something above failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)

        print(f"File downloaded and saved at '{file_path}'.")
        return file_path

    def _verify_checksum(self, file_path, checksum):
        """
        Verify the checksum of a file.

        The file is hashed in blocks so large files are not loaded into memory
        at once; the comparison ignores letter case and surrounding whitespace.
        :param file_path: Path to the file.
        :param checksum: Expected checksum as a hex string. A 64-character value
                         is treated as SHA256, anything else as MD5.
        :return: True if the checksum matches, False otherwise.
        """
        expected = checksum.strip().lower()
        hash_func = hashlib.sha256 if len(expected) == 64 else hashlib.md5
        digest = hash_func()
        with open(file_path, "rb") as file:
            for block in iter(lambda: file.read(self._CHUNK_SIZE), b""):
                digest.update(block)
        return digest.hexdigest() == expected

    @staticmethod
    def _safe_extract_tar(tar_ref, extract_to):
        """
        Extract an open tar archive, refusing members that would land outside extract_to.
        :param tar_ref: An open tarfile.TarFile object.
        :param extract_to: Destination directory.
        :raises ValueError: If a member's path resolves outside the destination.
        """
        root = os.path.realpath(extract_to)
        for member in tar_ref.getmembers():
            target = os.path.realpath(os.path.join(root, member.name))
            if os.path.commonpath([root, target]) != root:
                raise ValueError(f"Unsafe path in archive: '{member.name}'")

        if hasattr(tarfile, "data_filter"):
            # Interpreters with extraction filters additionally reject unsafe
            # links and special files.
            tar_ref.extractall(extract_to, filter="data")
        else:
            tar_ref.extractall(extract_to)

    def extract_file(self, file_path, extract_to=None):
        """
        Extract a compressed file (.zip or .tar.gz).
        :param file_path: Path to the compressed file.
        :param extract_to: Directory to extract to (default: file_path without
                           its last extension, so "x.tar.gz" goes to "x.tar").
        :return: Path to the extracted directory.
        :raises ValueError: If the format is unsupported or a tar member would
                            be written outside the destination directory.
        """
        is_zip = file_path.endswith(".zip")
        is_tar_gz = file_path.endswith(".tar.gz")
        # Reject unknown formats before creating any directory.
        if not (is_zip or is_tar_gz):
            raise ValueError(f"Unsupported file format: {file_path}")

        extract_to = extract_to or os.path.splitext(file_path)[0]
        os.makedirs(extract_to, exist_ok=True)

        if is_zip:
            with zipfile.ZipFile(file_path, "r") as zip_ref:
                zip_ref.extractall(extract_to)
        else:
            with tarfile.open(file_path, "r:gz") as tar_ref:
                self._safe_extract_tar(tar_ref, extract_to)

        print(f"File extracted to '{extract_to}'.")
        return extract_to
@@ -0,0 +1,50 @@
1
+ import pytest
2
+ import os
3
+ import numpy as np
4
+ from chelo.base import CheLoDataset
5
+ from chelo.utils.downloader import DatasetDownloader
6
+
7
+ # Mock Base Class Implementation for Testing
8
class MockDataset(CheLoDataset):
    """Small in-memory CheLoDataset with three samples, used to exercise the base class."""

    def __init__(self, selected_features=None, selected_targets=None):
        super().__init__(selected_features, selected_targets)
        self.dataset_name = "Mock Dataset"

    def load_data(self):
        """Fill the raw columns with fixed values, then apply the requested selections."""
        temperature = [300, 310, 320]
        pressure = [101.3, 98.6, 102.5]
        self.raw_features = {"Temperature": temperature, "Pressure": pressure}
        self.raw_targets = {"Reaction Rate": [1.2, 1.5, 1.3]}
        self._apply_initial_selections()

    def list_features(self):
        """Return the names of all raw feature columns."""
        return list(self.raw_features)

    def list_targets(self):
        """Return the names of all raw target columns."""
        return list(self.raw_targets)

    def get_dataset_info(self):
        """Return the name, a description, and the size reported by the base class."""
        return dict(
            name=self.dataset_name,
            description="A mock dataset for testing.",
            size=self.size(),
        )
33
+
34
+ # Test Cases for Base Class
35
def test_mock_dataset():
    """Check metadata, column listing, and selection on the mock dataset."""
    dataset = MockDataset(selected_features=["Temperature"], selected_targets=["Reaction Rate"])
    dataset.load_data()

    # Metadata and the full (unselected) column listings.
    info = dataset.get_dataset_info()
    assert info["name"] == "Mock Dataset"
    assert dataset.list_features() == ["Temperature", "Pressure"]
    assert dataset.list_targets() == ["Reaction Rate"]

    # Only the selected feature and target should reach the numpy export.
    n_samples = 3
    features, targets = dataset.to_numpy()
    assert features.shape == (n_samples, 1)
    assert targets.shape == (n_samples, 1)
48
+
49
# Allow running this test module directly as a script.
if __name__ == "__main__":
    pytest.main()