congrads 0.1.0-py3-none-any.whl → 1.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- congrads/__init__.py +21 -13
- congrads/checkpoints.py +232 -0
- congrads/constraints.py +728 -316
- congrads/core.py +525 -139
- congrads/datasets.py +273 -516
- congrads/descriptor.py +95 -30
- congrads/metrics.py +185 -38
- congrads/networks.py +51 -28
- congrads/requirements.txt +6 -0
- congrads/transformations.py +139 -0
- congrads/utils.py +710 -0
- congrads-1.0.1.dist-info/LICENSE +26 -0
- congrads-1.0.1.dist-info/METADATA +208 -0
- congrads-1.0.1.dist-info/RECORD +16 -0
- {congrads-0.1.0.dist-info → congrads-1.0.1.dist-info}/WHEEL +1 -1
- congrads/learners.py +0 -233
- congrads-0.1.0.dist-info/LICENSE +0 -34
- congrads-0.1.0.dist-info/METADATA +0 -196
- congrads-0.1.0.dist-info/RECORD +0 -13
- {congrads-0.1.0.dist-info → congrads-1.0.1.dist-info}/top_level.txt +0 -0
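The largest change below is in congrads/datasets.py: the dataset constructors now take `transform` as a required positional `Callable` (in 0.1.0 it was an optional keyword argument), and the Lightning `DataModule` wrappers and built-in transform pipelines were removed. A minimal sketch of calling code against the 1.0.1 signature follows; the identity transform and DataLoader settings are illustrative only, and a real transform must return a DataFrame whose columns are grouped under "Input" and "Output", since `_load_data` selects those groups.

import pandas as pd
from torch.utils.data import DataLoader

from congrads.datasets import BiasCorrection


def my_transform(df: pd.DataFrame) -> pd.DataFrame:
    # Placeholder: real pre-processing goes here; the returned frame must
    # expose "Input"/"Output" column groups (e.g. via a MultiIndex).
    return df


# transform is now a required positional argument; download stays optional.
dataset = BiasCorrection("./datasets", my_transform, download=True)
loader = DataLoader(dataset, batch_size=32, shuffle=True)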
congrads/datasets.py
CHANGED
@@ -1,34 +1,80 @@
+"""
+This module defines several PyTorch dataset classes for loading and
+working with various datasets. Each dataset class extends the
+`torch.utils.data.Dataset` class and provides functionality for
+downloading, loading, and transforming specific datasets.
+
+Classes:
+
+- BiasCorrection: A dataset class for the Bias Correction dataset
+  focused on temperature forecast data.
+- FamilyIncome: A dataset class for the Family Income and
+  Expenditure dataset.
+- NoisySines: A dataset class that generates noisy sine wave
+  samples with added Gaussian noise.
+
+Each dataset class provides methods for downloading the data
+(if not already available), checking the integrity of the dataset, loading
+the data from CSV files or generating synthetic data, and applying
+transformations to the data.
+
+Key Methods:
+
+- `__init__`: Initializes the dataset by specifying the root directory,
+  transformation function, and optional download flag.
+- `__getitem__`: Retrieves a specific data point given its index,
+  returning input-output pairs.
+- `__len__`: Returns the total number of examples in the dataset.
+- `download`: Downloads and extracts the dataset from
+  the specified mirrors.
+- `_load_data`: Loads the dataset from CSV files and
+  applies transformations.
+- `_check_exists`: Checks if the dataset is already
+  downloaded and verified.
+
+Each dataset class allows the user to apply custom transformations to the
+dataset through the `transform` argument to allow pre-processing and offers
+the ability to download the dataset if it's not already present on
+the local disk.
+"""
+
 import os
+from pathlib import Path
+from typing import Callable, Union
 from urllib.error import URLError
+
 import numpy as np
-from pathlib import Path
-from typing import Callable, Optional, Union
 import pandas as pd
-import lightning as L
-from torch.utils.data import Dataset, random_split, DataLoader
 import torch
-
-from torchvision.datasets.utils import
+from torch.utils.data import Dataset
+from torchvision.datasets.utils import (
+    check_integrity,
+    download_and_extract_archive,
+)
 
 
 class BiasCorrection(Dataset):
     """
-    Bias Correction
+    A dataset class for accessing the Bias Correction dataset.
 
-    This class
-
-
-
+    This class extends the `Dataset` class and provides functionality for
+    downloading, loading, and transforming the Bias Correction dataset.
+    The dataset is focused on temperature forecast data and is made available
+    for use with PyTorch. If `download` is set to True, the dataset will be
+    downloaded if it is not already available. The data is then loaded,
+    and a transformation function is applied to it.
 
     Args:
-        root (str
-
-
-
-
-
-
-
+        root (Union[str, Path]): The root directory where the dataset
+            will be stored or loaded from.
+        transform (Callable): A function to transform the dataset
+            (e.g., preprocessing).
+        download (bool, optional): Whether to download the dataset if it's
+            not already present. Defaults to False.
+
+    Raises:
+        RuntimeError: If the dataset is not found and `download`
+            is not set to True or if all mirrors fail to provide the dataset.
     """
 
     mirrors = [
@@ -37,6 +83,7 @@ class BiasCorrection(Dataset):
 
     resources = [
         (
+            # pylint: disable-next=line-too-long
             "bias+correction+of+numerical+prediction+model+temperature+forecast.zip",
             "3deee56d461a2686887c4ae38fe3ccf3",
         ),
@@ -45,16 +92,11 @@ class BiasCorrection(Dataset):
     def __init__(
         self,
         root: Union[str, Path],
+        transform: Callable,
         download: bool = False,
-        transform: Optional[Callable] = None,
     ) -> None:
         """
-
-
-        Args:
-            root (Union[str, Path]): The root directory where the dataset will be stored or accessed.
-            download (bool, optional): If True, downloads the dataset from the provided mirrors. Default is False.
-            transform (Optional[Callable], optional): A function to transform the data before loading. Default is None.
+        Constructor method to initialize the dataset.
         """
 
         super().__init__()
@@ -73,22 +115,20 @@ class BiasCorrection(Dataset):
 
     def _load_data(self):
         """
-        Loads the dataset from the CSV file
+        Loads the dataset from the CSV file and applies the transformation.
+
+        The data is read from the `Bias_correction_ucl.csv` file, and the
+        transformation function is applied to it.
+        The input and output data are separated and returned as numpy arrays.
 
         Returns:
-            Tuple: A tuple containing
+            Tuple[numpy.ndarray, numpy.ndarray]: A tuple containing the input
+                and output data as numpy arrays.
         """
 
-
-
-
-            .pipe(self.transform)
-            .pipe(self.add_input_output_temperature)
-        )
-        else:
-            data: pd.DataFrame = pd.read_csv(
-                os.path.join(self.data_folder, "Bias_correction_ucl.csv")
-            ).pipe(self.add_input_output_temperature)
+        data: pd.DataFrame = pd.read_csv(
+            os.path.join(self.data_folder, "Bias_correction_ucl.csv")
+        ).pipe(self.transform)
 
         data_input = data["Input"].to_numpy(dtype=np.float32)
         data_output = data["Output"].to_numpy(dtype=np.float32)
@@ -100,20 +140,22 @@ class BiasCorrection(Dataset):
         Returns the number of examples in the dataset.
 
         Returns:
-            int: The
+            int: The number of examples in the dataset
+                (i.e., the number of rows in the input data).
         """
-
+
         return self.data_input.shape[0]
 
     def __getitem__(self, idx):
         """
-
+        Returns the input-output pair for a given index.
 
         Args:
             idx (int): The index of the example to retrieve.
 
         Returns:
-            Tuple
+            Tuple[torch.Tensor, torch.Tensor]: The input-output pair
+                as PyTorch tensors.
         """
 
         example = self.data_input[idx, :]
@@ -128,17 +170,21 @@ class BiasCorrection(Dataset):
         Returns the path to the folder where the dataset is stored.
 
         Returns:
-            str: The path to the dataset folder
+            str: The path to the dataset folder.
         """
 
         return os.path.join(self.root, self.__class__.__name__)
 
     def _check_exists(self) -> bool:
         """
-        Checks if the dataset
+        Checks if the dataset is already downloaded and verified.
+
+        This method checks that all required files exist and
+        their integrity is validated via MD5 checksums.
 
         Returns:
-            bool: True if
+            bool: True if all resources exist and their
+                integrity is valid, False otherwise.
         """
 
         return all(
@@ -148,10 +194,14 @@ class BiasCorrection(Dataset):
 
     def download(self) -> None:
         """
-        Downloads
+        Downloads and extracts the dataset.
+
+        This method attempts to download the dataset from the mirrors and
+        extract it into the appropriate folder. If any error occurs during
+        downloading, it will try each mirror in sequence.
 
-
-
+        Raises:
+            RuntimeError: If all mirrors fail to provide the dataset.
         """
 
         if self._check_exists():
@@ -166,7 +216,10 @@ class BiasCorrection(Dataset):
                 url = f"{mirror}{filename}"
                 try:
                     download_and_extract_archive(
-                        url,
+                        url,
+                        download_root=self.data_folder,
+                        filename=filename,
+                        md5=md5,
                     )
                 except URLError as e:
                     errors.append(e)
@@ -178,212 +231,41 @@ class BiasCorrection(Dataset):
                     s += f"Tried {mirror}, got:\n{str(err)}\n"
                 raise RuntimeError(s)
 
-    @staticmethod
-    def add_input_output_temperature(df: pd.DataFrame) -> pd.DataFrame:
-        """Add a multiindex denoting if the column is an input or output variable."""
-        # copy the dataframe
-        temp_df = df.copy()
-        # extract all the column names
-        column_names = temp_df.columns.tolist()
-        # only the last 2 columns are output variables, all others are input variables. So make list of corresponding lengths of 'Input' and 'Output'
-        input_list = ["Input"] * (len(column_names) - 2)
-        output_list = ["Output"] * 2
-        # concat both lists
-        input_output_list = input_list + output_list
-        # define multi index for attaching this 'Input' and 'Output' list with the column names already existing
-        multiindex_bias = pd.MultiIndex.from_arrays([input_output_list, column_names])
-        # transpose such that index can be adjusted to multi index
-        new_df = pd.DataFrame(df.transpose().to_numpy(), index=multiindex_bias)
-        # transpose back such that columns are the same as before except with different labels
-        return new_df.transpose()
-
-
-class BCDataModule(L.LightningDataModule):
-    """Bias Correction dataset module."""
-
-    def __init__(
-        self,
-        dataset_directory: str = "./datasets",
-        batch_size: int = 32,
-        train_size: float = 0.8,
-        val_size: float = 0.1,
-        test_size: float = 0.1,
-        shuffle_train: bool = True,
-        shuffle_val: bool = False,
-        shuffle_test: bool = False,
-        num_workers: int = 4,
-        pin_memory: bool = False,
-    ) -> None:
-        super().__init__()
-        # Define required parameters here
-        self.batch_size = batch_size
-        self.dataset_directory = dataset_directory
-        self.train_size = train_size
-        self.val_size = val_size
-        self.test_size = test_size
-        self.shuffle_train = shuffle_train
-        self.shuffle_val = shuffle_val
-        self.shuffle_test = shuffle_test
-        self.num_workers = num_workers
-        self.pin_memory = pin_memory
-
-    def prepare_data(self):
-        # Define steps that should be done
-        # on only one GPU, like getting data.
-        BiasCorrection(self.dataset_directory, download=True, transform=self.transform)
-
-    def setup(self, stage=None):
-        # Define steps that should be done on
-        # every GPU, like splitting data, applying
-        # transform etc.
-        data = BiasCorrection(self.dataset_directory, transform=self.transform)
-
-        train_val_data, self.test_data = random_split(
-            data, [1 - self.test_size, self.test_size]
-        )
-        self.train_data, self.val_data = random_split(
-            train_val_data,
-            [
-                self.train_size / (1 - self.test_size),
-                self.val_size / (1 - self.test_size),
-            ],
-        )
-
-    def train_dataloader(self):
-        # Return DataLoader for Training Data here
-        return DataLoader(
-            self.train_data,
-            batch_size=self.batch_size,
-            shuffle=self.shuffle_train,
-            num_workers=self.num_workers,
-            pin_memory=self.pin_memory,
-        )
-
-    def val_dataloader(self):
-        # Return DataLoader for Validation Data here
-        return DataLoader(
-            self.val_data,
-            batch_size=self.batch_size,
-            shuffle=self.shuffle_val,
-            num_workers=self.num_workers,
-            pin_memory=self.pin_memory,
-        )
-
-    def test_dataloader(self):
-        # Return DataLoader for Testing Data here
-        return DataLoader(
-            self.test_data,
-            batch_size=self.batch_size,
-            shuffle=self.shuffle_test,
-            num_workers=self.num_workers,
-            pin_memory=self.pin_memory,
-        )
-
-    @staticmethod
-    def transform(df: pd.DataFrame) -> pd.DataFrame:
-        def date_to_datetime(df: pd.DataFrame) -> pd.DataFrame:
-            """Transform the string that denotes the date to the datetime format in pandas."""
-            # make copy of dataframe
-            df_temp = df.copy()
-            # add new column at the front where the date string is transformed to the datetime format
-            df_temp.insert(0, "DateTransformed", pd.to_datetime(df_temp["Date"]))
-            return df_temp
-
-        def add_year(df: pd.DataFrame) -> pd.DataFrame:
-            """Extract the year from the datetime cell and add it as a new column to the dataframe at the front."""
-            # make copy of dataframe
-            df_temp = df.copy()
-            # extract year and add new column at the front containing these numbers
-            df_temp.insert(0, "Year", df_temp["DateTransformed"].dt.year)
-            return df_temp
-
-        def add_month(df: pd.DataFrame) -> pd.DataFrame:
-            """Extract the month from the datetime cell and add it as a new column to the dataframe at the front."""
-            # make copy of dataframe
-            df_temp = df.copy()
-            # extract month and add new column at index 1 containing these numbers
-            df_temp.insert(1, "Month", df_temp["DateTransformed"].dt.month)
-            return df_temp
-
-        def add_day(df: pd.DataFrame) -> pd.DataFrame:
-            """Extract the day from the datetime cell and add it as a new column to the dataframe at the front."""
-            # make copy of dataframe
-            df_temp = df.copy()
-            # extract day and add new column at index 2 containing these numbers
-            df_temp.insert(2, "Day", df_temp["DateTransformed"].dt.day)
-            return df_temp
-
-        def normalize_columns_bias(df: pd.DataFrame) -> pd.DataFrame:
-            """Normalize the columns for the bias correction dataset. This is different from normalizing all the columns separately because the
-            upper and lower bounds for the output variables are assumed to be the same.
-            """
-            # copy the dataframe
-            temp_df = df.copy()
-            # normalize each column
-            for feature_name in df.columns:
-                # the output columns are normalized using the same upper and lower bound for more efficient check of the inequality
-                if feature_name == "Next_Tmax" or feature_name == "Next_Tmin":
-                    max_value = 38.9
-                    min_value = 11.3
-                # the input columns are normalized using their respective upper and lower bounds
-                else:
-                    max_value = df[feature_name].max()
-                    min_value = df[feature_name].min()
-                temp_df[feature_name] = (df[feature_name] - min_value) / (
-                    max_value - min_value
-                )
-            return temp_df
-
-        def sample_2500_examples(df: pd.DataFrame) -> pd.DataFrame:
-            """Sample 2500 examples from the dataframe without replacement."""
-            temp_df = df.copy()
-            sample_df = temp_df.sample(n=2500, replace=False, random_state=3, axis=0)
-            return sample_df
-
-        return (
-            # drop missing values
-            df.dropna(how="any")
-            # transform string date to datetime format
-            .pipe(date_to_datetime)
-            # add year as a single column
-            .pipe(add_year)
-            # add month as a single column
-            .pipe(add_month)
-            # add day as a single column
-            .pipe(add_day)
-            # remove original date string and the datetime format
-            .drop(["Date", "DateTransformed"], axis=1, inplace=False)
-            # convert all numbers to float32
-            .astype("float32")
-            # normalize columns
-            .pipe(normalize_columns_bias)
-            # sample 2500 examples out of the dataset
-            .pipe(sample_2500_examples)
-        )
 
+class FamilyIncome(Dataset):
+    """
+    A dataset class for accessing the Family Income and Expenditure dataset.
 
-class
-
+    This class extends the `Dataset` class and provides functionality for
+    downloading, loading, and transforming the Family Income and
+    Expenditure dataset. The dataset is intended for use with
+    PyTorch-based projects, offering a convenient interface for data handling.
+    This class provides access to the Family Income and Expenditure dataset
+    for use with PyTorch. If `download` is set to True, the dataset will be
+    downloaded if it is not already available. The data is then loaded,
+    and a user-defined transformation function is applied to it.
 
     Args:
-        root (str
-
-
-
-
-
-
-
-
+        root (Union[str, Path]): The root directory where the dataset will
+            be stored or loaded from.
+        transform (Callable): A function to transform the dataset
+            (e.g., preprocessing).
+        download (bool, optional): Whether to download the dataset if it's
+            not already present. Defaults to False.
+
+    Raises:
+        RuntimeError: If the dataset is not found and `download`
+            is not set to True or if all mirrors fail to provide the dataset.
     """
 
     mirrors = [
-
+        # pylint: disable-next=line-too-long
+        "https://www.kaggle.com/api/v1/datasets/download/grosvenpaul/family-income-and-expenditure",
     ]
 
     resources = [
         (
-            "
+            "archive.zip",
             "7d74bc7facc3d7c07c4df1c1c6ac563e",
         ),
     ]
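The hunk above also removes the 0.1.0 helper `add_input_output_temperature` and the built-in `transform` pipeline, so labelling columns as inputs and outputs is now left to the user-supplied transform. A rough user-side equivalent of the removed multi-index labelling is sketched below; the two-output-column assumption mirrors the deleted helper, everything else is illustrative.

import pandas as pd


def label_input_output(df: pd.DataFrame, n_outputs: int = 2) -> pd.DataFrame:
    # Group columns under an ("Input" | "Output", name) MultiIndex, treating
    # the last n_outputs columns as outputs, as the removed helper did.
    columns = df.columns.tolist()
    groups = ["Input"] * (len(columns) - n_outputs) + ["Output"] * n_outputs
    labelled = df.copy()
    labelled.columns = pd.MultiIndex.from_arrays([groups, columns])
    return labelled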
@@ -391,9 +273,13 @@ class FiniteIncome(Dataset):
     def __init__(
         self,
         root: Union[str, Path],
+        transform: Callable,
         download: bool = False,
-        transform: Optional[Callable] = None,
     ) -> None:
+        """
+        Constructor method to initialize the dataset.
+        """
+
         super().__init__()
         self.root = root
         self.transform = transform
@@ -403,21 +289,28 @@ class FiniteIncome(Dataset):
 
         if not self._check_exists():
             raise RuntimeError(
-                "Dataset not found. You can use download=True to download it"
+                "Dataset not found. You can use download=True to download it."
            )
 
         self.data_input, self.data_output = self._load_data()
 
     def _load_data(self):
+        """
+        Loads the Family Income and Expenditure dataset from the CSV file
+        and applies the transformation.
 
-
-
-
-
-
-
-
+        The data is read from the `Family Income and Expenditure.csv` file,
+        and the transformation function is applied to it. The input and
+        output data are separated and returned as numpy arrays.
+
+        Returns:
+            Tuple[numpy.ndarray, numpy.ndarray]: A tuple containing the input
+                and output data as numpy arrays.
+        """
+
+        data: pd.DataFrame = pd.read_csv(
+            os.path.join(self.data_folder, "Family Income and Expenditure.csv")
+        ).pipe(self.transform)
 
         data_input = data["Input"].to_numpy(dtype=np.float32)
         data_output = data["Output"].to_numpy(dtype=np.float32)
@@ -425,9 +318,28 @@ class FiniteIncome(Dataset):
         return data_input, data_output
 
     def __len__(self):
+        """
+        Returns the number of examples in the dataset.
+
+        Returns:
+            int: The number of examples in the dataset
+                (i.e., the number of rows in the input data).
+        """
+
         return self.data_input.shape[0]
 
     def __getitem__(self, idx):
+        """
+        Returns the input-output pair for a given index.
+
+        Args:
+            idx (int): The index of the example to retrieve.
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: The input-output pair
+                as PyTorch tensors.
+        """
+
         example = self.data_input[idx, :]
         target = self.data_output[idx, :]
         example = torch.tensor(example)
@@ -436,16 +348,43 @@ class FiniteIncome(Dataset):
 
     @property
     def data_folder(self) -> str:
+        """
+        Returns the path to the folder where the dataset is stored.
+
+        Returns:
+            str: The path to the dataset folder.
+        """
+
         return os.path.join(self.root, self.__class__.__name__)
 
     def _check_exists(self) -> bool:
+        """
+        Checks if the dataset is already downloaded and verified.
+
+        This method checks that all required files exist and
+        their integrity is validated via MD5 checksums.
+
+        Returns:
+            bool: True if all resources exist and their
+                integrity is valid, False otherwise.
+        """
+
         return all(
             check_integrity(os.path.join(self.data_folder, file_path), checksum)
             for file_path, checksum in self.resources
         )
 
     def download(self) -> None:
-        """
+        """
+        Downloads and extracts the dataset.
+
+        This method attempts to download the dataset from the mirrors
+        and extract it into the appropriate folder. If any error occurs
+        during downloading, it will try each mirror in sequence.
+
+        Raises:
+            RuntimeError: If all mirrors fail to provide the dataset.
+        """
 
         if self._check_exists():
             return
@@ -456,10 +395,13 @@ class FiniteIncome(Dataset):
         for filename, md5 in self.resources:
             errors = []
             for mirror in self.mirrors:
-                url = f"{mirror}
+                url = f"{mirror}"
                 try:
                     download_and_extract_archive(
-                        url,
+                        url,
+                        download_root=self.data_folder,
+                        filename=filename,
+                        md5=md5,
                     )
                 except URLError as e:
                     errors.append(e)
@@ -471,272 +413,87 @@ class FiniteIncome(Dataset):
                     s += f"Tried {mirror}, got:\n{str(err)}\n"
                 raise RuntimeError(s)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            new_df = pd.DataFrame(df.transpose().to_numpy(), index=multiindex_bias)
-            # transpose back such that columns are the same as before except with different labels
-            return new_df.transpose()
-
-
-class FIDataModule(L.LightningDataModule):
-    """Finite Income dataset module."""
+
+class NoisySines(Dataset):
+    """
+    A PyTorch dataset generating samples from a causal
+    sine wave with added noise.
+
+    Args:
+        length (int): Number of data points in the dataset.
+        amplitude (float): Amplitude of the sine wave.
+        frequency (float): Frequency of the sine wave in Hz.
+        noise_std (float): Standard deviation of the Gaussian noise.
+        bias (float): Offset from zero.
+
+    The sine wave is zero for times before t=0 and follows a
+    standard sine wave after t=0, with Gaussian noise added to all points.
+    """
 
     def __init__(
         self,
-
-
-
-
-
-
-
-
-
-
-
-
-
-        self.
-        self.
-        self.
-        self.val_size = val_size
-        self.test_size = test_size
-        self.shuffle_train = shuffle_train
-        self.shuffle_val = shuffle_val
-        self.shuffle_test = shuffle_test
-        self.num_workers = num_workers
-        self.pin_memory = pin_memory
-
-    def prepare_data(self):
-        # Define steps that should be done
-        # on only one GPU, like getting data.
-        # TODO downloading currently disabled since not compatible with api
-        # FiniteIncome(self.dataset_directory, download=True, transform=self.transform)
-        pass
-
-    def setup(self, stage=None):
-        # Define steps that should be done on
-        # every GPU, like splitting data, applying
-        # transform etc.
-        data = FiniteIncome(self.dataset_directory, transform=self.transform)
-
-        train_val_data, self.test_data = random_split(
-            data, [1 - self.test_size, self.test_size]
-        )
-        self.train_data, self.val_data = random_split(
-            train_val_data,
-            [
-                self.train_size / (1 - self.test_size),
-                self.val_size / (1 - self.test_size),
-            ],
-        )
+        length,
+        amplitude=1,
+        frequency=10.0,
+        noise_std=0.05,
+        bias=0,
+        random_seed=42,
+    ):
+        """
+        Initializes the NoisyCausalSine dataset.
+        """
+        self.length = length
+        self.amplitude = amplitude
+        self.frequency = frequency
+        self.noise_std = noise_std
+        self.bias = bias
+        self.random_seed = random_seed
 
-
-
-
-            self.train_data,
-            batch_size=self.batch_size,
-            shuffle=self.shuffle_train,
-            num_workers=self.num_workers,
-            pin_memory=self.pin_memory,
-        )
+        np.random.seed(self.random_seed)
+        self.time = np.linspace(0, 1, length)
+        self.noise = np.random.normal(0, self.noise_std, length)
 
-    def
-
-
-            self.val_data,
-            batch_size=self.batch_size,
-            shuffle=self.shuffle_val,
-            num_workers=self.num_workers,
-            pin_memory=self.pin_memory,
-        )
+    def __getitem__(self, idx):
+        """
+        Returns the time and noisy sine wave value for a given index.
 
-
-
-        return DataLoader(
-            self.test_data,
-            batch_size=self.batch_size,
-            shuffle=self.shuffle_test,
-            num_workers=self.num_workers,
-            pin_memory=self.pin_memory,
-        )
+        Args:
+            idx (int): Index of the data point to retrieve.
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-            return temp_df
-
-        def check_constraints_income(df: pd.DataFrame) -> pd.DataFrame:
-            """Check if all the constraints are satisfied for the dataframe and remove the examples that do not satisfy the constraint. This
-            function only works for the Family Income dataset and the constraints are that the household income is larger than all the expenses
-            and the food expense is larger than the sum of the other (more detailed) food expenses.
-            """
-            temp_df = df.copy()
-            # check that household income is larger than expenses in the output
-            input_array = temp_df["Input"].to_numpy()
-            income_array = np.add(
-                np.multiply(
-                    input_array[:, [0, 1]],
-                    np.subtract(
-                        np.asarray([11815988, 9234485]), np.asarray([11285, 0])
-                    ),
-                ),
-                np.asarray([11285, 0]),
-            )
-            expense_array = temp_df["Output"].to_numpy()
-            expense_array = np.add(
-                np.multiply(
-                    expense_array,
-                    np.subtract(
-                        np.asarray(
-                            [
-                                791848,
-                                437467,
-                                140992,
-                                74800,
-                                2188560,
-                                1049275,
-                                149940,
-                                731000,
-                            ]
-                        ),
-                        np.asarray([3704, 0, 0, 0, 1950, 0, 0, 0]),
-                    ),
-                ),
-                np.asarray([3704, 0, 0, 0, 1950, 0, 0, 0]),
-            )
-            expense_array_without_dup = expense_array[:, [0, 4, 5, 6, 7]]
-            sum_expenses = np.sum(expense_array_without_dup, axis=1)
-            total_income = np.sum(income_array, axis=1)
-            sanity_check_array = np.greater_equal(total_income, sum_expenses)
-            temp_df["Unimportant"] = sanity_check_array.tolist()
-            reduction = temp_df[temp_df.Unimportant]
-            drop_reduction = reduction.drop("Unimportant", axis=1)
-
-            # check that the food expense is larger than all the sub expenses
-            expense_reduced_array = drop_reduction["Output"].to_numpy()
-            expense_reduced_array = np.add(
-                np.multiply(
-                    expense_reduced_array,
-                    np.subtract(
-                        np.asarray(
-                            [
-                                791848,
-                                437467,
-                                140992,
-                                74800,
-                                2188560,
-                                1049275,
-                                149940,
-                                731000,
-                            ]
-                        ),
-                        np.asarray([3704, 0, 0, 0, 1950, 0, 0, 0]),
-                    ),
-                ),
-                np.asarray([3704, 0, 0, 0, 1950, 0, 0, 0]),
-            )
-            food_mul_expense_array = expense_reduced_array[:, [1, 2, 3]]
-            food_mul_expense_array_sum = np.sum(food_mul_expense_array, axis=1)
-            food_expense_array = expense_reduced_array[:, 0]
-            sanity_check_array = np.greater_equal(
-                food_expense_array, food_mul_expense_array_sum
-            )
-            drop_reduction["Unimportant"] = sanity_check_array.tolist()
-            new_reduction = drop_reduction[drop_reduction.Unimportant]
-            satisfied_constraints_df = new_reduction.drop("Unimportant", axis=1)
-
-            return satisfied_constraints_df
-
-        def sample_2500_examples(df: pd.DataFrame) -> pd.DataFrame:
-            """Sample 2500 examples from the dataframe without replacement."""
-            temp_df = df.copy()
-            sample_df = temp_df.sample(n=2500, replace=False, random_state=3, axis=0)
-            return sample_df
-
-        def add_input_output_family_income(df: pd.DataFrame) -> pd.DataFrame:
-            """Add a multiindex denoting if the column is an input or output variable."""
-            # copy the dataframe
-            temp_df = df.copy()
-            # extract all the column names
-            column_names = temp_df.columns.tolist()
-            # the 2nd-9th columns correspond to output variables and all others to input variables. So make list of corresponding lengths of 'Input' and 'Output'
-            input_list_start = ["Input"]
-            input_list_end = ["Input"] * (len(column_names) - 9)
-            output_list = ["Output"] * 8
-            # concat both lists
-            input_output_list = input_list_start + output_list + input_list_end
-            # define multi index for attaching this 'Input' and 'Output' list with the column names already existing
-            multiindex_bias = pd.MultiIndex.from_arrays(
-                [input_output_list, column_names]
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: A tuple containing the
+            time value and the noisy sine wave value.
+        """
+
+        t = self.time[idx]
+        if idx < self.length // 2:
+            sine_value = self.bias
+            cosine_value = self.bias
+        else:
+            sine_value = (
+                self.amplitude * np.sin(2 * np.pi * self.frequency * t)
+                + self.bias
             )
-
-
-
-            return new_df.transpose()
-
-        return (
-            # drop missing values
-            df.dropna(how="any")
-            # convert object to fitting dtype
-            .convert_dtypes()
-            # remove all strings (no other dtypes are present except for integers and floats)
-            .select_dtypes(exclude=["string"])
-            # transform all numbers to same dtype
-            .astype("float32")
-            # drop column with label Agricultural Household indicator because this is not really a numerical input but rather a categorical/classification
-            .drop(["Agricultural Household indicator"], axis=1, inplace=False)
-            # this column is dropped because it depends on Agricultural Household indicator
-            .drop(["Crop Farming and Gardening expenses"], axis=1, inplace=False)
-            # use 8 output variables and 24 input variables
-            .drop(
-                [
-                    "Total Rice Expenditure",
-                    "Total Fish and marine products Expenditure",
-                    "Fruit Expenditure",
-                    "Restaurant and hotels Expenditure",
-                    "Alcoholic Beverages Expenditure",
-                    "Tobacco Expenditure",
-                    "Clothing, Footwear and Other Wear Expenditure",
-                    "Imputed House Rental Value",
-                    "Transportation Expenditure",
-                    "Miscellaneous Goods and Services Expenditure",
-                    "Special Occasions Expenditure",
-                ],
-                axis=1,
-                inplace=False,
+            cosine_value = (
+                self.amplitude * np.cos(2 * np.pi * self.frequency * t)
+                + self.bias
             )
-
-
-
-
-
-
-
-            .
+
+        # Add noise to the signals
+        noisy_sine = sine_value + self.noise[idx]
+        noisy_cosine = cosine_value + self.noise[idx]
+
+        # Convert to tensor
+        example, target = torch.tensor([t], dtype=torch.float32), torch.tensor(
+            [noisy_sine, noisy_cosine], dtype=torch.float32
         )
+        return example, target
+
+    def __len__(self):
+        """
+        Returns the total number of data points in the dataset.
+
+        Returns:
+            int: The length of the dataset.
+        """
+        return self.length
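Based on the `NoisySines` class added above, a short usage sketch follows; the length, batch size, and index are illustrative, not defaults from the package.

from torch.utils.data import DataLoader

from congrads.datasets import NoisySines

# 1000 points with t in [0, 1]; targets are a noisy sine/cosine pair that
# stays at `bias` (plus noise) for the first half of the indices.
dataset = NoisySines(length=1000, amplitude=1.0, frequency=10.0, noise_std=0.05)

t, target = dataset[750]      # tensors of shape (1,) and (2,)
loader = DataLoader(dataset, batch_size=64, shuffle=True)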
|