congrads-0.1.0-py3-none-any.whl → congrads-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- congrads/__init__.py +7 -6
- congrads/constraints.py +182 -300
- congrads/core.py +158 -144
- congrads/datasets.py +12 -559
- congrads/descriptor.py +20 -35
- congrads/metrics.py +37 -52
- congrads/networks.py +5 -6
- congrads/utils.py +310 -0
- congrads-0.2.0.dist-info/LICENSE +26 -0
- congrads-0.2.0.dist-info/METADATA +222 -0
- congrads-0.2.0.dist-info/RECORD +13 -0
- congrads/learners.py +0 -233
- congrads-0.1.0.dist-info/LICENSE +0 -34
- congrads-0.1.0.dist-info/METADATA +0 -196
- congrads-0.1.0.dist-info/RECORD +0 -13
- {congrads-0.1.0.dist-info → congrads-0.2.0.dist-info}/WHEEL +0 -0
- {congrads-0.1.0.dist-info → congrads-0.2.0.dist-info}/top_level.txt +0 -0
congrads/datasets.py
CHANGED
@@ -2,34 +2,15 @@ import os
 from urllib.error import URLError
 import numpy as np
 from pathlib import Path
-from typing import Callable,
+from typing import Callable, Union
 import pandas as pd
-
-from torch.utils.data import Dataset, random_split, DataLoader
+from torch.utils.data import Dataset
 import torch
 
 from torchvision.datasets.utils import check_integrity, download_and_extract_archive
 
 
 class BiasCorrection(Dataset):
-    """
-    Bias Correction Dataset.
-
-    This class provides access to a dataset used for bias correction of numerical prediction models,
-    specifically for temperature forecasts. The dataset is stored in CSV format and can be optionally
-    transformed using a provided function. The class includes methods for downloading, loading, and
-    accessing the dataset, as well as applying transformations.
-
-    Args:
-        root (str or pathlib.Path): Root directory of the dataset where the 'BiasCorrection/Bias_correction_ucl.csv' file is located.
-        download (bool, optional): If True, the dataset will be downloaded from the internet and saved to the root directory.
-            If the dataset already exists, it will not be downloaded again. Default is False.
-        transform (callable, optional): A function/transform that takes a Pandas DataFrame and returns a transformed version.
-            This allows for data preprocessing before it is loaded. Default is None.
-
-    Acknowledgement:
-        This class was developed with inspiration from the MNIST dataset class in torchvision.
-    """
 
     mirrors = [
         "https://archive.ics.uci.edu/static/public/514/",
@@ -45,17 +26,9 @@ class BiasCorrection(Dataset):
     def __init__(
         self,
         root: Union[str, Path],
+        transform: Callable,
         download: bool = False,
-        transform: Optional[Callable] = None,
     ) -> None:
-        """
-        Initializes the BiasCorrection dataset, optionally downloading it if necessary.
-
-        Args:
-            root (Union[str, Path]): The root directory where the dataset will be stored or accessed.
-            download (bool, optional): If True, downloads the dataset from the provided mirrors. Default is False.
-            transform (Optional[Callable], optional): A function to transform the data before loading. Default is None.
-        """
 
         super().__init__()
         self.root = root
@@ -72,23 +45,10 @@ class BiasCorrection(Dataset):
         self.data_input, self.data_output = self._load_data()
 
     def _load_data(self):
-
-
-
-
-            Tuple: A tuple containing two numpy arrays: `data_input` (input data) and `data_output` (output data).
-        """
-
-        if self.transform:
-            data: pd.DataFrame = (
-                pd.read_csv(os.path.join(self.data_folder, "Bias_correction_ucl.csv"))
-                .pipe(self.transform)
-                .pipe(self.add_input_output_temperature)
-            )
-        else:
-            data: pd.DataFrame = pd.read_csv(
-                os.path.join(self.data_folder, "Bias_correction_ucl.csv")
-            ).pipe(self.add_input_output_temperature)
+
+        data: pd.DataFrame = pd.read_csv(
+            os.path.join(self.data_folder, "Bias_correction_ucl.csv")
+        ).pipe(self.transform)
 
         data_input = data["Input"].to_numpy(dtype=np.float32)
         data_output = data["Output"].to_numpy(dtype=np.float32)
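Taken together with the constructor change above, 0.2.0 shifts preprocessing onto the caller: `transform` is now required, and `_load_data` pipes the raw CSV straight through it before selecting the "Input" and "Output" column groups. Below is a minimal sketch of what such a transform could look like, assuming it has to reproduce the column labelling that the removed `add_input_output_temperature` helper (shown further down) used to add; the `label_columns` name and the "./datasets" path are illustrative, not part of the package.

import pandas as pd

from congrads.datasets import BiasCorrection


def label_columns(df: pd.DataFrame) -> pd.DataFrame:
    # Illustrative stand-in for the removed helper: mark the last two columns
    # as "Output" and all preceding columns as "Input" via a column MultiIndex.
    labels = ["Input"] * (len(df.columns) - 2) + ["Output"] * 2
    relabelled = pd.DataFrame(
        df.transpose().to_numpy(),
        index=pd.MultiIndex.from_arrays([labels, df.columns.tolist()]),
    )
    return relabelled.transpose()


# Under the 0.2.0 signature the transform is mandatory and runs on the raw CSV.
dataset = BiasCorrection("./datasets", transform=label_columns, download=True)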
@@ -96,25 +56,10 @@ class BiasCorrection(Dataset):
         return data_input, data_output
 
     def __len__(self):
-        """
-        Returns the number of examples in the dataset.
 
-        Returns:
-            int: The total number of examples in the dataset (i.e., the number of rows in the input data).
-        """
-
         return self.data_input.shape[0]
 
     def __getitem__(self, idx):
-        """
-        Retrieves a single example and its corresponding target from the dataset.
-
-        Args:
-            idx (int): The index of the example to retrieve.
-
-        Returns:
-            Tuple: A tuple containing two tensors: the input example and the target output.
-        """
 
         example = self.data_input[idx, :]
         target = self.data_output[idx, :]
@@ -124,36 +69,16 @@ class BiasCorrection(Dataset):
 
     @property
     def data_folder(self) -> str:
-        """
-        Returns the path to the folder where the dataset is stored.
-
-        Returns:
-            str: The path to the dataset folder within the root directory.
-        """
 
         return os.path.join(self.root, self.__class__.__name__)
 
     def _check_exists(self) -> bool:
-        """
-        Checks if the dataset files exist and their integrity is verified.
-
-        Returns:
-            bool: True if the dataset exists and is valid, otherwise False.
-        """
-
         return all(
             check_integrity(os.path.join(self.data_folder, file_path), checksum)
             for file_path, checksum in self.resources
         )
 
     def download(self) -> None:
-        """
-        Downloads the dataset from the provided mirrors and extracts it.
-
-        The method ensures the dataset is downloaded and extracted to the appropriate folder.
-        If the dataset is already present, it will not be downloaded again.
-        """
-
         if self._check_exists():
             return
 
@@ -178,204 +103,8 @@ class BiasCorrection(Dataset):
             s += f"Tried {mirror}, got:\n{str(err)}\n"
         raise RuntimeError(s)
 
-    @staticmethod
-    def add_input_output_temperature(df: pd.DataFrame) -> pd.DataFrame:
-        """Add a multiindex denoting if the column is an input or output variable."""
-        # copy the dataframe
-        temp_df = df.copy()
-        # extract all the column names
-        column_names = temp_df.columns.tolist()
-        # only the last 2 columns are output variables, all others are input variables. So make list of corresponding lengths of 'Input' and 'Output'
-        input_list = ["Input"] * (len(column_names) - 2)
-        output_list = ["Output"] * 2
-        # concat both lists
-        input_output_list = input_list + output_list
-        # define multi index for attaching this 'Input' and 'Output' list with the column names already existing
-        multiindex_bias = pd.MultiIndex.from_arrays([input_output_list, column_names])
-        # transpose such that index can be adjusted to multi index
-        new_df = pd.DataFrame(df.transpose().to_numpy(), index=multiindex_bias)
-        # transpose back such that columns are the same as before except with different labels
-        return new_df.transpose()
-
-
-class BCDataModule(L.LightningDataModule):
-    """Bias Correction dataset module."""
-
-    def __init__(
-        self,
-        dataset_directory: str = "./datasets",
-        batch_size: int = 32,
-        train_size: float = 0.8,
-        val_size: float = 0.1,
-        test_size: float = 0.1,
-        shuffle_train: bool = True,
-        shuffle_val: bool = False,
-        shuffle_test: bool = False,
-        num_workers: int = 4,
-        pin_memory: bool = False,
-    ) -> None:
-        super().__init__()
-        # Define required parameters here
-        self.batch_size = batch_size
-        self.dataset_directory = dataset_directory
-        self.train_size = train_size
-        self.val_size = val_size
-        self.test_size = test_size
-        self.shuffle_train = shuffle_train
-        self.shuffle_val = shuffle_val
-        self.shuffle_test = shuffle_test
-        self.num_workers = num_workers
-        self.pin_memory = pin_memory
-
-    def prepare_data(self):
-        # Define steps that should be done
-        # on only one GPU, like getting data.
-        BiasCorrection(self.dataset_directory, download=True, transform=self.transform)
-
-    def setup(self, stage=None):
-        # Define steps that should be done on
-        # every GPU, like splitting data, applying
-        # transform etc.
-        data = BiasCorrection(self.dataset_directory, transform=self.transform)
-
-        train_val_data, self.test_data = random_split(
-            data, [1 - self.test_size, self.test_size]
-        )
-        self.train_data, self.val_data = random_split(
-            train_val_data,
-            [
-                self.train_size / (1 - self.test_size),
-                self.val_size / (1 - self.test_size),
-            ],
-        )
-
-    def train_dataloader(self):
-        # Return DataLoader for Training Data here
-        return DataLoader(
-            self.train_data,
-            batch_size=self.batch_size,
-            shuffle=self.shuffle_train,
-            num_workers=self.num_workers,
-            pin_memory=self.pin_memory,
-        )
-
-    def val_dataloader(self):
-        # Return DataLoader for Validation Data here
-        return DataLoader(
-            self.val_data,
-            batch_size=self.batch_size,
-            shuffle=self.shuffle_val,
-            num_workers=self.num_workers,
-            pin_memory=self.pin_memory,
-        )
-
-    def test_dataloader(self):
-        # Return DataLoader for Testing Data here
-        return DataLoader(
-            self.test_data,
-            batch_size=self.batch_size,
-            shuffle=self.shuffle_test,
-            num_workers=self.num_workers,
-            pin_memory=self.pin_memory,
-        )
-
-    @staticmethod
-    def transform(df: pd.DataFrame) -> pd.DataFrame:
-        def date_to_datetime(df: pd.DataFrame) -> pd.DataFrame:
-            """Transform the string that denotes the date to the datetime format in pandas."""
-            # make copy of dataframe
-            df_temp = df.copy()
-            # add new column at the front where the date string is transformed to the datetime format
-            df_temp.insert(0, "DateTransformed", pd.to_datetime(df_temp["Date"]))
-            return df_temp
-
-        def add_year(df: pd.DataFrame) -> pd.DataFrame:
-            """Extract the year from the datetime cell and add it as a new column to the dataframe at the front."""
-            # make copy of dataframe
-            df_temp = df.copy()
-            # extract year and add new column at the front containing these numbers
-            df_temp.insert(0, "Year", df_temp["DateTransformed"].dt.year)
-            return df_temp
-
-        def add_month(df: pd.DataFrame) -> pd.DataFrame:
-            """Extract the month from the datetime cell and add it as a new column to the dataframe at the front."""
-            # make copy of dataframe
-            df_temp = df.copy()
-            # extract month and add new column at index 1 containing these numbers
-            df_temp.insert(1, "Month", df_temp["DateTransformed"].dt.month)
-            return df_temp
-
-        def add_day(df: pd.DataFrame) -> pd.DataFrame:
-            """Extract the day from the datetime cell and add it as a new column to the dataframe at the front."""
-            # make copy of dataframe
-            df_temp = df.copy()
-            # extract day and add new column at index 2 containing these numbers
-            df_temp.insert(2, "Day", df_temp["DateTransformed"].dt.day)
-            return df_temp
-
-        def normalize_columns_bias(df: pd.DataFrame) -> pd.DataFrame:
-            """Normalize the columns for the bias correction dataset. This is different from normalizing all the columns separately because the
-            upper and lower bounds for the output variables are assumed to be the same.
-            """
-            # copy the dataframe
-            temp_df = df.copy()
-            # normalize each column
-            for feature_name in df.columns:
-                # the output columns are normalized using the same upper and lower bound for more efficient check of the inequality
-                if feature_name == "Next_Tmax" or feature_name == "Next_Tmin":
-                    max_value = 38.9
-                    min_value = 11.3
-                # the input columns are normalized using their respective upper and lower bounds
-                else:
-                    max_value = df[feature_name].max()
-                    min_value = df[feature_name].min()
-                temp_df[feature_name] = (df[feature_name] - min_value) / (
-                    max_value - min_value
-                )
-            return temp_df
-
-        def sample_2500_examples(df: pd.DataFrame) -> pd.DataFrame:
-            """Sample 2500 examples from the dataframe without replacement."""
-            temp_df = df.copy()
-            sample_df = temp_df.sample(n=2500, replace=False, random_state=3, axis=0)
-            return sample_df
-
-        return (
-            # drop missing values
-            df.dropna(how="any")
-            # transform string date to datetime format
-            .pipe(date_to_datetime)
-            # add year as a single column
-            .pipe(add_year)
-            # add month as a single column
-            .pipe(add_month)
-            # add day as a single column
-            .pipe(add_day)
-            # remove original date string and the datetime format
-            .drop(["Date", "DateTransformed"], axis=1, inplace=False)
-            # convert all numbers to float32
-            .astype("float32")
-            # normalize columns
-            .pipe(normalize_columns_bias)
-            # sample 2500 examples out of the dataset
-            .pipe(sample_2500_examples)
-        )
-
 
 class FiniteIncome(Dataset):
-    """Finite Income Dataset.
-
-    Args:
-        root (str or ``pathlib.Path``): Root directory of dataset where ``BiasCorrection/Bias_correction_ucl.csv`` exists.
-        download (bool, optional): If True, downloads the dataset from the internet and
-            puts it in root directory. If dataset is already downloaded, it is not
-            downloaded again.
-        transform (callable, optional): A function/transform that takes in a Pandas DataFrame
-            and returns a transformed version.
-
-    Acknowledgement:
-        This class was developed with inspiration from the MNIST dataset class in torchvision.
-    """
 
     mirrors = [
        "https://www.kaggle.com/api/v1/datasets/download/grosvenpaul/",
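With `BCDataModule` and its built-in `transform` removed, 0.2.0 no longer bundles a Lightning wrapper for splitting and batching. The removed `setup` and `*_dataloader` methods can be approximated with plain `torch.utils.data` calls; here is a rough sketch assuming a `dataset` built as in the earlier example (the split fractions and loader options mirror the removed defaults, and the seeded generator is only added here for reproducibility, not part of the package):

import torch
from torch.utils.data import DataLoader, random_split

# Mirror of the removed BCDataModule.setup(): hold out the test fraction first,
# then split the remainder into train/val (removed defaults: 0.8/0.1/0.1).
generator = torch.Generator().manual_seed(0)
train_val, test_data = random_split(dataset, [0.9, 0.1], generator=generator)
train_data, val_data = random_split(
    train_val, [0.8 / 0.9, 0.1 / 0.9], generator=generator
)

# Counterparts of the removed train/val/test_dataloader methods.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, num_workers=4)
val_loader = DataLoader(val_data, batch_size=32, shuffle=False, num_workers=4)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False, num_workers=4)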
@@ -391,8 +120,8 @@ class FiniteIncome(Dataset):
     def __init__(
         self,
         root: Union[str, Path],
+        transform: Callable,
         download: bool = False,
-        transform: Optional[Callable] = None,
     ) -> None:
         super().__init__()
         self.root = root
@@ -403,21 +132,16 @@ class FiniteIncome(Dataset):
 
         if not self._check_exists():
             raise RuntimeError(
-                "Dataset not found. You can use download=True to download it"
+                "Dataset not found. You can use download=True to download it."
             )
 
         self.data_input, self.data_output = self._load_data()
 
     def _load_data(self):
 
-        if self.transform:
-            data: pd.DataFrame = pd.read_csv(
-                os.path.join(self.data_folder, "Family Income and Expenditure.csv")
-            ).pipe(self.transform)
-        else:
-            data: pd.DataFrame = pd.read_csv(
-                os.path.join(self.data_folder, "Family Income and Expenditure.csv")
-            ).pipe(self.add_input_output_family_income)
+        data: pd.DataFrame = pd.read_csv(
+            os.path.join(self.data_folder, "Family Income and Expenditure.csv")
+        ).pipe(self.transform)
 
         data_input = data["Input"].to_numpy(dtype=np.float32)
         data_output = data["Output"].to_numpy(dtype=np.float32)
@@ -445,7 +169,6 @@ class FiniteIncome(Dataset):
         )
 
     def download(self) -> None:
-        """Download the MNIST data if it doesn't exist already."""
 
         if self._check_exists():
             return
@@ -470,273 +193,3 @@ class FiniteIncome(Dataset):
         for mirror, err in zip(self.mirrors, errors):
             s += f"Tried {mirror}, got:\n{str(err)}\n"
         raise RuntimeError(s)
-
-    @staticmethod
-    def add_input_output_family_income(df: pd.DataFrame) -> pd.DataFrame:
-        """Add a multiindex denoting if the column is an input or output variable."""
-        # copy the dataframe
-        temp_df = df.copy()
-        # extract all the column names
-        column_names = temp_df.columns.tolist()
-        # the 2nd-9th columns correspond to output variables and all others to input variables. So make list of corresponding lengths of 'Input' and 'Output'
-        input_list_start = ["Input"]
-        input_list_end = ["Input"] * (len(column_names) - 9)
-        output_list = ["Output"] * 8
-        # concat both lists
-        input_output_list = input_list_start + output_list + input_list_end
-        # define multi index for attaching this 'Input' and 'Output' list with the column names already existing
-        multiindex_bias = pd.MultiIndex.from_arrays([input_output_list, column_names])
-        # transpose such that index can be adjusted to multi index
-        new_df = pd.DataFrame(df.transpose().to_numpy(), index=multiindex_bias)
-        # transpose back such that columns are the same as before except with different labels
-        return new_df.transpose()
-
-
-class FIDataModule(L.LightningDataModule):
-    """Finite Income dataset module."""
-
-    def __init__(
-        self,
-        dataset_directory: str = "./datasets",
-        batch_size: int = 32,
-        train_size: float = 0.8,
-        val_size: float = 0.1,
-        test_size: float = 0.1,
-        shuffle_train: bool = True,
-        shuffle_val: bool = False,
-        shuffle_test: bool = False,
-        num_workers: int = 4,
-        pin_memory: bool = False,
-    ) -> None:
-        super().__init__()
-        # Define required parameters here
-        self.batch_size = batch_size
-        self.dataset_directory = dataset_directory
-        self.train_size = train_size
-        self.val_size = val_size
-        self.test_size = test_size
-        self.shuffle_train = shuffle_train
-        self.shuffle_val = shuffle_val
-        self.shuffle_test = shuffle_test
-        self.num_workers = num_workers
-        self.pin_memory = pin_memory
-
-    def prepare_data(self):
-        # Define steps that should be done
-        # on only one GPU, like getting data.
-        # TODO downloading currently disabled since not compatible with api
-        # FiniteIncome(self.dataset_directory, download=True, transform=self.transform)
-        pass
-
-    def setup(self, stage=None):
-        # Define steps that should be done on
-        # every GPU, like splitting data, applying
-        # transform etc.
-        data = FiniteIncome(self.dataset_directory, transform=self.transform)
-
-        train_val_data, self.test_data = random_split(
-            data, [1 - self.test_size, self.test_size]
-        )
-        self.train_data, self.val_data = random_split(
-            train_val_data,
-            [
-                self.train_size / (1 - self.test_size),
-                self.val_size / (1 - self.test_size),
-            ],
-        )
-
-    def train_dataloader(self):
-        # Return DataLoader for Training Data here
-        return DataLoader(
-            self.train_data,
-            batch_size=self.batch_size,
-            shuffle=self.shuffle_train,
-            num_workers=self.num_workers,
-            pin_memory=self.pin_memory,
-        )
-
-    def val_dataloader(self):
-        # Return DataLoader for Validation Data here
-        return DataLoader(
-            self.val_data,
-            batch_size=self.batch_size,
-            shuffle=self.shuffle_val,
-            num_workers=self.num_workers,
-            pin_memory=self.pin_memory,
-        )
-
-    def test_dataloader(self):
-        # Return DataLoader for Testing Data here
-        return DataLoader(
-            self.test_data,
-            batch_size=self.batch_size,
-            shuffle=self.shuffle_test,
-            num_workers=self.num_workers,
-            pin_memory=self.pin_memory,
-        )
-
-    @staticmethod
-    def transform(df: pd.DataFrame) -> pd.DataFrame:
-        def normalize_columns_income(df: pd.DataFrame) -> pd.DataFrame:
-            """Normalize the columns for the Family Income dataframe. This can also be applied to other dataframes because this function normalizes
-            all columns individually."""
-            # copy the dataframe
-            temp_df = df.copy()
-            # normalize each column
-            for feature_name in df.columns:
-                max_value = df[feature_name].max()
-                min_value = df[feature_name].min()
-                temp_df[feature_name] = (df[feature_name] - min_value) / (
-                    max_value - min_value
-                )
-            return temp_df
-
-        def check_constraints_income(df: pd.DataFrame) -> pd.DataFrame:
-            """Check if all the constraints are satisfied for the dataframe and remove the examples that do not satisfy the constraint. This
-            function only works for the Family Income dataset and the constraints are that the household income is larger than all the expenses
-            and the food expense is larger than the sum of the other (more detailed) food expenses.
-            """
-            temp_df = df.copy()
-            # check that household income is larger than expenses in the output
-            input_array = temp_df["Input"].to_numpy()
-            income_array = np.add(
-                np.multiply(
-                    input_array[:, [0, 1]],
-                    np.subtract(
-                        np.asarray([11815988, 9234485]), np.asarray([11285, 0])
-                    ),
-                ),
-                np.asarray([11285, 0]),
-            )
-            expense_array = temp_df["Output"].to_numpy()
-            expense_array = np.add(
-                np.multiply(
-                    expense_array,
-                    np.subtract(
-                        np.asarray(
-                            [
-                                791848,
-                                437467,
-                                140992,
-                                74800,
-                                2188560,
-                                1049275,
-                                149940,
-                                731000,
-                            ]
-                        ),
-                        np.asarray([3704, 0, 0, 0, 1950, 0, 0, 0]),
-                    ),
-                ),
-                np.asarray([3704, 0, 0, 0, 1950, 0, 0, 0]),
-            )
-            expense_array_without_dup = expense_array[:, [0, 4, 5, 6, 7]]
-            sum_expenses = np.sum(expense_array_without_dup, axis=1)
-            total_income = np.sum(income_array, axis=1)
-            sanity_check_array = np.greater_equal(total_income, sum_expenses)
-            temp_df["Unimportant"] = sanity_check_array.tolist()
-            reduction = temp_df[temp_df.Unimportant]
-            drop_reduction = reduction.drop("Unimportant", axis=1)
-
-            # check that the food expense is larger than all the sub expenses
-            expense_reduced_array = drop_reduction["Output"].to_numpy()
-            expense_reduced_array = np.add(
-                np.multiply(
-                    expense_reduced_array,
-                    np.subtract(
-                        np.asarray(
-                            [
-                                791848,
-                                437467,
-                                140992,
-                                74800,
-                                2188560,
-                                1049275,
-                                149940,
-                                731000,
-                            ]
-                        ),
-                        np.asarray([3704, 0, 0, 0, 1950, 0, 0, 0]),
-                    ),
-                ),
-                np.asarray([3704, 0, 0, 0, 1950, 0, 0, 0]),
-            )
-            food_mul_expense_array = expense_reduced_array[:, [1, 2, 3]]
-            food_mul_expense_array_sum = np.sum(food_mul_expense_array, axis=1)
-            food_expense_array = expense_reduced_array[:, 0]
-            sanity_check_array = np.greater_equal(
-                food_expense_array, food_mul_expense_array_sum
-            )
-            drop_reduction["Unimportant"] = sanity_check_array.tolist()
-            new_reduction = drop_reduction[drop_reduction.Unimportant]
-            satisfied_constraints_df = new_reduction.drop("Unimportant", axis=1)
-
-            return satisfied_constraints_df
-
-        def sample_2500_examples(df: pd.DataFrame) -> pd.DataFrame:
-            """Sample 2500 examples from the dataframe without replacement."""
-            temp_df = df.copy()
-            sample_df = temp_df.sample(n=2500, replace=False, random_state=3, axis=0)
-            return sample_df
-
-        def add_input_output_family_income(df: pd.DataFrame) -> pd.DataFrame:
-            """Add a multiindex denoting if the column is an input or output variable."""
-            # copy the dataframe
-            temp_df = df.copy()
-            # extract all the column names
-            column_names = temp_df.columns.tolist()
-            # the 2nd-9th columns correspond to output variables and all others to input variables. So make list of corresponding lengths of 'Input' and 'Output'
-            input_list_start = ["Input"]
-            input_list_end = ["Input"] * (len(column_names) - 9)
-            output_list = ["Output"] * 8
-            # concat both lists
-            input_output_list = input_list_start + output_list + input_list_end
-            # define multi index for attaching this 'Input' and 'Output' list with the column names already existing
-            multiindex_bias = pd.MultiIndex.from_arrays(
-                [input_output_list, column_names]
-            )
-            # transpose such that index can be adjusted to multi index
-            new_df = pd.DataFrame(df.transpose().to_numpy(), index=multiindex_bias)
-            # transpose back such that columns are the same as before except with different labels
-            return new_df.transpose()
-
-        return (
-            # drop missing values
-            df.dropna(how="any")
-            # convert object to fitting dtype
-            .convert_dtypes()
-            # remove all strings (no other dtypes are present except for integers and floats)
-            .select_dtypes(exclude=["string"])
-            # transform all numbers to same dtype
-            .astype("float32")
-            # drop column with label Agricultural Household indicator because this is not really a numerical input but rather a categorical/classification
-            .drop(["Agricultural Household indicator"], axis=1, inplace=False)
-            # this column is dropped because it depends on Agricultural Household indicator
-            .drop(["Crop Farming and Gardening expenses"], axis=1, inplace=False)
-            # use 8 output variables and 24 input variables
-            .drop(
-                [
-                    "Total Rice Expenditure",
-                    "Total Fish and marine products Expenditure",
-                    "Fruit Expenditure",
-                    "Restaurant and hotels Expenditure",
-                    "Alcoholic Beverages Expenditure",
-                    "Tobacco Expenditure",
-                    "Clothing, Footwear and Other Wear Expenditure",
-                    "Imputed House Rental Value",
-                    "Transportation Expenditure",
-                    "Miscellaneous Goods and Services Expenditure",
-                    "Special Occasions Expenditure",
-                ],
-                axis=1,
-                inplace=False,
-            )
-            # add input and output labels to each column
-            .pipe(add_input_output_family_income)
-            # normalize all the columns
-            .pipe(normalize_columns_income)
-            # remove all datapoints that do not satisfy the constraints
-            .pipe(check_constraints_income)
-            # sample 2500 examples
-            .pipe(sample_2500_examples)
-        )