congrads 0.1.0-py3-none-any.whl → 1.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
congrads/datasets.py CHANGED
@@ -1,34 +1,80 @@
1
+ """
2
+ This module defines several PyTorch dataset classes for loading and
3
+ working with various datasets. Each dataset class extends the
4
+ `torch.utils.data.Dataset` class and provides functionality for
5
+ downloading, loading, and transforming specific datasets.
6
+
7
+ Classes:
8
+
9
+ - BiasCorrection: A dataset class for the Bias Correction dataset
10
+ focused on temperature forecast data.
11
+ - FamilyIncome: A dataset class for the Family Income and
12
+ Expenditure dataset.
13
+ - NoisySines: A dataset class that generates noisy sine wave
14
+ samples with added Gaussian noise.
15
+
16
+ Each dataset class provides methods for downloading the data
17
+ (if not already available), checking the integrity of the dataset, loading
18
+ the data from CSV files or generating synthetic data, and applying
19
+ transformations to the data.
20
+
21
+ Key Methods:
22
+
23
+ - `__init__`: Initializes the dataset by specifying the root directory,
24
+ transformation function, and optional download flag.
25
+ - `__getitem__`: Retrieves a specific data point given its index,
26
+ returning input-output pairs.
27
+ - `__len__`: Returns the total number of examples in the dataset.
28
+ - `download`: Downloads and extracts the dataset from
29
+ the specified mirrors.
30
+ - `_load_data`: Loads the dataset from CSV files and
31
+ applies transformations.
32
+ - `_check_exists`: Checks if the dataset is already
33
+ downloaded and verified.
34
+
35
+ The CSV-backed dataset classes let the user apply custom transformations
36
+ through the `transform` argument for pre-processing, and they can
37
+ download the dataset if it's not already present on
38
+ the local disk.
39
+ """
40
+
1
41
  import os
42
+ from pathlib import Path
43
+ from typing import Callable, Union
2
44
  from urllib.error import URLError
45
+
3
46
  import numpy as np
4
- from pathlib import Path
5
- from typing import Callable, Optional, Union
6
47
  import pandas as pd
7
- import lightning as L
8
- from torch.utils.data import Dataset, random_split, DataLoader
9
48
  import torch
10
-
11
- from torchvision.datasets.utils import check_integrity, download_and_extract_archive
49
+ from torch.utils.data import Dataset
50
+ from torchvision.datasets.utils import (
51
+ check_integrity,
52
+ download_and_extract_archive,
53
+ )
12
54
 
13
55
 
14
56
  class BiasCorrection(Dataset):
15
57
  """
16
- Bias Correction Dataset.
58
+ A dataset class for accessing the Bias Correction dataset.
17
59
 
18
- This class provides access to a dataset used for bias correction of numerical prediction models,
19
- specifically for temperature forecasts. The dataset is stored in CSV format and can be optionally
20
- transformed using a provided function. The class includes methods for downloading, loading, and
21
- accessing the dataset, as well as applying transformations.
60
+ This class extends the `Dataset` class and provides functionality for
61
+ downloading, loading, and transforming the Bias Correction dataset.
62
+ The dataset is focused on temperature forecast data and is made available
63
+ for use with PyTorch. If `download` is set to True, the dataset will be
64
+ downloaded if it is not already available. The data is then loaded,
65
+ and a transformation function is applied to it.
22
66
 
23
67
  Args:
24
- root (str or pathlib.Path): Root directory of the dataset where the 'BiasCorrection/Bias_correction_ucl.csv' file is located.
25
- download (bool, optional): If True, the dataset will be downloaded from the internet and saved to the root directory.
26
- If the dataset already exists, it will not be downloaded again. Default is False.
27
- transform (callable, optional): A function/transform that takes a Pandas DataFrame and returns a transformed version.
28
- This allows for data preprocessing before it is loaded. Default is None.
29
-
30
- Acknowledgement:
31
- This class was developed with inspiration from the MNIST dataset class in torchvision.
68
+ root (Union[str, Path]): The root directory where the dataset
69
+ will be stored or loaded from.
70
+ transform (Callable): A function to transform the dataset
71
+ (e.g., preprocessing).
72
+ download (bool, optional): Whether to download the dataset if it's
73
+ not already present. Defaults to False.
74
+
75
+ Raises:
76
+ RuntimeError: If the dataset is not found and `download`
77
+ is not set to True or if all mirrors fail to provide the dataset.
32
78
  """
33
79
 
34
80
  mirrors = [
@@ -37,6 +83,7 @@ class BiasCorrection(Dataset):
37
83
 
38
84
  resources = [
39
85
  (
86
+ # pylint: disable-next=line-too-long
40
87
  "bias+correction+of+numerical+prediction+model+temperature+forecast.zip",
41
88
  "3deee56d461a2686887c4ae38fe3ccf3",
42
89
  ),
@@ -45,16 +92,11 @@ class BiasCorrection(Dataset):
45
92
  def __init__(
46
93
  self,
47
94
  root: Union[str, Path],
95
+ transform: Callable,
48
96
  download: bool = False,
49
- transform: Optional[Callable] = None,
50
97
  ) -> None:
51
98
  """
52
- Initializes the BiasCorrection dataset, optionally downloading it if necessary.
53
-
54
- Args:
55
- root (Union[str, Path]): The root directory where the dataset will be stored or accessed.
56
- download (bool, optional): If True, downloads the dataset from the provided mirrors. Default is False.
57
- transform (Optional[Callable], optional): A function to transform the data before loading. Default is None.
99
+ Constructor method to initialize the dataset.
58
100
  """
59
101
 
60
102
  super().__init__()
@@ -73,22 +115,20 @@ class BiasCorrection(Dataset):
73
115
 
74
116
  def _load_data(self):
75
117
  """
76
- Loads the dataset from the CSV file, applies transformations (if provided), and separates the data into input and output variables.
118
+ Loads the dataset from the CSV file and applies the transformation.
119
+
120
+ The data is read from the `Bias_correction_ucl.csv` file, and the
121
+ transformation function is applied to it.
122
+ The input and output data are separated and returned as numpy arrays.
77
123
 
78
124
  Returns:
79
- Tuple: A tuple containing two numpy arrays: `data_input` (input data) and `data_output` (output data).
125
+ Tuple[numpy.ndarray, numpy.ndarray]: A tuple containing the input
126
+ and output data as numpy arrays.
80
127
  """
81
128
 
82
- if self.transform:
83
- data: pd.DataFrame = (
84
- pd.read_csv(os.path.join(self.data_folder, "Bias_correction_ucl.csv"))
85
- .pipe(self.transform)
86
- .pipe(self.add_input_output_temperature)
87
- )
88
- else:
89
- data: pd.DataFrame = pd.read_csv(
90
- os.path.join(self.data_folder, "Bias_correction_ucl.csv")
91
- ).pipe(self.add_input_output_temperature)
129
+ data: pd.DataFrame = pd.read_csv(
130
+ os.path.join(self.data_folder, "Bias_correction_ucl.csv")
131
+ ).pipe(self.transform)
92
132
 
93
133
  data_input = data["Input"].to_numpy(dtype=np.float32)
94
134
  data_output = data["Output"].to_numpy(dtype=np.float32)
@@ -100,20 +140,22 @@ class BiasCorrection(Dataset):
100
140
  Returns the number of examples in the dataset.
101
141
 
102
142
  Returns:
103
- int: The total number of examples in the dataset (i.e., the number of rows in the input data).
143
+ int: The number of examples in the dataset
144
+ (i.e., the number of rows in the input data).
104
145
  """
105
-
146
+
106
147
  return self.data_input.shape[0]
107
148
 
108
149
  def __getitem__(self, idx):
109
150
  """
110
- Retrieves a single example and its corresponding target from the dataset.
151
+ Returns the input-output pair for a given index.
111
152
 
112
153
  Args:
113
154
  idx (int): The index of the example to retrieve.
114
155
 
115
156
  Returns:
116
- Tuple: A tuple containing two tensors: the input example and the target output.
157
+ Tuple[torch.Tensor, torch.Tensor]: The input-output pair
158
+ as PyTorch tensors.
117
159
  """
118
160
 
119
161
  example = self.data_input[idx, :]
@@ -128,17 +170,21 @@ class BiasCorrection(Dataset):
128
170
  Returns the path to the folder where the dataset is stored.
129
171
 
130
172
  Returns:
131
- str: The path to the dataset folder within the root directory.
173
+ str: The path to the dataset folder.
132
174
  """
133
175
 
134
176
  return os.path.join(self.root, self.__class__.__name__)
135
177
 
136
178
  def _check_exists(self) -> bool:
137
179
  """
138
- Checks if the dataset files exist and their integrity is verified.
180
+ Checks if the dataset is already downloaded and verified.
181
+
182
+ This method checks that all required files exist and
183
+ their integrity is validated via MD5 checksums.
139
184
 
140
185
  Returns:
141
- bool: True if the dataset exists and is valid, otherwise False.
186
+ bool: True if all resources exist and their
187
+ integrity is valid, False otherwise.
142
188
  """
143
189
 
144
190
  return all(
@@ -148,10 +194,14 @@ class BiasCorrection(Dataset):
148
194
 
149
195
  def download(self) -> None:
150
196
  """
151
- Downloads the dataset from the provided mirrors and extracts it.
197
+ Downloads and extracts the dataset.
198
+
199
+ This method attempts to download the dataset from the mirrors and
200
+ extract it into the appropriate folder. If any error occurs during
201
+ downloading, it will try each mirror in sequence.
152
202
 
153
- The method ensures the dataset is downloaded and extracted to the appropriate folder.
154
- If the dataset is already present, it will not be downloaded again.
203
+ Raises:
204
+ RuntimeError: If all mirrors fail to provide the dataset.
155
205
  """
156
206
 
157
207
  if self._check_exists():
@@ -166,7 +216,10 @@ class BiasCorrection(Dataset):
166
216
  url = f"{mirror}{filename}"
167
217
  try:
168
218
  download_and_extract_archive(
169
- url, download_root=self.data_folder, filename=filename, md5=md5
219
+ url,
220
+ download_root=self.data_folder,
221
+ filename=filename,
222
+ md5=md5,
170
223
  )
171
224
  except URLError as e:
172
225
  errors.append(e)
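
Editor's note: in 1.0.1 the `transform` argument is required and `_load_data` pipes the raw CSV straight through it, expecting the result to expose an "Input"/"Output" column MultiIndex (the built-in labelling helper and the Lightning data module are removed in the following hunk). A minimal sketch of a caller-supplied transform and dataset construction under that assumption; the column handling below is illustrative, not the package's prescribed preprocessing:

    import pandas as pd
    from torch.utils.data import DataLoader
    from congrads.datasets import BiasCorrection

    def bias_transform(df: pd.DataFrame) -> pd.DataFrame:
        # Drop rows with missing values and the non-numeric date column,
        # then label the last two columns (Next_Tmax, Next_Tmin) as outputs.
        df = df.dropna(how="any").drop(columns=["Date"]).astype("float32")
        labels = ["Input"] * (df.shape[1] - 2) + ["Output"] * 2
        df.columns = pd.MultiIndex.from_arrays([labels, df.columns])
        return df

    dataset = BiasCorrection("./datasets", bias_transform, download=True)
    loader = DataLoader(dataset, batch_size=32, shuffle=True)

Normalisation and subsampling, which the removed `BCDataModule.transform` handled in 0.1.0, would also live inside `bias_transform` if needed.
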
@@ -178,212 +231,41 @@ class BiasCorrection(Dataset):
178
231
  s += f"Tried {mirror}, got:\n{str(err)}\n"
179
232
  raise RuntimeError(s)
180
233
 
181
- @staticmethod
182
- def add_input_output_temperature(df: pd.DataFrame) -> pd.DataFrame:
183
- """Add a multiindex denoting if the column is an input or output variable."""
184
- # copy the dataframe
185
- temp_df = df.copy()
186
- # extract all the column names
187
- column_names = temp_df.columns.tolist()
188
- # only the last 2 columns are output variables, all others are input variables. So make list of corresponding lengths of 'Input' and 'Output'
189
- input_list = ["Input"] * (len(column_names) - 2)
190
- output_list = ["Output"] * 2
191
- # concat both lists
192
- input_output_list = input_list + output_list
193
- # define multi index for attaching this 'Input' and 'Output' list with the column names already existing
194
- multiindex_bias = pd.MultiIndex.from_arrays([input_output_list, column_names])
195
- # transpose such that index can be adjusted to multi index
196
- new_df = pd.DataFrame(df.transpose().to_numpy(), index=multiindex_bias)
197
- # transpose back such that columns are the same as before except with different labels
198
- return new_df.transpose()
199
-
200
-
201
- class BCDataModule(L.LightningDataModule):
202
- """Bias Correction dataset module."""
203
-
204
- def __init__(
205
- self,
206
- dataset_directory: str = "./datasets",
207
- batch_size: int = 32,
208
- train_size: float = 0.8,
209
- val_size: float = 0.1,
210
- test_size: float = 0.1,
211
- shuffle_train: bool = True,
212
- shuffle_val: bool = False,
213
- shuffle_test: bool = False,
214
- num_workers: int = 4,
215
- pin_memory: bool = False,
216
- ) -> None:
217
- super().__init__()
218
- # Define required parameters here
219
- self.batch_size = batch_size
220
- self.dataset_directory = dataset_directory
221
- self.train_size = train_size
222
- self.val_size = val_size
223
- self.test_size = test_size
224
- self.shuffle_train = shuffle_train
225
- self.shuffle_val = shuffle_val
226
- self.shuffle_test = shuffle_test
227
- self.num_workers = num_workers
228
- self.pin_memory = pin_memory
229
-
230
- def prepare_data(self):
231
- # Define steps that should be done
232
- # on only one GPU, like getting data.
233
- BiasCorrection(self.dataset_directory, download=True, transform=self.transform)
234
-
235
- def setup(self, stage=None):
236
- # Define steps that should be done on
237
- # every GPU, like splitting data, applying
238
- # transform etc.
239
- data = BiasCorrection(self.dataset_directory, transform=self.transform)
240
-
241
- train_val_data, self.test_data = random_split(
242
- data, [1 - self.test_size, self.test_size]
243
- )
244
- self.train_data, self.val_data = random_split(
245
- train_val_data,
246
- [
247
- self.train_size / (1 - self.test_size),
248
- self.val_size / (1 - self.test_size),
249
- ],
250
- )
251
-
252
- def train_dataloader(self):
253
- # Return DataLoader for Training Data here
254
- return DataLoader(
255
- self.train_data,
256
- batch_size=self.batch_size,
257
- shuffle=self.shuffle_train,
258
- num_workers=self.num_workers,
259
- pin_memory=self.pin_memory,
260
- )
261
-
262
- def val_dataloader(self):
263
- # Return DataLoader for Validation Data here
264
- return DataLoader(
265
- self.val_data,
266
- batch_size=self.batch_size,
267
- shuffle=self.shuffle_val,
268
- num_workers=self.num_workers,
269
- pin_memory=self.pin_memory,
270
- )
271
-
272
- def test_dataloader(self):
273
- # Return DataLoader for Testing Data here
274
- return DataLoader(
275
- self.test_data,
276
- batch_size=self.batch_size,
277
- shuffle=self.shuffle_test,
278
- num_workers=self.num_workers,
279
- pin_memory=self.pin_memory,
280
- )
281
-
282
- @staticmethod
283
- def transform(df: pd.DataFrame) -> pd.DataFrame:
284
- def date_to_datetime(df: pd.DataFrame) -> pd.DataFrame:
285
- """Transform the string that denotes the date to the datetime format in pandas."""
286
- # make copy of dataframe
287
- df_temp = df.copy()
288
- # add new column at the front where the date string is transformed to the datetime format
289
- df_temp.insert(0, "DateTransformed", pd.to_datetime(df_temp["Date"]))
290
- return df_temp
291
-
292
- def add_year(df: pd.DataFrame) -> pd.DataFrame:
293
- """Extract the year from the datetime cell and add it as a new column to the dataframe at the front."""
294
- # make copy of dataframe
295
- df_temp = df.copy()
296
- # extract year and add new column at the front containing these numbers
297
- df_temp.insert(0, "Year", df_temp["DateTransformed"].dt.year)
298
- return df_temp
299
-
300
- def add_month(df: pd.DataFrame) -> pd.DataFrame:
301
- """Extract the month from the datetime cell and add it as a new column to the dataframe at the front."""
302
- # make copy of dataframe
303
- df_temp = df.copy()
304
- # extract month and add new column at index 1 containing these numbers
305
- df_temp.insert(1, "Month", df_temp["DateTransformed"].dt.month)
306
- return df_temp
307
-
308
- def add_day(df: pd.DataFrame) -> pd.DataFrame:
309
- """Extract the day from the datetime cell and add it as a new column to the dataframe at the front."""
310
- # make copy of dataframe
311
- df_temp = df.copy()
312
- # extract day and add new column at index 2 containing these numbers
313
- df_temp.insert(2, "Day", df_temp["DateTransformed"].dt.day)
314
- return df_temp
315
-
316
- def normalize_columns_bias(df: pd.DataFrame) -> pd.DataFrame:
317
- """Normalize the columns for the bias correction dataset. This is different from normalizing all the columns separately because the
318
- upper and lower bounds for the output variables are assumed to be the same.
319
- """
320
- # copy the dataframe
321
- temp_df = df.copy()
322
- # normalize each column
323
- for feature_name in df.columns:
324
- # the output columns are normalized using the same upper and lower bound for more efficient check of the inequality
325
- if feature_name == "Next_Tmax" or feature_name == "Next_Tmin":
326
- max_value = 38.9
327
- min_value = 11.3
328
- # the input columns are normalized using their respective upper and lower bounds
329
- else:
330
- max_value = df[feature_name].max()
331
- min_value = df[feature_name].min()
332
- temp_df[feature_name] = (df[feature_name] - min_value) / (
333
- max_value - min_value
334
- )
335
- return temp_df
336
-
337
- def sample_2500_examples(df: pd.DataFrame) -> pd.DataFrame:
338
- """Sample 2500 examples from the dataframe without replacement."""
339
- temp_df = df.copy()
340
- sample_df = temp_df.sample(n=2500, replace=False, random_state=3, axis=0)
341
- return sample_df
342
-
343
- return (
344
- # drop missing values
345
- df.dropna(how="any")
346
- # transform string date to datetime format
347
- .pipe(date_to_datetime)
348
- # add year as a single column
349
- .pipe(add_year)
350
- # add month as a single column
351
- .pipe(add_month)
352
- # add day as a single column
353
- .pipe(add_day)
354
- # remove original date string and the datetime format
355
- .drop(["Date", "DateTransformed"], axis=1, inplace=False)
356
- # convert all numbers to float32
357
- .astype("float32")
358
- # normalize columns
359
- .pipe(normalize_columns_bias)
360
- # sample 2500 examples out of the dataset
361
- .pipe(sample_2500_examples)
362
- )
363
234
 
235
+ class FamilyIncome(Dataset):
236
+ """
237
+ A dataset class for accessing the Family Income and Expenditure dataset.
364
238
 
365
- class FiniteIncome(Dataset):
366
- """Finite Income Dataset.
239
+ This class extends the `Dataset` class and provides functionality for
240
+ downloading, loading, and transforming the Family Income and
241
+ Expenditure dataset for use in PyTorch-based projects. If `download`
242
+ is set to True, the dataset will be downloaded if it is not already
243
+ available. The data is then loaded, and a user-defined transformation
244
+ function is applied to it.
367
247
 
368
248
  Args:
369
- root (str or ``pathlib.Path``): Root directory of dataset where ``BiasCorrection/Bias_correction_ucl.csv`` exists.
370
- download (bool, optional): If True, downloads the dataset from the internet and
371
- puts it in root directory. If dataset is already downloaded, it is not
372
- downloaded again.
373
- transform (callable, optional): A function/transform that takes in a Pandas DataFrame
374
- and returns a transformed version.
375
-
376
- Acknowledgement:
377
- This class was developed with inspiration from the MNIST dataset class in torchvision.
249
+ root (Union[str, Path]): The root directory where the dataset will
250
+ be stored or loaded from.
251
+ transform (Callable): A function to transform the dataset
252
+ (e.g., preprocessing).
253
+ download (bool, optional): Whether to download the dataset if it's
254
+ not already present. Defaults to False.
255
+
256
+ Raises:
257
+ RuntimeError: If the dataset is not found and `download`
258
+ is not set to True or if all mirrors fail to provide the dataset.
378
259
  """
379
260
 
380
261
  mirrors = [
381
- "https://www.kaggle.com/api/v1/datasets/download/grosvenpaul/",
262
+ # pylint: disable-next=line-too-long
263
+ "https://www.kaggle.com/api/v1/datasets/download/grosvenpaul/family-income-and-expenditure",
382
264
  ]
383
265
 
384
266
  resources = [
385
267
  (
386
- "family-income-and-expenditure",
268
+ "archive.zip",
387
269
  "7d74bc7facc3d7c07c4df1c1c6ac563e",
388
270
  ),
389
271
  ]
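
Editor's note: the Kaggle mirror above serves the archive as `archive.zip`, but the API endpoint generally requires authentication (the 0.1.0 `FIDataModule.prepare_data`, removed further below, had downloading disabled for that reason). A hedged sketch of using `FamilyIncome` with a manually prepared folder and a caller-supplied transform; the Input/Output labelling mirrors the removed `add_input_output_family_income` helper and is an assumption, not the package's documented recipe:

    import pandas as pd
    from congrads.datasets import FamilyIncome

    def income_transform(df: pd.DataFrame) -> pd.DataFrame:
        # Keep numeric columns only, then label the first column (household
        # income) as Input, the next eight expenditure columns as Output and
        # the remainder as Input -- the convention the removed helper used.
        df = df.dropna(how="any").select_dtypes(include="number").astype("float32")
        labels = ["Input"] + ["Output"] * 8 + ["Input"] * (df.shape[1] - 9)
        df.columns = pd.MultiIndex.from_arrays([labels, df.columns])
        return df

    # Assumes ./datasets/FamilyIncome/ already contains the Kaggle
    # archive.zip plus the extracted "Family Income and Expenditure.csv",
    # since _check_exists() verifies the archive's MD5 before loading.
    dataset = FamilyIncome("./datasets", income_transform, download=False)
    example, target = dataset[0]
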
@@ -391,9 +273,13 @@ class FiniteIncome(Dataset):
391
273
  def __init__(
392
274
  self,
393
275
  root: Union[str, Path],
276
+ transform: Callable,
394
277
  download: bool = False,
395
- transform: Optional[Callable] = None,
396
278
  ) -> None:
279
+ """
280
+ Constructor method to initialize the dataset.
281
+ """
282
+
397
283
  super().__init__()
398
284
  self.root = root
399
285
  self.transform = transform
@@ -403,21 +289,28 @@ class FiniteIncome(Dataset):
403
289
 
404
290
  if not self._check_exists():
405
291
  raise RuntimeError(
406
- "Dataset not found. You can use download=True to download it"
292
+ "Dataset not found. You can use download=True to download it."
407
293
  )
408
294
 
409
295
  self.data_input, self.data_output = self._load_data()
410
296
 
411
297
  def _load_data(self):
298
+ """
299
+ Loads the Family Income and Expenditure dataset from the CSV file
300
+ and applies the transformation.
412
301
 
413
- if self.transform:
414
- data: pd.DataFrame = pd.read_csv(
415
- os.path.join(self.data_folder, "Family Income and Expenditure.csv")
416
- ).pipe(self.transform)
417
- else:
418
- data: pd.DataFrame = pd.read_csv(
419
- os.path.join(self.data_folder, "Family Income and Expenditure.csv")
420
- ).pipe(self.add_input_output_family_income)
302
+ The data is read from the `Family Income and Expenditure.csv` file,
303
+ and the transformation function is applied to it. The input and
304
+ output data are separated and returned as numpy arrays.
305
+
306
+ Returns:
307
+ Tuple[numpy.ndarray, numpy.ndarray]: A tuple containing the input
308
+ and output data as numpy arrays.
309
+ """
310
+
311
+ data: pd.DataFrame = pd.read_csv(
312
+ os.path.join(self.data_folder, "Family Income and Expenditure.csv")
313
+ ).pipe(self.transform)
421
314
 
422
315
  data_input = data["Input"].to_numpy(dtype=np.float32)
423
316
  data_output = data["Output"].to_numpy(dtype=np.float32)
@@ -425,9 +318,28 @@ class FiniteIncome(Dataset):
425
318
  return data_input, data_output
426
319
 
427
320
  def __len__(self):
321
+ """
322
+ Returns the number of examples in the dataset.
323
+
324
+ Returns:
325
+ int: The number of examples in the dataset
326
+ (i.e., the number of rows in the input data).
327
+ """
328
+
428
329
  return self.data_input.shape[0]
429
330
 
430
331
  def __getitem__(self, idx):
332
+ """
333
+ Returns the input-output pair for a given index.
334
+
335
+ Args:
336
+ idx (int): The index of the example to retrieve.
337
+
338
+ Returns:
339
+ Tuple[torch.Tensor, torch.Tensor]: The input-output pair
340
+ as PyTorch tensors.
341
+ """
342
+
431
343
  example = self.data_input[idx, :]
432
344
  target = self.data_output[idx, :]
433
345
  example = torch.tensor(example)
@@ -436,16 +348,43 @@ class FiniteIncome(Dataset):
436
348
 
437
349
  @property
438
350
  def data_folder(self) -> str:
351
+ """
352
+ Returns the path to the folder where the dataset is stored.
353
+
354
+ Returns:
355
+ str: The path to the dataset folder.
356
+ """
357
+
439
358
  return os.path.join(self.root, self.__class__.__name__)
440
359
 
441
360
  def _check_exists(self) -> bool:
361
+ """
362
+ Checks if the dataset is already downloaded and verified.
363
+
364
+ This method checks that all required files exist and
365
+ their integrity is validated via MD5 checksums.
366
+
367
+ Returns:
368
+ bool: True if all resources exist and their
369
+ integrity is valid, False otherwise.
370
+ """
371
+
442
372
  return all(
443
373
  check_integrity(os.path.join(self.data_folder, file_path), checksum)
444
374
  for file_path, checksum in self.resources
445
375
  )
446
376
 
447
377
  def download(self) -> None:
448
- """Download the MNIST data if it doesn't exist already."""
378
+ """
379
+ Downloads and extracts the dataset.
380
+
381
+ This method attempts to download the dataset from the mirrors
382
+ and extract it into the appropriate folder. If any error occurs
383
+ during downloading, it will try each mirror in sequence.
384
+
385
+ Raises:
386
+ RuntimeError: If all mirrors fail to provide the dataset.
387
+ """
449
388
 
450
389
  if self._check_exists():
451
390
  return
@@ -456,10 +395,13 @@ class FiniteIncome(Dataset):
456
395
  for filename, md5 in self.resources:
457
396
  errors = []
458
397
  for mirror in self.mirrors:
459
- url = f"{mirror}{filename}"
398
+ url = f"{mirror}"
460
399
  try:
461
400
  download_and_extract_archive(
462
- url, download_root=self.data_folder, filename=filename, md5=md5
401
+ url,
402
+ download_root=self.data_folder,
403
+ filename=filename,
404
+ md5=md5,
463
405
  )
464
406
  except URLError as e:
465
407
  errors.append(e)
@@ -471,272 +413,87 @@ class FiniteIncome(Dataset):
471
413
  s += f"Tried {mirror}, got:\n{str(err)}\n"
472
414
  raise RuntimeError(s)
473
415
 
474
- @staticmethod
475
- def add_input_output_family_income(df: pd.DataFrame) -> pd.DataFrame:
476
- """Add a multiindex denoting if the column is an input or output variable."""
477
- # copy the dataframe
478
- temp_df = df.copy()
479
- # extract all the column names
480
- column_names = temp_df.columns.tolist()
481
- # the 2nd-9th columns correspond to output variables and all others to input variables. So make list of corresponding lengths of 'Input' and 'Output'
482
- input_list_start = ["Input"]
483
- input_list_end = ["Input"] * (len(column_names) - 9)
484
- output_list = ["Output"] * 8
485
- # concat both lists
486
- input_output_list = input_list_start + output_list + input_list_end
487
- # define multi index for attaching this 'Input' and 'Output' list with the column names already existing
488
- multiindex_bias = pd.MultiIndex.from_arrays([input_output_list, column_names])
489
- # transpose such that index can be adjusted to multi index
490
- new_df = pd.DataFrame(df.transpose().to_numpy(), index=multiindex_bias)
491
- # transpose back such that columns are the same as before except with different labels
492
- return new_df.transpose()
493
-
494
-
495
- class FIDataModule(L.LightningDataModule):
496
- """Finite Income dataset module."""
416
+
417
+ class NoisySines(Dataset):
418
+ """
419
+ A PyTorch dataset generating samples from a causal
420
+ sine and cosine pair with added Gaussian noise.
421
+
422
+ Args:
423
+ length (int): Number of data points in the dataset.
424
+ amplitude (float): Amplitude of the sine wave.
425
+ frequency (float): Frequency of the sine wave in Hz.
426
+ noise_std (float): Standard deviation of the Gaussian noise.
427
+ bias (float): Offset from zero.
+ random_seed (int): Seed for the NumPy noise generator.
428
+
429
+ The signal equals `bias` for the first half of the time axis and follows
430
+ sine and cosine waves afterwards, with Gaussian noise added to all points.
431
+ """
497
432
 
498
433
  def __init__(
499
434
  self,
500
- dataset_directory: str = "./datasets",
501
- batch_size: int = 32,
502
- train_size: float = 0.8,
503
- val_size: float = 0.1,
504
- test_size: float = 0.1,
505
- shuffle_train: bool = True,
506
- shuffle_val: bool = False,
507
- shuffle_test: bool = False,
508
- num_workers: int = 4,
509
- pin_memory: bool = False,
510
- ) -> None:
511
- super().__init__()
512
- # Define required parameters here
513
- self.batch_size = batch_size
514
- self.dataset_directory = dataset_directory
515
- self.train_size = train_size
516
- self.val_size = val_size
517
- self.test_size = test_size
518
- self.shuffle_train = shuffle_train
519
- self.shuffle_val = shuffle_val
520
- self.shuffle_test = shuffle_test
521
- self.num_workers = num_workers
522
- self.pin_memory = pin_memory
523
-
524
- def prepare_data(self):
525
- # Define steps that should be done
526
- # on only one GPU, like getting data.
527
- # TODO downloading currently disabled since not compatible with api
528
- # FiniteIncome(self.dataset_directory, download=True, transform=self.transform)
529
- pass
530
-
531
- def setup(self, stage=None):
532
- # Define steps that should be done on
533
- # every GPU, like splitting data, applying
534
- # transform etc.
535
- data = FiniteIncome(self.dataset_directory, transform=self.transform)
536
-
537
- train_val_data, self.test_data = random_split(
538
- data, [1 - self.test_size, self.test_size]
539
- )
540
- self.train_data, self.val_data = random_split(
541
- train_val_data,
542
- [
543
- self.train_size / (1 - self.test_size),
544
- self.val_size / (1 - self.test_size),
545
- ],
546
- )
435
+ length,
436
+ amplitude=1,
437
+ frequency=10.0,
438
+ noise_std=0.05,
439
+ bias=0,
440
+ random_seed=42,
441
+ ):
442
+ """
443
+ Initializes the NoisySines dataset.
444
+ """
445
+ self.length = length
446
+ self.amplitude = amplitude
447
+ self.frequency = frequency
448
+ self.noise_std = noise_std
449
+ self.bias = bias
450
+ self.random_seed = random_seed
547
451
 
548
- def train_dataloader(self):
549
- # Return DataLoader for Training Data here
550
- return DataLoader(
551
- self.train_data,
552
- batch_size=self.batch_size,
553
- shuffle=self.shuffle_train,
554
- num_workers=self.num_workers,
555
- pin_memory=self.pin_memory,
556
- )
452
+ np.random.seed(self.random_seed)
453
+ self.time = np.linspace(0, 1, length)
454
+ self.noise = np.random.normal(0, self.noise_std, length)
557
455
 
558
- def val_dataloader(self):
559
- # Return DataLoader for Validation Data here
560
- return DataLoader(
561
- self.val_data,
562
- batch_size=self.batch_size,
563
- shuffle=self.shuffle_val,
564
- num_workers=self.num_workers,
565
- pin_memory=self.pin_memory,
566
- )
456
+ def __getitem__(self, idx):
457
+ """
458
+ Returns the time value and the noisy sine and cosine values for a given index.
567
459
 
568
- def test_dataloader(self):
569
- # Return DataLoader for Testing Data here
570
- return DataLoader(
571
- self.test_data,
572
- batch_size=self.batch_size,
573
- shuffle=self.shuffle_test,
574
- num_workers=self.num_workers,
575
- pin_memory=self.pin_memory,
576
- )
460
+ Args:
461
+ idx (int): Index of the data point to retrieve.
577
462
 
578
- @staticmethod
579
- def transform(df: pd.DataFrame) -> pd.DataFrame:
580
- def normalize_columns_income(df: pd.DataFrame) -> pd.DataFrame:
581
- """Normalize the columns for the Family Income dataframe. This can also be applied to other dataframes because this function normalizes
582
- all columns individually."""
583
- # copy the dataframe
584
- temp_df = df.copy()
585
- # normalize each column
586
- for feature_name in df.columns:
587
- max_value = df[feature_name].max()
588
- min_value = df[feature_name].min()
589
- temp_df[feature_name] = (df[feature_name] - min_value) / (
590
- max_value - min_value
591
- )
592
- return temp_df
593
-
594
- def check_constraints_income(df: pd.DataFrame) -> pd.DataFrame:
595
- """Check if all the constraints are satisfied for the dataframe and remove the examples that do not satisfy the constraint. This
596
- function only works for the Family Income dataset and the constraints are that the household income is larger than all the expenses
597
- and the food expense is larger than the sum of the other (more detailed) food expenses.
598
- """
599
- temp_df = df.copy()
600
- # check that household income is larger than expenses in the output
601
- input_array = temp_df["Input"].to_numpy()
602
- income_array = np.add(
603
- np.multiply(
604
- input_array[:, [0, 1]],
605
- np.subtract(
606
- np.asarray([11815988, 9234485]), np.asarray([11285, 0])
607
- ),
608
- ),
609
- np.asarray([11285, 0]),
610
- )
611
- expense_array = temp_df["Output"].to_numpy()
612
- expense_array = np.add(
613
- np.multiply(
614
- expense_array,
615
- np.subtract(
616
- np.asarray(
617
- [
618
- 791848,
619
- 437467,
620
- 140992,
621
- 74800,
622
- 2188560,
623
- 1049275,
624
- 149940,
625
- 731000,
626
- ]
627
- ),
628
- np.asarray([3704, 0, 0, 0, 1950, 0, 0, 0]),
629
- ),
630
- ),
631
- np.asarray([3704, 0, 0, 0, 1950, 0, 0, 0]),
632
- )
633
- expense_array_without_dup = expense_array[:, [0, 4, 5, 6, 7]]
634
- sum_expenses = np.sum(expense_array_without_dup, axis=1)
635
- total_income = np.sum(income_array, axis=1)
636
- sanity_check_array = np.greater_equal(total_income, sum_expenses)
637
- temp_df["Unimportant"] = sanity_check_array.tolist()
638
- reduction = temp_df[temp_df.Unimportant]
639
- drop_reduction = reduction.drop("Unimportant", axis=1)
640
-
641
- # check that the food expense is larger than all the sub expenses
642
- expense_reduced_array = drop_reduction["Output"].to_numpy()
643
- expense_reduced_array = np.add(
644
- np.multiply(
645
- expense_reduced_array,
646
- np.subtract(
647
- np.asarray(
648
- [
649
- 791848,
650
- 437467,
651
- 140992,
652
- 74800,
653
- 2188560,
654
- 1049275,
655
- 149940,
656
- 731000,
657
- ]
658
- ),
659
- np.asarray([3704, 0, 0, 0, 1950, 0, 0, 0]),
660
- ),
661
- ),
662
- np.asarray([3704, 0, 0, 0, 1950, 0, 0, 0]),
663
- )
664
- food_mul_expense_array = expense_reduced_array[:, [1, 2, 3]]
665
- food_mul_expense_array_sum = np.sum(food_mul_expense_array, axis=1)
666
- food_expense_array = expense_reduced_array[:, 0]
667
- sanity_check_array = np.greater_equal(
668
- food_expense_array, food_mul_expense_array_sum
669
- )
670
- drop_reduction["Unimportant"] = sanity_check_array.tolist()
671
- new_reduction = drop_reduction[drop_reduction.Unimportant]
672
- satisfied_constraints_df = new_reduction.drop("Unimportant", axis=1)
673
-
674
- return satisfied_constraints_df
675
-
676
- def sample_2500_examples(df: pd.DataFrame) -> pd.DataFrame:
677
- """Sample 2500 examples from the dataframe without replacement."""
678
- temp_df = df.copy()
679
- sample_df = temp_df.sample(n=2500, replace=False, random_state=3, axis=0)
680
- return sample_df
681
-
682
- def add_input_output_family_income(df: pd.DataFrame) -> pd.DataFrame:
683
- """Add a multiindex denoting if the column is an input or output variable."""
684
- # copy the dataframe
685
- temp_df = df.copy()
686
- # extract all the column names
687
- column_names = temp_df.columns.tolist()
688
- # the 2nd-9th columns correspond to output variables and all others to input variables. So make list of corresponding lengths of 'Input' and 'Output'
689
- input_list_start = ["Input"]
690
- input_list_end = ["Input"] * (len(column_names) - 9)
691
- output_list = ["Output"] * 8
692
- # concat both lists
693
- input_output_list = input_list_start + output_list + input_list_end
694
- # define multi index for attaching this 'Input' and 'Output' list with the column names already existing
695
- multiindex_bias = pd.MultiIndex.from_arrays(
696
- [input_output_list, column_names]
463
+ Returns:
464
+ Tuple[torch.Tensor, torch.Tensor]: A tuple containing the
465
+ time value and the noisy sine and cosine values.
466
+ """
467
+
468
+ t = self.time[idx]
469
+ if idx < self.length // 2:
470
+ sine_value = self.bias
471
+ cosine_value = self.bias
472
+ else:
473
+ sine_value = (
474
+ self.amplitude * np.sin(2 * np.pi * self.frequency * t)
475
+ + self.bias
697
476
  )
698
- # transpose such that index can be adjusted to multi index
699
- new_df = pd.DataFrame(df.transpose().to_numpy(), index=multiindex_bias)
700
- # transpose back such that columns are the same as before except with different labels
701
- return new_df.transpose()
702
-
703
- return (
704
- # drop missing values
705
- df.dropna(how="any")
706
- # convert object to fitting dtype
707
- .convert_dtypes()
708
- # remove all strings (no other dtypes are present except for integers and floats)
709
- .select_dtypes(exclude=["string"])
710
- # transform all numbers to same dtype
711
- .astype("float32")
712
- # drop column with label Agricultural Household indicator because this is not really a numerical input but rather a categorical/classification
713
- .drop(["Agricultural Household indicator"], axis=1, inplace=False)
714
- # this column is dropped because it depends on Agricultural Household indicator
715
- .drop(["Crop Farming and Gardening expenses"], axis=1, inplace=False)
716
- # use 8 output variables and 24 input variables
717
- .drop(
718
- [
719
- "Total Rice Expenditure",
720
- "Total Fish and marine products Expenditure",
721
- "Fruit Expenditure",
722
- "Restaurant and hotels Expenditure",
723
- "Alcoholic Beverages Expenditure",
724
- "Tobacco Expenditure",
725
- "Clothing, Footwear and Other Wear Expenditure",
726
- "Imputed House Rental Value",
727
- "Transportation Expenditure",
728
- "Miscellaneous Goods and Services Expenditure",
729
- "Special Occasions Expenditure",
730
- ],
731
- axis=1,
732
- inplace=False,
477
+ cosine_value = (
478
+ self.amplitude * np.cos(2 * np.pi * self.frequency * t)
479
+ + self.bias
733
480
  )
734
- # add input and output labels to each column
735
- .pipe(add_input_output_family_income)
736
- # normalize all the columns
737
- .pipe(normalize_columns_income)
738
- # remove all datapoints that do not satisfy the constraints
739
- .pipe(check_constraints_income)
740
- # sample 2500 examples
741
- .pipe(sample_2500_examples)
481
+
482
+ # Add noise to the signals
483
+ noisy_sine = sine_value + self.noise[idx]
484
+ noisy_cosine = cosine_value + self.noise[idx]
485
+
486
+ # Convert to tensor
487
+ example, target = torch.tensor([t], dtype=torch.float32), torch.tensor(
488
+ [noisy_sine, noisy_cosine], dtype=torch.float32
742
489
  )
490
+ return example, target
491
+
492
+ def __len__(self):
493
+ """
494
+ Returns the total number of data points in the dataset.
495
+
496
+ Returns:
497
+ int: The length of the dataset.
498
+ """
499
+ return self.length
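
Editor's note: for quick experiments the synthetic `NoisySines` dataset needs no download or transform; a short usage sketch (the argument values are arbitrary):

    from torch.utils.data import DataLoader
    from congrads.datasets import NoisySines

    # 1000 points on t in [0, 1]: flat at `bias` for the first half of the
    # indices, a noisy sine/cosine pair for the second half.
    dataset = NoisySines(length=1000, frequency=5.0, noise_std=0.05)

    t, target = dataset[750]  # target holds [noisy sine, noisy cosine]
    loader = DataLoader(dataset, batch_size=64, shuffle=True)
    for inputs, targets in loader:  # inputs: (64, 1), targets: (64, 2)
        break
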