congrads 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,57 +1,47 @@
1
- import pandas as pd
1
+ """Preprocessing functions for various datasets.
2
+
3
+ This module provides preprocessing pipelines for multiple datasets:
4
+ - BiasCorrection: Temperature bias correction dataset
5
+ - FamilyIncome: Family income and expenses dataset
6
+ - AdultCensusIncome: Adult Census Income dataset
7
+
8
+ Each preprocessing function applies appropriate transformations including
9
+ normalization, feature engineering, constraint filtering, and sampling.
10
+ """
11
+
2
12
  import numpy as np
3
- from torch.utils.data import Dataset, random_split, DataLoader
4
-
5
-
6
- def splitDataLoaders(
7
- data: Dataset,
8
- loader_args: dict = None,
9
- train_loader_args: dict = None,
10
- valid_loader_args: dict = None,
11
- test_loader_args: dict = None,
12
- train_size: float = 0.8,
13
- valid_size: float = 0.1,
14
- test_size: float = 0.1,
15
- ) -> tuple[DataLoader, DataLoader, DataLoader]:
16
-
17
- # Validate split sizes
18
- if not (0 < train_size < 1 and 0 < valid_size < 1 and 0 < test_size < 1):
19
- raise ValueError(
20
- "train_size, valid_size, and test_size must be between 0 and 1."
21
- )
22
- if not abs(train_size + valid_size + test_size - 1.0) < 1e-6:
23
- raise ValueError("train_size, valid_size, and test_size must sum to 1.")
24
-
25
- # Perform the splits
26
- train_val_data, test_data = random_split(data, [1 - test_size, test_size])
27
- train_data, valid_data = random_split(
28
- train_val_data,
29
- [
30
- train_size / (1 - test_size),
31
- valid_size / (1 - test_size),
32
- ],
33
- )
13
+ import pandas as pd
14
+
15
+ __all__ = ["preprocess_BiasCorrection", "preprocess_FamilyIncome", "preprocess_AdultCensusIncome"]
16
+
34
17
 
35
- # Set default arguments for each loader
36
- train_loader_args = train_loader_args or loader_args or {}
37
- valid_loader_args = valid_loader_args or loader_args or {}
38
- test_loader_args = test_loader_args or loader_args or {}
18
+ def preprocess_BiasCorrection(df: pd.DataFrame) -> pd.DataFrame: # noqa: N802
19
+ """Preprocesses the given dataframe for bias correction by performing a series of transformations.
39
20
 
40
- # Create the DataLoaders
41
- train_generator = DataLoader(train_data, **train_loader_args)
42
- valid_generator = DataLoader(valid_data, **valid_loader_args)
43
- test_generator = DataLoader(test_data, **test_loader_args)
21
+ The function sequentially:
44
22
 
45
- return train_generator, valid_generator, test_generator
23
+ - Drops rows with missing values.
24
+ - Converts a date string to datetime format and adds year, month,
25
+ and day columns.
26
+ - Normalizes the columns with specific logic for input and output variables.
27
+ - Adds a multi-index indicating which columns are input or output variables.
28
+ - Samples 2500 examples from the dataset without replacement.
46
29
 
30
+ Args:
31
+ df (pd.DataFrame): The input dataframe containing the data
32
+ to be processed.
47
33
 
48
- def preprocess_BiasCorrection(df: pd.DataFrame) -> pd.DataFrame:
34
+ Returns:
35
+ pd.DataFrame: The processed dataframe after applying
36
+ the transformations.
37
+ """
49
38
 
50
39
  def date_to_datetime(df: pd.DataFrame) -> pd.DataFrame:
51
40
  """Transform the string that denotes the date to the datetime format in pandas."""
52
41
  # make copy of dataframe
53
42
  df_temp = df.copy()
54
- # add new column at the front where the date string is transformed to the datetime format
43
+ # add new column at the front where the date string is
44
+ # transformed to the datetime format
55
45
  df_temp.insert(0, "DateTransformed", pd.to_datetime(df_temp["Date"]))
56
46
  return df_temp
57
47
 
@@ -85,36 +75,44 @@ def preprocess_BiasCorrection(df: pd.DataFrame) -> pd.DataFrame:
85
75
  temp_df = df.copy()
86
76
  # extract all the column names
87
77
  column_names = temp_df.columns.tolist()
88
- # only the last 2 columns are output variables, all others are input variables. So make list of corresponding lengths of 'Input' and 'Output'
78
+ # only the last 2 columns are output variables, all others are input
79
+ # variables. So make list of corresponding lengths of
80
+ # 'Input' and 'Output'
89
81
  input_list = ["Input"] * (len(column_names) - 2)
90
82
  output_list = ["Output"] * 2
91
83
  # concat both lists
92
84
  input_output_list = input_list + output_list
93
- # define multi index for attaching this 'Input' and 'Output' list with the column names already existing
85
+ # define multi index for attaching this 'Input' and 'Output' list with
86
+ # the column names already existing
94
87
  multiindex_bias = pd.MultiIndex.from_arrays([input_output_list, column_names])
95
88
  # transpose such that index can be adjusted to multi index
96
89
  new_df = pd.DataFrame(df.transpose().to_numpy(), index=multiindex_bias)
97
- # transpose back such that columns are the same as before except with different labels
90
+ # transpose back such that columns are the same as before
91
+ # except with different labels
98
92
  return new_df.transpose()
99
93
 
100
94
  def normalize_columns_bias(df: pd.DataFrame) -> pd.DataFrame:
101
- """Normalize the columns for the bias correction dataset. This is different from normalizing all the columns separately because the
102
- upper and lower bounds for the output variables are assumed to be the same."""
95
+ """Normalize the columns for the bias correction dataset.
96
+
97
+ This is different from normalizing all the columns separately
98
+ because the upper and lower bounds for the output variables
99
+ are assumed to be the same.
100
+ """
103
101
  # copy the dataframe
104
102
  temp_df = df.copy()
105
103
  # normalize each column
106
104
  for feature_name in df.columns:
107
- # the output columns are normalized using the same upper and lower bound for more efficient check of the inequality
105
+ # the output columns are normalized using the same upper and
106
+ # lower bound for more efficient check of the inequality
108
107
  if feature_name == "Next_Tmax" or feature_name == "Next_Tmin":
109
108
  max_value = 38.9
110
109
  min_value = 11.3
111
- # the input columns are normalized using their respective upper and lower bounds
110
+ # the input columns are normalized using their respective
111
+ # upper and lower bounds
112
112
  else:
113
113
  max_value = df[feature_name].max()
114
114
  min_value = df[feature_name].min()
115
- temp_df[feature_name] = (df[feature_name] - min_value) / (
116
- max_value - min_value
117
- )
115
+ temp_df[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
118
116
  return temp_df
119
117
 
120
118
  def sample_2500_examples(df: pd.DataFrame) -> pd.DataFrame:
@@ -140,33 +138,78 @@ def preprocess_BiasCorrection(df: pd.DataFrame) -> pd.DataFrame:
140
138
  .astype("float32")
141
139
  # normalize columns
142
140
  .pipe(normalize_columns_bias)
143
- # add multi index indicating which columns are corresponding to input and output variables
141
+ # add multi index indicating which columns are corresponding
142
+ # to input and output variables
144
143
  .pipe(add_input_output_temperature)
145
144
  # sample 2500 examples out of the dataset
146
145
  .pipe(sample_2500_examples)
147
146
  )
148
147
 
149
148
 
150
- def preprocess_FiniteIncome(df: pd.DataFrame) -> pd.DataFrame:
149
+ def preprocess_FamilyIncome(df: pd.DataFrame) -> pd.DataFrame: # noqa: N802
150
+ """Preprocesses the given Family Income dataframe.
151
+
152
+ The function sequentially:
153
+
154
+ - Drops rows with missing values.
155
+ - Converts object columns to appropriate data types and
156
+ removes string columns.
157
+ - Removes certain unnecessary columns like
158
+ 'Agricultural Household indicator' and related features.
159
+ - Adds labels to columns indicating whether they are
160
+ input or output variables.
161
+ - Normalizes the columns individually.
162
+ - Checks and removes rows that do not satisfy predefined constraints
163
+ (household income > expenses, food expenses > sub-expenses).
164
+ - Samples 2500 examples from the dataset without replacement.
165
+
166
+ Args:
167
+ df (pd.DataFrame): The input Family Income dataframe containing
168
+ the data to be processed.
169
+
170
+ Returns:
171
+ pd.DataFrame: The processed dataframe after applying the
172
+ transformations and constraints.
173
+ """
151
174
 
152
175
  def normalize_columns_income(df: pd.DataFrame) -> pd.DataFrame:
153
- """Normalize the columns for the Family Income dataframe. This can also be applied to other dataframes because this function normalizes
154
- all columns individually."""
176
+ """Normalize each column of the dataframe independently.
177
+
178
+ This function scales each column to have values between 0 and 1
179
+ (or another standard normalization, depending on implementation),
180
+ making it suitable for numerical processing. While designed for
181
+ the Family Income dataset, it can be applied to any dataframe
182
+ with numeric columns.
183
+
184
+ Args:
185
+ df (pd.DataFrame): Input dataframe to normalize.
186
+
187
+ Returns:
188
+ pd.DataFrame: Dataframe with each column normalized independently.
189
+ """
155
190
  # copy the dataframe
156
191
  temp_df = df.copy()
157
192
  # normalize each column
158
193
  for feature_name in df.columns:
159
194
  max_value = df[feature_name].max()
160
195
  min_value = df[feature_name].min()
161
- temp_df[feature_name] = (df[feature_name] - min_value) / (
162
- max_value - min_value
163
- )
196
+ temp_df[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
164
197
  return temp_df
165
198
 
166
199
  def check_constraints_income(df: pd.DataFrame) -> pd.DataFrame:
167
- """Check if all the constraints are satisfied for the dataframe and remove the examples that do not satisfy the constraint. This
168
- function only works for the Family Income dataset and the constraints are that the household income is larger than all the expenses
169
- and the food expense is larger than the sum of the other (more detailed) food expenses.
200
+ """Filter rows that violate income-related constraints.
201
+
202
+ This function is specific to the Family Income dataset. It removes rows
203
+ that do not satisfy the following constraints:
204
+ 1. Household income must be greater than all expenses.
205
+ 2. Food expense must be greater than the sum of detailed food expenses.
206
+
207
+ Args:
208
+ df (pd.DataFrame): Input dataframe containing income and expense data.
209
+
210
+ Returns:
211
+ pd.DataFrame: Filtered dataframe containing only rows that satisfy
212
+ all constraints.
170
213
  """
171
214
  temp_df = df.copy()
172
215
  # check that household income is larger than expenses in the output
@@ -234,9 +277,7 @@ def preprocess_FiniteIncome(df: pd.DataFrame) -> pd.DataFrame:
234
277
  food_mul_expense_array = expense_reduced_array[:, [1, 2, 3]]
235
278
  food_mul_expense_array_sum = np.sum(food_mul_expense_array, axis=1)
236
279
  food_expense_array = expense_reduced_array[:, 0]
237
- sanity_check_array = np.greater_equal(
238
- food_expense_array, food_mul_expense_array_sum
239
- )
280
+ sanity_check_array = np.greater_equal(food_expense_array, food_mul_expense_array_sum)
240
281
  drop_reduction["Unimportant"] = sanity_check_array.tolist()
241
282
  new_reduction = drop_reduction[drop_reduction.Unimportant]
242
283
  satisfied_constraints_df = new_reduction.drop("Unimportant", axis=1)
@@ -249,17 +290,21 @@ def preprocess_FiniteIncome(df: pd.DataFrame) -> pd.DataFrame:
249
290
  temp_df = df.copy()
250
291
  # extract all the column names
251
292
  column_names = temp_df.columns.tolist()
252
- # the 2nd-9th columns correspond to output variables and all others to input variables. So make list of corresponding lengths of 'Input' and 'Output'
293
+ # the 2nd-9th columns correspond to output variables and all
294
+ # others to input variables. So make list of corresponding
295
+ # lengths of 'Input' and 'Output'
253
296
  input_list_start = ["Input"]
254
297
  input_list_end = ["Input"] * (len(column_names) - 9)
255
298
  output_list = ["Output"] * 8
256
299
  # concat both lists
257
300
  input_output_list = input_list_start + output_list + input_list_end
258
- # define multi index for attaching this 'Input' and 'Output' list with the column names already existing
301
+ # define multi index for attaching this 'Input' and
302
+ # 'Output' list with the column names already existing
259
303
  multiindex_bias = pd.MultiIndex.from_arrays([input_output_list, column_names])
260
304
  # transpose such that index can be adjusted to multi index
261
305
  new_df = pd.DataFrame(df.transpose().to_numpy(), index=multiindex_bias)
262
- # transpose back such that columns are the same as before except with different labels
306
+ # transpose back such that columns are the same as
307
+ # before except with different labels
263
308
  return new_df.transpose()
264
309
 
265
310
  def sample_2500_examples(df: pd.DataFrame) -> pd.DataFrame:
@@ -273,13 +318,17 @@ def preprocess_FiniteIncome(df: pd.DataFrame) -> pd.DataFrame:
273
318
  df.dropna(how="any")
274
319
  # convert object to fitting dtype
275
320
  .convert_dtypes()
276
- # remove all strings (no other dtypes are present except for integers and floats)
321
+ # remove all strings (no other dtypes are present
322
+ # except for integers and floats)
277
323
  .select_dtypes(exclude=["string"])
278
324
  # transform all numbers to same dtype
279
325
  .astype("float32")
280
- # drop column with label Agricultural Household indicator because this is not really a numerical input but rather a categorical/classification
326
+ # drop column with label Agricultural Household indicator
327
+ # because this is not really a numerical input but
328
+ # rather a categorical/classification
281
329
  .drop(["Agricultural Household indicator"], axis=1, inplace=False)
282
- # this column is dropped because it depends on Agricultural Household indicator
330
+ # this column is dropped because it depends on
331
+ # Agricultural Household indicator
283
332
  .drop(["Crop Farming and Gardening expenses"], axis=1, inplace=False)
284
333
  # use 8 output variables and 24 input variables
285
334
  .drop(
@@ -308,3 +357,85 @@ def preprocess_FiniteIncome(df: pd.DataFrame) -> pd.DataFrame:
308
357
  # sample 2500 examples
309
358
  .pipe(sample_2500_examples)
310
359
  )
360
+
361
+
362
def preprocess_AdultCensusIncome(df: pd.DataFrame) -> pd.DataFrame:  # noqa: N802
    """Preprocesses the Adult Census Income dataset for PyTorch ML.

    Sequential steps:
    - Drop rows with missing values.
    - Drop the 'fnlwgt' and 'education.num' columns (sampling weight /
      redundant with 'education'), if present.
    - Encode categorical variables to integer labels.
    - Map the target 'income' column to 0 (<=50K) / 1 (>50K).
    - Convert all data to float32.
    - Add a multiindex to denote Input vs Output columns
      ('income' is assumed to be the last column).

    Args:
        df (pd.DataFrame): Raw dataframe containing Adult Census Income data.

    Returns:
        pd.DataFrame: Preprocessed dataframe with a two-level column index
            ('Input'/'Output', original column name).
    """

    def drop_missing(df: pd.DataFrame) -> pd.DataFrame:
        """Drop rows with any missing values."""
        return df.dropna(how="any")

    def drop_columns(df: pd.DataFrame) -> pd.DataFrame:
        """Drop the sampling-weight and redundant education columns.

        ``errors="ignore"`` keeps this a no-op when the columns are absent.
        """
        return df.drop(columns=["fnlwgt", "education.num"], errors="ignore")

    def label_encode_column(series: pd.Series) -> pd.Series:
        """Encode a pandas Series of categorical strings into integers.

        Labels are assigned in order of first appearance, so the encoding
        is deterministic for a fixed row order of the input dataframe.
        """
        categories = series.dropna().unique().tolist()
        cat_to_int = {cat: i for i, cat in enumerate(categories)}
        return series.map(cat_to_int).astype(int)

    def encode_categorical(df: pd.DataFrame) -> pd.DataFrame:
        """Convert categorical string columns to integer labels."""
        df_temp = df.copy()
        categorical_cols = [
            "workclass",
            "education",
            "marital.status",
            "occupation",
            "relationship",
            "race",
            "sex",
            "native.country",
        ]
        for col in categorical_cols:
            # cast to str so mixed/object columns encode consistently
            df_temp[col] = label_encode_column(df_temp[col].astype(str))
        return df_temp

    def map_target(df: pd.DataFrame) -> pd.DataFrame:
        """Map income column to 0 (<=50K) and 1 (>50K)."""
        df_temp = df.copy()
        df_temp["income"] = df_temp["income"].map({"<=50K": 0, ">50K": 1})
        return df_temp

    def convert_float32(df: pd.DataFrame) -> pd.DataFrame:
        """Convert all data to float32 for PyTorch compatibility."""
        return df.astype("float32")

    def add_input_output_index(df: pd.DataFrame) -> pd.DataFrame:
        """Add a multiindex indicating input and output columns."""
        temp_df = df.copy()
        column_names = temp_df.columns.tolist()
        # Only the 'income' column (assumed last) is an output variable
        input_list = ["Input"] * (len(column_names) - 1)
        output_list = ["Output"]
        multiindex_list = input_list + output_list
        multiindex = pd.MultiIndex.from_arrays([multiindex_list, column_names])
        return pd.DataFrame(temp_df.to_numpy(), columns=multiindex)

    return (
        df.pipe(drop_missing)
        .pipe(drop_columns)
        .pipe(encode_categorical)
        .pipe(map_target)
        .pipe(convert_float32)
        .pipe(add_input_output_index)
    )
+ )