congrads 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- congrads/__init__.py +10 -21
- congrads/callbacks/base.py +357 -0
- congrads/callbacks/registry.py +106 -0
- congrads/checkpoints.py +178 -0
- congrads/constraints/base.py +242 -0
- congrads/constraints/registry.py +1255 -0
- congrads/core/batch_runner.py +200 -0
- congrads/core/congradscore.py +271 -0
- congrads/core/constraint_engine.py +209 -0
- congrads/core/epoch_runner.py +119 -0
- congrads/datasets/registry.py +799 -0
- congrads/descriptor.py +148 -29
- congrads/metrics.py +109 -19
- congrads/networks/registry.py +68 -0
- congrads/py.typed +0 -0
- congrads/transformations/base.py +37 -0
- congrads/transformations/registry.py +86 -0
- congrads/{utils.py → utils/preprocessors.py} +201 -72
- congrads/utils/utility.py +506 -0
- congrads/utils/validation.py +182 -0
- congrads-0.3.0.dist-info/METADATA +234 -0
- congrads-0.3.0.dist-info/RECORD +23 -0
- congrads-0.3.0.dist-info/WHEEL +4 -0
- congrads/constraints.py +0 -389
- congrads/core.py +0 -225
- congrads/datasets.py +0 -195
- congrads/networks.py +0 -90
- congrads-0.2.0.dist-info/LICENSE +0 -26
- congrads-0.2.0.dist-info/METADATA +0 -222
- congrads-0.2.0.dist-info/RECORD +0 -13
- congrads-0.2.0.dist-info/WHEEL +0 -5
- congrads-0.2.0.dist-info/top_level.txt +0 -1
|
@@ -1,57 +1,45 @@
|
|
|
1
|
-
|
|
1
|
+
"""Preprocessing functions for various datasets.
|
|
2
|
+
|
|
3
|
+
This module provides preprocessing pipelines for multiple datasets:
|
|
4
|
+
- BiasCorrection: Temperature bias correction dataset
|
|
5
|
+
- FamilyIncome: Family income and expenses dataset
|
|
6
|
+
- AdultCensusIncome: Adult Census Income dataset
|
|
7
|
+
|
|
8
|
+
Each preprocessing function applies appropriate transformations including
|
|
9
|
+
normalization, feature engineering, constraint filtering, and sampling.
|
|
10
|
+
"""
|
|
11
|
+
|
|
2
12
|
import numpy as np
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
def splitDataLoaders(
|
|
7
|
-
data: Dataset,
|
|
8
|
-
loader_args: dict = None,
|
|
9
|
-
train_loader_args: dict = None,
|
|
10
|
-
valid_loader_args: dict = None,
|
|
11
|
-
test_loader_args: dict = None,
|
|
12
|
-
train_size: float = 0.8,
|
|
13
|
-
valid_size: float = 0.1,
|
|
14
|
-
test_size: float = 0.1,
|
|
15
|
-
) -> tuple[DataLoader, DataLoader, DataLoader]:
|
|
16
|
-
|
|
17
|
-
# Validate split sizes
|
|
18
|
-
if not (0 < train_size < 1 and 0 < valid_size < 1 and 0 < test_size < 1):
|
|
19
|
-
raise ValueError(
|
|
20
|
-
"train_size, valid_size, and test_size must be between 0 and 1."
|
|
21
|
-
)
|
|
22
|
-
if not abs(train_size + valid_size + test_size - 1.0) < 1e-6:
|
|
23
|
-
raise ValueError("train_size, valid_size, and test_size must sum to 1.")
|
|
24
|
-
|
|
25
|
-
# Perform the splits
|
|
26
|
-
train_val_data, test_data = random_split(data, [1 - test_size, test_size])
|
|
27
|
-
train_data, valid_data = random_split(
|
|
28
|
-
train_val_data,
|
|
29
|
-
[
|
|
30
|
-
train_size / (1 - test_size),
|
|
31
|
-
valid_size / (1 - test_size),
|
|
32
|
-
],
|
|
33
|
-
)
|
|
13
|
+
import pandas as pd
|
|
14
|
+
|
|
34
15
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
valid_loader_args = valid_loader_args or loader_args or {}
|
|
38
|
-
test_loader_args = test_loader_args or loader_args or {}
|
|
16
|
+
def preprocess_BiasCorrection(df: pd.DataFrame) -> pd.DataFrame: # noqa: N802
|
|
17
|
+
"""Preprocesses the given dataframe for bias correction by performing a series of transformations.
|
|
39
18
|
|
|
40
|
-
|
|
41
|
-
train_generator = DataLoader(train_data, **train_loader_args)
|
|
42
|
-
valid_generator = DataLoader(valid_data, **valid_loader_args)
|
|
43
|
-
test_generator = DataLoader(test_data, **test_loader_args)
|
|
19
|
+
The function sequentially:
|
|
44
20
|
|
|
45
|
-
|
|
21
|
+
- Drops rows with missing values.
|
|
22
|
+
- Converts a date string to datetime format and adds year, month,
|
|
23
|
+
and day columns.
|
|
24
|
+
- Normalizes the columns with specific logic for input and output variables.
|
|
25
|
+
- Adds a multi-index indicating which columns are input or output variables.
|
|
26
|
+
- Samples 2500 examples from the dataset without replacement.
|
|
46
27
|
|
|
28
|
+
Args:
|
|
29
|
+
df (pd.DataFrame): The input dataframe containing the data
|
|
30
|
+
to be processed.
|
|
47
31
|
|
|
48
|
-
|
|
32
|
+
Returns:
|
|
33
|
+
pd.DataFrame: The processed dataframe after applying
|
|
34
|
+
the transformations.
|
|
35
|
+
"""
|
|
49
36
|
|
|
50
37
|
def date_to_datetime(df: pd.DataFrame) -> pd.DataFrame:
|
|
51
38
|
"""Transform the string that denotes the date to the datetime format in pandas."""
|
|
52
39
|
# make copy of dataframe
|
|
53
40
|
df_temp = df.copy()
|
|
54
|
-
# add new column at the front where the date string is
|
|
41
|
+
# add new column at the front where the date string is
|
|
42
|
+
# transformed to the datetime format
|
|
55
43
|
df_temp.insert(0, "DateTransformed", pd.to_datetime(df_temp["Date"]))
|
|
56
44
|
return df_temp
|
|
57
45
|
|
|
@@ -85,36 +73,44 @@ def preprocess_BiasCorrection(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
85
73
|
temp_df = df.copy()
|
|
86
74
|
# extract all the column names
|
|
87
75
|
column_names = temp_df.columns.tolist()
|
|
88
|
-
# only the last 2 columns are output variables, all others are input
|
|
76
|
+
# only the last 2 columns are output variables, all others are input
|
|
77
|
+
# variables. So make list of corresponding lengths of
|
|
78
|
+
# 'Input' and 'Output'
|
|
89
79
|
input_list = ["Input"] * (len(column_names) - 2)
|
|
90
80
|
output_list = ["Output"] * 2
|
|
91
81
|
# concat both lists
|
|
92
82
|
input_output_list = input_list + output_list
|
|
93
|
-
# define multi index for attaching this 'Input' and 'Output' list with
|
|
83
|
+
# define multi index for attaching this 'Input' and 'Output' list with
|
|
84
|
+
# the column names already existing
|
|
94
85
|
multiindex_bias = pd.MultiIndex.from_arrays([input_output_list, column_names])
|
|
95
86
|
# transpose such that index can be adjusted to multi index
|
|
96
87
|
new_df = pd.DataFrame(df.transpose().to_numpy(), index=multiindex_bias)
|
|
97
|
-
# transpose back such that columns are the same as before
|
|
88
|
+
# transpose back such that columns are the same as before
|
|
89
|
+
# except with different labels
|
|
98
90
|
return new_df.transpose()
|
|
99
91
|
|
|
100
92
|
def normalize_columns_bias(df: pd.DataFrame) -> pd.DataFrame:
|
|
101
|
-
"""Normalize the columns for the bias correction dataset.
|
|
102
|
-
|
|
93
|
+
"""Normalize the columns for the bias correction dataset.
|
|
94
|
+
|
|
95
|
+
This is different from normalizing all the columns separately
|
|
96
|
+
because the upper and lower bounds for the output variables
|
|
97
|
+
are assumed to be the same.
|
|
98
|
+
"""
|
|
103
99
|
# copy the dataframe
|
|
104
100
|
temp_df = df.copy()
|
|
105
101
|
# normalize each column
|
|
106
102
|
for feature_name in df.columns:
|
|
107
|
-
# the output columns are normalized using the same upper and
|
|
103
|
+
# the output columns are normalized using the same upper and
|
|
104
|
+
# lower bound for more efficient check of the inequality
|
|
108
105
|
if feature_name == "Next_Tmax" or feature_name == "Next_Tmin":
|
|
109
106
|
max_value = 38.9
|
|
110
107
|
min_value = 11.3
|
|
111
|
-
# the input columns are normalized using their respective
|
|
108
|
+
# the input columns are normalized using their respective
|
|
109
|
+
# upper and lower bounds
|
|
112
110
|
else:
|
|
113
111
|
max_value = df[feature_name].max()
|
|
114
112
|
min_value = df[feature_name].min()
|
|
115
|
-
temp_df[feature_name] = (df[feature_name] - min_value) / (
|
|
116
|
-
max_value - min_value
|
|
117
|
-
)
|
|
113
|
+
temp_df[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
|
|
118
114
|
return temp_df
|
|
119
115
|
|
|
120
116
|
def sample_2500_examples(df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -140,33 +136,78 @@ def preprocess_BiasCorrection(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
140
136
|
.astype("float32")
|
|
141
137
|
# normalize columns
|
|
142
138
|
.pipe(normalize_columns_bias)
|
|
143
|
-
# add multi index indicating which columns are corresponding
|
|
139
|
+
# add multi index indicating which columns are corresponding
|
|
140
|
+
# to input and output variables
|
|
144
141
|
.pipe(add_input_output_temperature)
|
|
145
142
|
# sample 2500 examples out of the dataset
|
|
146
143
|
.pipe(sample_2500_examples)
|
|
147
144
|
)
|
|
148
145
|
|
|
149
146
|
|
|
150
|
-
def
|
|
147
|
+
def preprocess_FamilyIncome(df: pd.DataFrame) -> pd.DataFrame: # noqa: N802
|
|
148
|
+
"""Preprocesses the given Family Income dataframe.
|
|
149
|
+
|
|
150
|
+
The function sequentially:
|
|
151
|
+
|
|
152
|
+
- Drops rows with missing values.
|
|
153
|
+
- Converts object columns to appropriate data types and
|
|
154
|
+
removes string columns.
|
|
155
|
+
- Removes certain unnecessary columns like
|
|
156
|
+
'Agricultural Household indicator' and related features.
|
|
157
|
+
- Adds labels to columns indicating whether they are
|
|
158
|
+
input or output variables.
|
|
159
|
+
- Normalizes the columns individually.
|
|
160
|
+
- Checks and removes rows that do not satisfy predefined constraints
|
|
161
|
+
(household income > expenses, food expenses >= sum of food sub-expenses).
|
|
162
|
+
- Samples 2500 examples from the dataset without replacement.
|
|
163
|
+
|
|
164
|
+
Args:
|
|
165
|
+
df (pd.DataFrame): The input Family Income dataframe containing
|
|
166
|
+
the data to be processed.
|
|
167
|
+
|
|
168
|
+
Returns:
|
|
169
|
+
pd.DataFrame: The processed dataframe after applying the
|
|
170
|
+
transformations and constraints.
|
|
171
|
+
"""
|
|
151
172
|
|
|
152
173
|
def normalize_columns_income(df: pd.DataFrame) -> pd.DataFrame:
|
|
153
|
-
"""Normalize
|
|
154
|
-
|
|
174
|
+
"""Normalize each column of the dataframe independently.
|
|
175
|
+
|
|
176
|
+
This function scales each column to have values between 0 and 1
|
|
177
|
+
using min-max scaling, i.e. (x - min) / (max - min) per column,
|
|
178
|
+
making it suitable for numerical processing. While designed for
|
|
179
|
+
the Family Income dataset, it can be applied to any dataframe
|
|
180
|
+
with numeric columns.
|
|
181
|
+
|
|
182
|
+
Args:
|
|
183
|
+
df (pd.DataFrame): Input dataframe to normalize.
|
|
184
|
+
|
|
185
|
+
Returns:
|
|
186
|
+
pd.DataFrame: Dataframe with each column normalized independently.
|
|
187
|
+
"""
|
|
155
188
|
# copy the dataframe
|
|
156
189
|
temp_df = df.copy()
|
|
157
190
|
# normalize each column
|
|
158
191
|
for feature_name in df.columns:
|
|
159
192
|
max_value = df[feature_name].max()
|
|
160
193
|
min_value = df[feature_name].min()
|
|
161
|
-
temp_df[feature_name] = (df[feature_name] - min_value) / (
|
|
162
|
-
max_value - min_value
|
|
163
|
-
)
|
|
194
|
+
temp_df[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
|
|
164
195
|
return temp_df
|
|
165
196
|
|
|
166
197
|
def check_constraints_income(df: pd.DataFrame) -> pd.DataFrame:
|
|
167
|
-
"""
|
|
168
|
-
|
|
169
|
-
|
|
198
|
+
"""Filter rows that violate income-related constraints.
|
|
199
|
+
|
|
200
|
+
This function is specific to the Family Income dataset. It removes rows
|
|
201
|
+
that do not satisfy the following constraints:
|
|
202
|
+
1. Household income must be greater than all expenses.
|
|
203
|
+
2. Food expense must be greater than or equal to the sum of detailed food expenses.
|
|
204
|
+
|
|
205
|
+
Args:
|
|
206
|
+
df (pd.DataFrame): Input dataframe containing income and expense data.
|
|
207
|
+
|
|
208
|
+
Returns:
|
|
209
|
+
pd.DataFrame: Filtered dataframe containing only rows that satisfy
|
|
210
|
+
all constraints.
|
|
170
211
|
"""
|
|
171
212
|
temp_df = df.copy()
|
|
172
213
|
# check that household income is larger than expenses in the output
|
|
@@ -234,9 +275,7 @@ def preprocess_FiniteIncome(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
234
275
|
food_mul_expense_array = expense_reduced_array[:, [1, 2, 3]]
|
|
235
276
|
food_mul_expense_array_sum = np.sum(food_mul_expense_array, axis=1)
|
|
236
277
|
food_expense_array = expense_reduced_array[:, 0]
|
|
237
|
-
sanity_check_array = np.greater_equal(
|
|
238
|
-
food_expense_array, food_mul_expense_array_sum
|
|
239
|
-
)
|
|
278
|
+
sanity_check_array = np.greater_equal(food_expense_array, food_mul_expense_array_sum)
|
|
240
279
|
drop_reduction["Unimportant"] = sanity_check_array.tolist()
|
|
241
280
|
new_reduction = drop_reduction[drop_reduction.Unimportant]
|
|
242
281
|
satisfied_constraints_df = new_reduction.drop("Unimportant", axis=1)
|
|
@@ -249,17 +288,21 @@ def preprocess_FiniteIncome(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
249
288
|
temp_df = df.copy()
|
|
250
289
|
# extract all the column names
|
|
251
290
|
column_names = temp_df.columns.tolist()
|
|
252
|
-
# the 2nd-9th columns correspond to output variables and all
|
|
291
|
+
# the 2nd-9th columns correspond to output variables and all
|
|
292
|
+
# others to input variables. So make list of corresponding
|
|
293
|
+
# lengths of 'Input' and 'Output'
|
|
253
294
|
input_list_start = ["Input"]
|
|
254
295
|
input_list_end = ["Input"] * (len(column_names) - 9)
|
|
255
296
|
output_list = ["Output"] * 8
|
|
256
297
|
# concat both lists
|
|
257
298
|
input_output_list = input_list_start + output_list + input_list_end
|
|
258
|
-
# define multi index for attaching this 'Input' and
|
|
299
|
+
# define multi index for attaching this 'Input' and
|
|
300
|
+
# 'Output' list with the column names already existing
|
|
259
301
|
multiindex_bias = pd.MultiIndex.from_arrays([input_output_list, column_names])
|
|
260
302
|
# transpose such that index can be adjusted to multi index
|
|
261
303
|
new_df = pd.DataFrame(df.transpose().to_numpy(), index=multiindex_bias)
|
|
262
|
-
# transpose back such that columns are the same as
|
|
304
|
+
# transpose back such that columns are the same as
|
|
305
|
+
# before except with different labels
|
|
263
306
|
return new_df.transpose()
|
|
264
307
|
|
|
265
308
|
def sample_2500_examples(df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -273,13 +316,17 @@ def preprocess_FiniteIncome(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
273
316
|
df.dropna(how="any")
|
|
274
317
|
# convert object to fitting dtype
|
|
275
318
|
.convert_dtypes()
|
|
276
|
-
# remove all strings (no other dtypes are present
|
|
319
|
+
# remove all strings (no other dtypes are present
|
|
320
|
+
# except for integers and floats)
|
|
277
321
|
.select_dtypes(exclude=["string"])
|
|
278
322
|
# transform all numbers to same dtype
|
|
279
323
|
.astype("float32")
|
|
280
|
-
# drop column with label Agricultural Household indicator
|
|
324
|
+
# drop column with label Agricultural Household indicator
|
|
325
|
+
# because this is not really a numerical input but
|
|
326
|
+
# rather a categorical (classification) variable
|
|
281
327
|
.drop(["Agricultural Household indicator"], axis=1, inplace=False)
|
|
282
|
-
# this column is dropped because it depends on
|
|
328
|
+
# this column is dropped because it depends on
|
|
329
|
+
# Agricultural Household indicator
|
|
283
330
|
.drop(["Crop Farming and Gardening expenses"], axis=1, inplace=False)
|
|
284
331
|
# use 8 output variables and 24 input variables
|
|
285
332
|
.drop(
|
|
@@ -308,3 +355,85 @@ def preprocess_FiniteIncome(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
308
355
|
# sample 2500 examples
|
|
309
356
|
.pipe(sample_2500_examples)
|
|
310
357
|
)
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def preprocess_AdultCensusIncome(df: pd.DataFrame) -> pd.DataFrame: # noqa: N802
|
|
361
|
+
"""Preprocesses the Adult Census Income dataset for PyTorch ML.
|
|
362
|
+
|
|
363
|
+
Sequential steps:
|
|
364
|
+
- Drop rows with missing values, then the 'fnlwgt' and 'education.num' columns (if present).
|
|
365
|
+
- Encode categorical variables to integer labels.
|
|
366
|
+
- Map the target 'income' column to 0/1.
|
|
367
|
+
- Convert all data to float32.
|
|
368
|
+
- Add a multiindex to denote Input vs Output columns.
|
|
369
|
+
|
|
370
|
+
Args:
|
|
371
|
+
df (pd.DataFrame): Raw dataframe containing Adult Census Income data.
|
|
372
|
+
|
|
373
|
+
Returns:
|
|
374
|
+
pd.DataFrame: Preprocessed dataframe.
|
|
375
|
+
"""
|
|
376
|
+
|
|
377
|
+
def drop_missing(df: pd.DataFrame) -> pd.DataFrame:
|
|
378
|
+
"""Drop rows with any missing values."""
|
|
379
|
+
return df.dropna(how="any")
|
|
380
|
+
|
|
381
|
+
def drop_columns(df: pd.DataFrame) -> pd.DataFrame:
|
|
382
|
+
return df.drop(columns=["fnlwgt", "education.num"], errors="ignore")
|
|
383
|
+
|
|
384
|
+
def label_encode_column(series: pd.Series, col_name: str = None) -> pd.Series:
|
|
385
|
+
"""Encode a pandas Series of categorical strings into integers."""
|
|
386
|
+
categories = series.dropna().unique().tolist()
|
|
387
|
+
cat_to_int = {cat: i for i, cat in enumerate(categories)}
|
|
388
|
+
if col_name:
|
|
389
|
+
print(f"Column '{col_name}' encoding:")
|
|
390
|
+
for cat, idx in cat_to_int.items():
|
|
391
|
+
print(f" {cat} -> {idx}")
|
|
392
|
+
return series.map(cat_to_int).astype(int)
|
|
393
|
+
|
|
394
|
+
def encode_categorical(df: pd.DataFrame) -> pd.DataFrame:
|
|
395
|
+
"""Convert categorical string columns to integer labels using label_encode_column."""
|
|
396
|
+
df_temp = df.copy()
|
|
397
|
+
categorical_cols = [
|
|
398
|
+
"workclass",
|
|
399
|
+
"education",
|
|
400
|
+
"marital.status",
|
|
401
|
+
"occupation",
|
|
402
|
+
"relationship",
|
|
403
|
+
"race",
|
|
404
|
+
"sex",
|
|
405
|
+
"native.country",
|
|
406
|
+
]
|
|
407
|
+
for col in categorical_cols:
|
|
408
|
+
df_temp[col] = label_encode_column(df_temp[col].astype(str), col_name=col)
|
|
409
|
+
return df_temp
|
|
410
|
+
|
|
411
|
+
def map_target(df: pd.DataFrame) -> pd.DataFrame:
|
|
412
|
+
"""Map income column to 0 (<=50K) and 1 (>50K)."""
|
|
413
|
+
df_temp = df.copy()
|
|
414
|
+
df_temp["income"] = df_temp["income"].map({"<=50K": 0, ">50K": 1})
|
|
415
|
+
return df_temp
|
|
416
|
+
|
|
417
|
+
def convert_float32(df: pd.DataFrame) -> pd.DataFrame:
|
|
418
|
+
"""Convert all data to float32 for PyTorch compatibility."""
|
|
419
|
+
return df.astype("float32")
|
|
420
|
+
|
|
421
|
+
def add_input_output_index(df: pd.DataFrame) -> pd.DataFrame:
|
|
422
|
+
"""Add a multiindex indicating input and output columns."""
|
|
423
|
+
temp_df = df.copy()
|
|
424
|
+
column_names = temp_df.columns.tolist()
|
|
425
|
+
# Only the 'income' column is output
|
|
426
|
+
input_list = ["Input"] * (len(column_names) - 1)
|
|
427
|
+
output_list = ["Output"]
|
|
428
|
+
multiindex_list = input_list + output_list
|
|
429
|
+
multiindex = pd.MultiIndex.from_arrays([multiindex_list, column_names])
|
|
430
|
+
return pd.DataFrame(temp_df.to_numpy(), columns=multiindex)
|
|
431
|
+
|
|
432
|
+
return (
|
|
433
|
+
df.pipe(drop_missing)
|
|
434
|
+
.pipe(drop_columns)
|
|
435
|
+
.pipe(encode_categorical)
|
|
436
|
+
.pipe(map_target)
|
|
437
|
+
.pipe(convert_float32)
|
|
438
|
+
.pipe(add_input_output_index)
|
|
439
|
+
)
|