congrads 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- congrads/__init__.py +7 -6
- congrads/constraints.py +182 -300
- congrads/core.py +158 -144
- congrads/datasets.py +12 -559
- congrads/descriptor.py +20 -35
- congrads/metrics.py +37 -52
- congrads/networks.py +5 -6
- congrads/utils.py +310 -0
- congrads-0.2.0.dist-info/LICENSE +26 -0
- congrads-0.2.0.dist-info/METADATA +222 -0
- congrads-0.2.0.dist-info/RECORD +13 -0
- congrads/learners.py +0 -233
- congrads-0.1.0.dist-info/LICENSE +0 -34
- congrads-0.1.0.dist-info/METADATA +0 -196
- congrads-0.1.0.dist-info/RECORD +0 -13
- {congrads-0.1.0.dist-info → congrads-0.2.0.dist-info}/WHEEL +0 -0
- {congrads-0.1.0.dist-info → congrads-0.2.0.dist-info}/top_level.txt +0 -0
congrads/descriptor.py
CHANGED
@@ -1,33 +1,14 @@
 class Descriptor:
-    """
-    A class to manage the mapping of neurons to layers and their properties
-    (e.g., output, constant, or variable) in a neural network.
-
-    This class enables the organization and description of network elements,
-    such as associating neurons with specific layers and categorizing layers
-    as outputs, constants, or variables.
-
-    This allows users to easily place constraints on parts of the network by
-    referencing the name that is configured in this class.
-    """
+    # TODO regenerate documentation

     def __init__(
         self,
     ):
-        """
-        Initialize the Descriptor class with empty mappings for neurons and layers.
-
-        This includes:
-        - `neuron_to_layer`: A dictionary mapping neuron names to their corresponding layer names.
-        - `neuron_to_index`: A dictionary mapping neuron names to their corresponding index within a layer.
-        - `output_layers`: A set that holds the names of layers marked as output layers.
-        - `constant_layers`: A set that holds the names of layers marked as constant layers.
-        - `variable_layers`: A set that holds the names of layers marked as variable layers.
-        """

         # Define dictionaries that will translate neuron names to layer and index
         self.neuron_to_layer: dict[str, str] = {}
         self.neuron_to_index: dict[str, int] = {}
+        self.neuron_to_minmax: dict[str, tuple[float, float]] = {}

         # Define sets that will hold the layers based on which type
         self.output_layers: set[str] = set()
@@ -37,20 +18,13 @@ class Descriptor:
     def add(
         self,
         layer_name: str,
-        neuron_names: list[str],
+        index: int,
+        neuron_name: str,
+        min: float = 0,
+        max: float = 1,
         output: bool = False,
         constant: bool = False,
     ):
-        """
-        Add a layer to the descriptor, associating it with neurons and marking it
-        as an output or constant layer.
-
-        Args:
-            layer_name (str): The name of the layer to be added.
-            neuron_names (list[str]): A list of neuron names that belong to the layer.
-            output (bool, optional): If True, mark this layer as an output layer. Defaults to False.
-            constant (bool, optional): If True, mark this layer as a constant layer. Defaults to False.
-        """

         if output:
             self.output_layers.add(layer_name)
@@ -60,6 +34,17 @@ class Descriptor:
         else:
             self.variable_layers.add(layer_name)

-
-
-
+        self.neuron_to_layer[neuron_name] = layer_name
+        self.neuron_to_index[neuron_name] = index
+
+        if min != None and max == None:
+            raise ValueError(
+                f"The min parameter was set without setting the max parameter. Either set both or set none."
+            )
+
+        if max != None and min == None:
+            raise ValueError(
+                f"The max parameter was set without setting the min parameter. Either set both or set none."
+            )
+
+        self.neuron_to_minmax[neuron_name] = (min, max)
congrads/metrics.py
CHANGED
@@ -1,64 +1,49 @@
-from
-from
+from typing import Callable
+from torch import Tensor, mean, cat
+from torch.utils.tensorboard import SummaryWriter

-# NOTE

+class Metric:
+    def __init__(
+        self, name: str, accumulator: Callable[..., Tensor] = mean, device=None
+    ) -> None:
+        self.name = name
+        self.accumulator = accumulator
+        self.device = device

-
-
-    A custom metric to calculate the ratio of satisfied constraints in a neural network model.
-    It computes the proportion of constraints that have been satisfied,
-    where satisfaction is determined based on the provided constraint results.
+        self.values = []
+        self.sample_count = 0

-
-
-
+    def accumulate(self, value: Tensor) -> None:
+        self.values.append(value)
+        self.sample_count += value.size(0)

-
-
-
+    def aggregate(self) -> Tensor:
+        combined = cat(self.values)
+        return self.accumulator(combined)

-
-
-
-    """
+    def reset(self) -> None:
+        self.values = []
+        self.sample_count = 0

-    def __init__(self, **kwargs):
-        """
-        Initializes the ConstraintSatisfactionRatio metric by setting up the
-        state variables to track the number of unsatisfied and total constraints.

-
-
-
+class MetricManager:
+    def __init__(self, writer: SummaryWriter, device: str) -> None:
+        self.writer = writer
+        self.device = device
+        self.metrics: dict[str, Metric] = {}

-
-
+    def register(self, name: str, accumulator: Callable[..., Tensor] = mean) -> None:
+        self.metrics[name] = Metric(name, accumulator, self.device)

-
-        self.
-        self.add_state("total", default=tensor(0), dist_reduce_fx="sum")
+    def accumulate(self, name: str, value: Tensor) -> None:
+        self.metrics[name].accumulate(value)

-    def
-
-
+    def record(self, epoch: int) -> None:
+        for name, metric in self.metrics.items():
+            result = metric.aggregate()
+            self.writer.add_scalar(name, result.item(), epoch)

-
-
-
-        element indicates whether a constraint
-        is satisfied (e.g., 0 for satisfied,
-        1 for unsatisfied).
-        """
-        self.unsatisfied += sum(constraint_result)
-        self.total += numel(constraint_result)
-
-    def compute(self) -> Tensor:
-        """
-        Computes the constraint satisfaction ratio, defined as:
-        1 - (number of unsatisfied constraints / total constraints).
-
-        Returns:
-            Tensor: The satisfaction ratio as a scalar tensor.
-        """
-        return 1 - (self.unsatisfied.float() / self.total)
+    def reset(self) -> None:
+        for metric in self.metrics.values():
+            metric.reset()
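
The torchmetrics-style ConstraintSatisfactionRatio (whose removed lines are partly truncated in this rendering) is replaced by a lightweight Metric/MetricManager pair that buffers per-batch tensors and logs aggregates to TensorBoard. A usage sketch inferred from the new API above; the metric name, device string, and random stand-in batches are assumptions, not package documentation:

    import torch
    from torch.utils.tensorboard import SummaryWriter
    from congrads.metrics import MetricManager

    writer = SummaryWriter()
    manager = MetricManager(writer, device="cpu")
    manager.register("Loss/train", accumulator=torch.mean)

    for epoch in range(2):
        for batch_values in (torch.rand(32), torch.rand(32)):  # stand-in per-sample values
            manager.accumulate("Loss/train", batch_values)
        manager.record(epoch)  # cat()s the buffered tensors, applies the accumulator, logs one scalar
        manager.reset()        # record() does not clear the buffers, so reset between epochs
    writer.close()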
congrads/networks.py
CHANGED
@@ -24,9 +24,9 @@ class MLPNetwork(Module):

     def __init__(
         self,
-        n_inputs
-        n_outputs
-        n_hidden_layers=
+        n_inputs,
+        n_outputs,
+        n_hidden_layers=3,
         hidden_dim=35,
     ):
         """
@@ -47,7 +47,7 @@ class MLPNetwork(Module):
         self.hidden_dim = hidden_dim

         # Set up the components of our model
-        self.input =
+        self.input = Linear(self.n_inputs, self.hidden_dim)
         self.hidden = Sequential(
             *(
                 self.linear(self.hidden_dim, self.hidden_dim)
@@ -67,10 +67,9 @@ class MLPNetwork(Module):
             dict: A dictionary containing the 'input' (original input) and
                 'output' (predicted output) of the network.
         """
-        input = X
         output = self.out(self.hidden(self.input(X)))

-        return {"input":
+        return {"input": X, "output": output}

     @staticmethod
     def linear(in_features, out_features):
congrads/utils.py
ADDED
@@ -0,0 +1,310 @@
+import pandas as pd
+import numpy as np
+from torch.utils.data import Dataset, random_split, DataLoader
+
+
+def splitDataLoaders(
+    data: Dataset,
+    loader_args: dict = None,
+    train_loader_args: dict = None,
+    valid_loader_args: dict = None,
+    test_loader_args: dict = None,
+    train_size: float = 0.8,
+    valid_size: float = 0.1,
+    test_size: float = 0.1,
+) -> tuple[DataLoader, DataLoader, DataLoader]:
+
+    # Validate split sizes
+    if not (0 < train_size < 1 and 0 < valid_size < 1 and 0 < test_size < 1):
+        raise ValueError(
+            "train_size, valid_size, and test_size must be between 0 and 1."
+        )
+    if not abs(train_size + valid_size + test_size - 1.0) < 1e-6:
+        raise ValueError("train_size, valid_size, and test_size must sum to 1.")
+
+    # Perform the splits
+    train_val_data, test_data = random_split(data, [1 - test_size, test_size])
+    train_data, valid_data = random_split(
+        train_val_data,
+        [
+            train_size / (1 - test_size),
+            valid_size / (1 - test_size),
+        ],
+    )
+
+    # Set default arguments for each loader
+    train_loader_args = train_loader_args or loader_args or {}
+    valid_loader_args = valid_loader_args or loader_args or {}
+    test_loader_args = test_loader_args or loader_args or {}
+
+    # Create the DataLoaders
+    train_generator = DataLoader(train_data, **train_loader_args)
+    valid_generator = DataLoader(valid_data, **valid_loader_args)
+    test_generator = DataLoader(test_data, **test_loader_args)
+
+    return train_generator, valid_generator, test_generator
+
+
+def preprocess_BiasCorrection(df: pd.DataFrame) -> pd.DataFrame:
+
+    def date_to_datetime(df: pd.DataFrame) -> pd.DataFrame:
+        """Transform the string that denotes the date to the datetime format in pandas."""
+        # make copy of dataframe
+        df_temp = df.copy()
+        # add new column at the front where the date string is transformed to the datetime format
+        df_temp.insert(0, "DateTransformed", pd.to_datetime(df_temp["Date"]))
+        return df_temp
+
+    def add_year(df: pd.DataFrame) -> pd.DataFrame:
+        """Extract the year from the datetime cell and add it as a new column to the dataframe at the front."""
+        # make copy of dataframe
+        df_temp = df.copy()
+        # extract year and add new column at the front containing these numbers
+        df_temp.insert(0, "Year", df_temp["DateTransformed"].dt.year)
+        return df_temp
+
+    def add_month(df: pd.DataFrame) -> pd.DataFrame:
+        """Extract the month from the datetime cell and add it as a new column to the dataframe at the front."""
+        # make copy of dataframe
+        df_temp = df.copy()
+        # extract month and add new column at index 1 containing these numbers
+        df_temp.insert(1, "Month", df_temp["DateTransformed"].dt.month)
+        return df_temp
+
+    def add_day(df: pd.DataFrame) -> pd.DataFrame:
+        """Extract the day from the datetime cell and add it as a new column to the dataframe at the front."""
+        # make copy of dataframe
+        df_temp = df.copy()
+        # extract day and add new column at index 2 containing these numbers
+        df_temp.insert(2, "Day", df_temp["DateTransformed"].dt.day)
+        return df_temp
+
+    def add_input_output_temperature(df: pd.DataFrame) -> pd.DataFrame:
+        """Add a multiindex denoting if the column is an input or output variable."""
+        # copy the dataframe
+        temp_df = df.copy()
+        # extract all the column names
+        column_names = temp_df.columns.tolist()
+        # only the last 2 columns are output variables, all others are input variables. So make list of corresponding lengths of 'Input' and 'Output'
+        input_list = ["Input"] * (len(column_names) - 2)
+        output_list = ["Output"] * 2
+        # concat both lists
+        input_output_list = input_list + output_list
+        # define multi index for attaching this 'Input' and 'Output' list with the column names already existing
+        multiindex_bias = pd.MultiIndex.from_arrays([input_output_list, column_names])
+        # transpose such that index can be adjusted to multi index
+        new_df = pd.DataFrame(df.transpose().to_numpy(), index=multiindex_bias)
+        # transpose back such that columns are the same as before except with different labels
+        return new_df.transpose()
+
+    def normalize_columns_bias(df: pd.DataFrame) -> pd.DataFrame:
+        """Normalize the columns for the bias correction dataset. This is different from normalizing all the columns separately because the
+        upper and lower bounds for the output variables are assumed to be the same."""
+        # copy the dataframe
+        temp_df = df.copy()
+        # normalize each column
+        for feature_name in df.columns:
+            # the output columns are normalized using the same upper and lower bound for more efficient check of the inequality
+            if feature_name == "Next_Tmax" or feature_name == "Next_Tmin":
+                max_value = 38.9
+                min_value = 11.3
+            # the input columns are normalized using their respective upper and lower bounds
+            else:
+                max_value = df[feature_name].max()
+                min_value = df[feature_name].min()
+            temp_df[feature_name] = (df[feature_name] - min_value) / (
+                max_value - min_value
+            )
+        return temp_df
+
+    def sample_2500_examples(df: pd.DataFrame) -> pd.DataFrame:
+        """Sample 2500 examples from the dataframe without replacement."""
+        temp_df = df.copy()
+        sample_df = temp_df.sample(n=2500, replace=False, random_state=3, axis=0)
+        return sample_df
+
+    return (
+        # drop missing values
+        df.dropna(how="any")
+        # transform string date to datetime format
+        .pipe(date_to_datetime)
+        # add year as a single column
+        .pipe(add_year)
+        # add month as a single column
+        .pipe(add_month)
+        # add day as a single column
+        .pipe(add_day)
+        # remove original date string and the datetime format
+        .drop(["Date", "DateTransformed"], axis=1, inplace=False)
+        # convert all numbers to float32
+        .astype("float32")
+        # normalize columns
+        .pipe(normalize_columns_bias)
+        # add multi index indicating which columns are corresponding to input and output variables
+        .pipe(add_input_output_temperature)
+        # sample 2500 examples out of the dataset
+        .pipe(sample_2500_examples)
+    )
+
+
+def preprocess_FiniteIncome(df: pd.DataFrame) -> pd.DataFrame:
+
+    def normalize_columns_income(df: pd.DataFrame) -> pd.DataFrame:
+        """Normalize the columns for the Family Income dataframe. This can also be applied to other dataframes because this function normalizes
+        all columns individually."""
+        # copy the dataframe
+        temp_df = df.copy()
+        # normalize each column
+        for feature_name in df.columns:
+            max_value = df[feature_name].max()
+            min_value = df[feature_name].min()
+            temp_df[feature_name] = (df[feature_name] - min_value) / (
+                max_value - min_value
+            )
+        return temp_df
+
+    def check_constraints_income(df: pd.DataFrame) -> pd.DataFrame:
+        """Check if all the constraints are satisfied for the dataframe and remove the examples that do not satisfy the constraint. This
+        function only works for the Family Income dataset and the constraints are that the household income is larger than all the expenses
+        and the food expense is larger than the sum of the other (more detailed) food expenses.
+        """
+        temp_df = df.copy()
+        # check that household income is larger than expenses in the output
+        input_array = temp_df["Input"].to_numpy()
+        income_array = np.add(
+            np.multiply(
+                input_array[:, [0, 1]],
+                np.subtract(np.asarray([11815988, 9234485]), np.asarray([11285, 0])),
+            ),
+            np.asarray([11285, 0]),
+        )
+        expense_array = temp_df["Output"].to_numpy()
+        expense_array = np.add(
+            np.multiply(
+                expense_array,
+                np.subtract(
+                    np.asarray(
+                        [
+                            791848,
+                            437467,
+                            140992,
+                            74800,
+                            2188560,
+                            1049275,
+                            149940,
+                            731000,
+                        ]
+                    ),
+                    np.asarray([3704, 0, 0, 0, 1950, 0, 0, 0]),
+                ),
+            ),
+            np.asarray([3704, 0, 0, 0, 1950, 0, 0, 0]),
+        )
+        expense_array_without_dup = expense_array[:, [0, 4, 5, 6, 7]]
+        sum_expenses = np.sum(expense_array_without_dup, axis=1)
+        total_income = np.sum(income_array, axis=1)
+        sanity_check_array = np.greater_equal(total_income, sum_expenses)
+        temp_df["Unimportant"] = sanity_check_array.tolist()
+        reduction = temp_df[temp_df.Unimportant]
+        drop_reduction = reduction.drop("Unimportant", axis=1)
+
+        # check that the food expense is larger than all the sub expenses
+        expense_reduced_array = drop_reduction["Output"].to_numpy()
+        expense_reduced_array = np.add(
+            np.multiply(
+                expense_reduced_array,
+                np.subtract(
+                    np.asarray(
+                        [
+                            791848,
+                            437467,
+                            140992,
+                            74800,
+                            2188560,
+                            1049275,
+                            149940,
+                            731000,
+                        ]
+                    ),
+                    np.asarray([3704, 0, 0, 0, 1950, 0, 0, 0]),
+                ),
+            ),
+            np.asarray([3704, 0, 0, 0, 1950, 0, 0, 0]),
+        )
+        food_mul_expense_array = expense_reduced_array[:, [1, 2, 3]]
+        food_mul_expense_array_sum = np.sum(food_mul_expense_array, axis=1)
+        food_expense_array = expense_reduced_array[:, 0]
+        sanity_check_array = np.greater_equal(
+            food_expense_array, food_mul_expense_array_sum
+        )
+        drop_reduction["Unimportant"] = sanity_check_array.tolist()
+        new_reduction = drop_reduction[drop_reduction.Unimportant]
+        satisfied_constraints_df = new_reduction.drop("Unimportant", axis=1)
+
+        return satisfied_constraints_df
+
+    def add_input_output_family_income(df: pd.DataFrame) -> pd.DataFrame:
+        """Add a multiindex denoting if the column is an input or output variable."""
+        # copy the dataframe
+        temp_df = df.copy()
+        # extract all the column names
+        column_names = temp_df.columns.tolist()
+        # the 2nd-9th columns correspond to output variables and all others to input variables. So make list of corresponding lengths of 'Input' and 'Output'
+        input_list_start = ["Input"]
+        input_list_end = ["Input"] * (len(column_names) - 9)
+        output_list = ["Output"] * 8
+        # concat both lists
+        input_output_list = input_list_start + output_list + input_list_end
+        # define multi index for attaching this 'Input' and 'Output' list with the column names already existing
+        multiindex_bias = pd.MultiIndex.from_arrays([input_output_list, column_names])
+        # transpose such that index can be adjusted to multi index
+        new_df = pd.DataFrame(df.transpose().to_numpy(), index=multiindex_bias)
+        # transpose back such that columns are the same as before except with different labels
+        return new_df.transpose()
+
+    def sample_2500_examples(df: pd.DataFrame) -> pd.DataFrame:
+        """Sample 2500 examples from the dataframe without replacement."""
+        temp_df = df.copy()
+        sample_df = temp_df.sample(n=2500, replace=False, random_state=3, axis=0)
+        return sample_df
+
+    return (
+        # drop missing values
+        df.dropna(how="any")
+        # convert object to fitting dtype
+        .convert_dtypes()
+        # remove all strings (no other dtypes are present except for integers and floats)
+        .select_dtypes(exclude=["string"])
+        # transform all numbers to same dtype
+        .astype("float32")
+        # drop column with label Agricultural Household indicator because this is not really a numerical input but rather a categorical/classification
+        .drop(["Agricultural Household indicator"], axis=1, inplace=False)
+        # this column is dropped because it depends on Agricultural Household indicator
+        .drop(["Crop Farming and Gardening expenses"], axis=1, inplace=False)
+        # use 8 output variables and 24 input variables
+        .drop(
+            [
+                "Total Rice Expenditure",
+                "Total Fish and marine products Expenditure",
+                "Fruit Expenditure",
+                "Restaurant and hotels Expenditure",
+                "Alcoholic Beverages Expenditure",
+                "Tobacco Expenditure",
+                "Clothing, Footwear and Other Wear Expenditure",
+                "Imputed House Rental Value",
+                "Transportation Expenditure",
+                "Miscellaneous Goods and Services Expenditure",
+                "Special Occasions Expenditure",
+            ],
+            axis=1,
+            inplace=False,
+        )
+        # add input and output labels to each column
+        .pipe(add_input_output_family_income)
+        # normalize all the columns
+        .pipe(normalize_columns_income)
+        # remove all datapoints that do not satisfy the constraints
+        .pipe(check_constraints_income)
+        # sample 2500 examples
+        .pipe(sample_2500_examples)
+    )
congrads-0.2.0.dist-info/LICENSE
ADDED
@@ -0,0 +1,26 @@
+Copyright 2024 DTAI - KU Leuven
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+   this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS”
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.