humancompatible-detect 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- humancompatible/__init__.py +0 -0
- humancompatible/detect/__init__.py +13 -0
- humancompatible/detect/binarizer/Binarizer.py +265 -0
- humancompatible/detect/binarizer/__init__.py +3 -0
- humancompatible/detect/data_handler/DataHandler.py +351 -0
- humancompatible/detect/data_handler/__init__.py +3 -0
- humancompatible/detect/data_handler/features/Binary.py +107 -0
- humancompatible/detect/data_handler/features/Categorical.py +157 -0
- humancompatible/detect/data_handler/features/Contiguous.py +107 -0
- humancompatible/detect/data_handler/features/Feature.py +105 -0
- humancompatible/detect/data_handler/features/Mixed.py +147 -0
- humancompatible/detect/data_handler/features/__init__.py +6 -0
- humancompatible/detect/data_handler/features/utils.py +88 -0
- humancompatible/detect/data_handler/types.py +15 -0
- humancompatible/detect/detect_bias.py +280 -0
- humancompatible/detect/evaluate_bias.py +286 -0
- humancompatible/detect/helpers/__init__.py +5 -0
- humancompatible/detect/helpers/prepare.py +105 -0
- humancompatible/detect/helpers/utils.py +364 -0
- humancompatible/detect/methods/__init__.py +0 -0
- humancompatible/detect/methods/l_inf/__init__.py +3 -0
- humancompatible/detect/methods/l_inf/l_inf.py +109 -0
- humancompatible/detect/methods/l_inf/lp_tools.py +53 -0
- humancompatible/detect/methods/msd/__init__.py +4 -0
- humancompatible/detect/methods/msd/mapping_msd.py +112 -0
- humancompatible/detect/methods/msd/metrics_msd.py +34 -0
- humancompatible/detect/methods/msd/msd.py +93 -0
- humancompatible/detect/methods/msd/one_rule.py +324 -0
- humancompatible_detect-0.1.4.dist-info/METADATA +253 -0
- humancompatible_detect-0.1.4.dist-info/RECORD +33 -0
- humancompatible_detect-0.1.4.dist-info/WHEEL +5 -0
- humancompatible_detect-0.1.4.dist-info/licenses/LICENSE +201 -0
- humancompatible_detect-0.1.4.dist-info/top_level.txt +1 -0
|
File without changes
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from .detect_bias import most_biased_subgroup, most_biased_subgroup_csv, most_biased_subgroup_two_samples
|
|
2
|
+
from .evaluate_bias import evaluate_biased_subgroup, evaluate_biased_subgroup_csv, evaluate_biased_subgroup_two_samples
|
|
3
|
+
from .helpers.utils import detect_and_score
|
|
4
|
+
|
|
5
|
+
# Public API of the package: `from humancompatible.detect import *`
# exposes exactly these entry points (re-exported above).
__all__ = [
    "detect_and_score",
    "most_biased_subgroup",
    "most_biased_subgroup_csv",
    "most_biased_subgroup_two_samples",
    "evaluate_biased_subgroup",
    "evaluate_biased_subgroup_csv",
    "evaluate_biased_subgroup_two_samples",
]
|
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from enum import Enum
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from ..data_handler import DataHandler
|
|
9
|
+
from ..data_handler.features import Binary, Categorical, Contiguous, Feature, Mixed
|
|
10
|
+
from ..data_handler.types import CategValue, DataLike, OneDimData
|
|
11
|
+
|
|
12
|
+
BinValue = float | tuple[float, float] | CategValue | list[CategValue] | bool
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Operation(Enum):
    """Element-wise predicate operations used to define binarized conditions.

    The enum *value* is the human-readable symbol used when rendering a
    ``Bin`` as a string (e.g. ``"age <= 30"``).
    """

    EQ = "="
    NE = "!="
    LE = "<="
    LT = "<"
    GE = ">="
    GT = ">"
    IN = "in"
    NOT_IN = "not in"
    BETWEEN = "between"
    OUTSIDE = "outside"

    # TODO move the perform to the Bin class
    @classmethod
    def perform(
        cls, op: Operation, vals: np.ndarray[int | float | str], reference: BinValue
    ) -> np.ndarray[bool]:
        """Apply operation ``op`` element-wise to ``vals``.

        Parameters
        ----------
        op : Operation
            The operation to evaluate.
        vals : np.ndarray
            Array of raw feature values.
        reference : BinValue
            Reference value: a scalar for the comparison operations, a list of
            values for IN/NOT_IN, or a ``(low, high)`` pair for
            BETWEEN/OUTSIDE (low inclusive, high exclusive).

        Returns
        -------
        np.ndarray of bool
            Boolean mask with the same shape as ``vals``.

        Raises
        ------
        NotImplementedError
            If ``op`` is not one of the operations handled below.
        """
        if op == Operation.EQ:
            return vals == reference
        elif op == Operation.NE:
            return vals != reference
        elif op == Operation.LE:
            return vals <= reference
        elif op == Operation.LT:
            return vals < reference
        elif op == Operation.GE:
            return vals >= reference
        elif op == Operation.GT:
            return vals > reference
        elif op == Operation.IN:
            # vectorized membership test; np.isin preserves the shape of
            # `vals`, replacing the former Python-level loop of |= equality masks
            return np.isin(vals, reference)
        elif op == Operation.NOT_IN:
            return ~Operation.perform(Operation.IN, vals, reference)
        elif op == Operation.BETWEEN:
            # lower bound inclusive, upper bound exclusive
            return (vals >= reference[0]) & (vals < reference[1])
        elif op == Operation.OUTSIDE:
            return ~Operation.perform(Operation.BETWEEN, vals, reference)
        else:
            raise NotImplementedError(f"Operation {op} is not implemented")

    @classmethod
    def negated(cls, op) -> Operation:
        """Return the operation expressing the logical complement of ``op``."""
        return {
            Operation.EQ: Operation.NE,
            Operation.NE: Operation.EQ,
            Operation.LE: Operation.GT,
            Operation.LT: Operation.GE,
            Operation.GE: Operation.LT,
            Operation.GT: Operation.LE,
            Operation.IN: Operation.NOT_IN,
            Operation.NOT_IN: Operation.IN,
            Operation.BETWEEN: Operation.OUTSIDE,
            Operation.OUTSIDE: Operation.BETWEEN,
        }[op]
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class Bin:
    """A single binarized condition ``feature <operation> value``.

    Evaluating a Bin on an array of raw feature values yields a boolean mask
    selecting the rows that satisfy the condition.
    """

    def __init__(self, feature: Feature, operation: Operation, value: BinValue):
        self.feature = feature
        self.operation = operation
        self.value = value

    def negate_self(self):
        """Return a new Bin that is the logical negation of this one.

        Binary features are negated by flipping to the other of the two
        values (keeping the equality operation); all other features are
        negated by negating the operation itself.
        """
        if isinstance(self.feature, Binary):
            vals = list(self.feature.value_mapping.keys())
            negated_value = vals[0] if vals[1] == self.value else vals[1]
            return Bin(self.feature, self.operation, negated_value)
        else:
            return Bin(self.feature, Operation.negated(self.operation), self.value)

    def evaluate(self, values: np.ndarray[int | float | str]) -> np.ndarray[bool]:
        """Boolean mask (same shape as ``values``) of entries satisfying this bin."""
        return Operation.perform(self.operation, values, self.value)

    def __repr__(self):
        return f"Bin({repr(self.feature)}, {repr(self.operation)}, {repr(self.value)})"

    def __str__(self):
        return f"{str(self.feature)} {self.operation.value} {str(self.value)}"

    def __eq__(self, other):
        # Guard against comparison with arbitrary objects: previously this
        # raised AttributeError; NotImplemented lets Python fall back to the
        # reflected comparison (and ultimately to False / proper !=).
        if not isinstance(other, Bin):
            return NotImplemented
        return (
            self.feature == other.feature
            and self.operation == other.operation
            and self.value == other.value
        )
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class Binarizer:
    """Handles binarizing the dataset.

    Built on top of a fitted :class:`DataHandler`, this precomputes one list
    of :class:`Bin` conditions per input feature (and a parallel list of
    their negations):

    - Contiguous features: 10 equal-width interval bins over the feature bounds.
    - Binary features: a single equality bin on the value encoded as 1
      (negation: equality on the value encoded as 0).
    - Categorical features: one equality bin per categorical value
      (negations use ``!=``).

    The target feature is reduced to a single positive/negative pair of bins.
    """

    # TODO add specific options for binarization of categoricals (only positive and custom sets) and continuous (custom bins - i.e. quantiles)
    def __init__(
        self,
        data_handler: DataHandler,
        target_positive_vals: list[CategValue] | None = None,
    ):
        # data_handler: describes the features (and target) of the dataset
        # target_positive_vals: for a Categorical target, the values treated
        #   as the "positive" outcome; ignored for a Binary target
        self.__original_dhandler = data_handler

        binarized_features: list[list[Bin]] = []
        binarized_negations: list[list[Bin]] = []
        for feature in data_handler.features:
            if isinstance(feature, Contiguous):
                binarizations = []
                negations = []
                minval, maxval = feature.bounds
                # to make the last bin include the max value
                eps = (maxval - minval) / 10000
                n_bins = 10
                prev = minval
                # equal-width intervals [prev, curr); the eps shift above makes
                # the final interval cover maxval as well
                for curr in np.linspace(minval, maxval + eps, n_bins + 1)[1:]:
                    bounds = (prev, curr)
                    binarizations.append(Bin(feature, Operation.BETWEEN, bounds))
                    negations.append(Bin(feature, Operation.OUTSIDE, bounds))
                    prev = curr
                binarized_features.append(binarizations)
                binarized_negations.append(negations)
            elif isinstance(feature, Mixed):
                raise NotImplementedError("Mixed features are not yet implemented")
            elif isinstance(feature, Binary):
                # invert the value->code mapping so we can name the raw value
                # that encodes to 1 (positive) resp. 0 (negation)
                inv_map = {i: v for v, i in feature.value_mapping.items()}
                binarized_features.append([Bin(feature, Operation.EQ, inv_map[1])])
                binarized_negations.append([Bin(feature, Operation.EQ, inv_map[0])])
            elif isinstance(feature, Categorical):
                # one-vs-rest: one equality bin per observed categorical value
                binarizations = []
                negations = []
                for value in feature.orig_vals:
                    binarizations.append(Bin(feature, Operation.EQ, value))
                    negations.append(Bin(feature, Operation.NE, value))
                binarized_features.append(binarizations)
                binarized_negations.append(negations)
            else:
                raise ValueError("Unsupported feature type")

        # TARGET
        target = data_handler.target_feature
        if isinstance(target, Binary):
            inv_map = {i: v for v, i in target.value_mapping.items()}
            self.binarized_target = Bin(target, Operation.EQ, inv_map[1])
            self.binarized_target_neg = Bin(target, Operation.EQ, inv_map[0])
        elif isinstance(target, Categorical) and (target_positive_vals is not None):
            # positive = membership in target_positive_vals,
            # negative = membership in the complement
            self.binarized_target = Bin(target, Operation.IN, target_positive_vals)
            negative_vals = [
                v for v in target.orig_vals if v not in target_positive_vals
            ]
            self.binarized_target_neg = Bin(target, Operation.IN, negative_vals)
        else:
            raise NotImplementedError(
                "Target feature must be Binary or Categorical with single binarization"
            )

        self.__binarized_features = binarized_features
        self.__binarized_negations = binarized_negations

    def encode(
        self, X: DataLike, include_negations=False, include_binary_negations=False
    ) -> np.ndarray[bool]:
        """Binarize the raw data matrix ``X``.

        Parameters
        ----------
        X : DataLike
            Raw (unencoded) data, shape (num_samples, num_features); columns
            must follow the feature order of the underlying DataHandler.
        include_negations : bool
            Append the negation columns of all features after the regular bins.
        include_binary_negations : bool
            If ``include_negations`` is False, append only the negations of
            Binary features.

        Returns
        -------
        np.ndarray of bool
            Shape (num_samples, num_bins); column order matches
            :meth:`feature_names` with the same flags.
        """
        if isinstance(X, pd.DataFrame):
            X = X.values

        values = []
        # X[:, [i]] keeps a 2-D (num_samples, 1) column so np.hstack below
        # concatenates bins along axis 1
        for i, binariaztions in enumerate(self.__binarized_features):
            for bin in binariaztions:
                values.append(Operation.perform(bin.operation, X[:, [i]], bin.value))
        if include_negations:
            for i, binariaztions in enumerate(self.__binarized_negations):
                for bin in binariaztions:
                    values.append(
                        Operation.perform(bin.operation, X[:, [i]], bin.value)
                    )
        elif include_binary_negations:
            for i, binariaztions in enumerate(self.__binarized_negations):
                for bin in binariaztions:
                    if isinstance(bin.feature, Binary):
                        values.append(
                            Operation.perform(bin.operation, X[:, [i]], bin.value)
                        )
        return np.hstack(values)

    def encode_y(self, y: OneDimData) -> np.ndarray[bool]:
        """Binarize the target values: True where ``y`` matches the positive bin.

        Returns a flat boolean array of shape (num_samples,).
        """
        if isinstance(y, pd.Series):
            y = y.values
        res = Operation.perform(
            self.binarized_target.operation, y, self.binarized_target.value
        )
        return res.flatten()

    def __feature_name_tuples(self, include_negations, include_binary_negations):
        # (feature, operation, value) string triples, in the same column order
        # that encode() produces with the same flags
        names = []
        if include_negations:
            feats = self.__binarized_features + self.__binarized_negations
        else:
            feats = [f for f in self.__binarized_features]
            if include_binary_negations:
                for binarization in self.__binarized_negations:
                    if isinstance(binarization[0].feature, Binary):
                        feats.append(binarization)
        for binarization in feats:
            for bin in binarization:
                names.append((bin.feature.name, bin.operation.value, str(bin.value)))
        return names

    def feature_names(
        self, include_negations=False, include_binary_negations=False
    ) -> list[str]:
        """Human-readable ``"feature op value"`` names, one per binarized column."""
        return [
            f"{feat} {op} {val}"
            for (feat, op, val) in self.__feature_name_tuples(
                include_negations, include_binary_negations
            )
        ]

    def target_name(self) -> tuple[str, str]:
        """Return (positive, negative) human-readable descriptions of the target bins."""
        bin = self.binarized_target
        positive = f"{bin.feature} {bin.operation.value} {bin.value}"
        bin = self.binarized_target_neg
        negative = f"{bin.feature} {bin.operation.value} {bin.value}"
        return positive, negative

    def multi_index_feats(
        self, include_negations=False, include_binary_negations=False
    ) -> pd.MultiIndex:
        """Column names as a pandas MultiIndex with levels (feature, operation, value)."""
        return pd.MultiIndex.from_tuples(
            self.__feature_name_tuples(include_negations, include_binary_negations),
            names=["feature", "operation", "value"],
        )

    def get_bin_encodings(
        self, include_negations=False, include_binary_negations=False, return_flat=True
    ):
        """Return the :class:`Bin` objects backing the binarized columns.

        With ``return_flat=True`` (default) a flat list in column order is
        returned; otherwise a list of per-feature lists of bins.
        """
        if include_negations:
            feats = self.__binarized_features + self.__binarized_negations
        else:
            feats = [f for f in self.__binarized_features]
            if include_binary_negations:
                for binarization in self.__binarized_negations:
                    if isinstance(binarization[0].feature, Binary):
                        feats.append(binarization)
        if not return_flat:
            return feats
        flat = []
        for binariaztions in feats:
            for bin in binariaztions:
                flat.append(bin)
        return flat

    @property
    def data_handler(self):
        # The original (non-binarized) DataHandler this binarizer was built from.
        return self.__original_dhandler
|
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
from .features import (
|
|
7
|
+
Binary,
|
|
8
|
+
Categorical,
|
|
9
|
+
Contiguous,
|
|
10
|
+
Feature,
|
|
11
|
+
Monotonicity,
|
|
12
|
+
make_feature,
|
|
13
|
+
)
|
|
14
|
+
from .types import CategValue, DataLike, FeatureID, OneDimData
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class DataHandler:
    """
    Performs all data processing from a pandas DataFrame/numpy array to a normalized and encoded input

    Expected use is to initialize this with training data and then use it to encode all data.
    Supports mixed encoding, where only some values are categorical
    Normalizes contiguous data to [0, 1] range
    Produces either one-hot encoded data or direct data with mapped categorical data to negative integers
    """

    def __init__(
        self,
        features: list[Feature],
        target: Feature | None = None,
        causal_inc: list[tuple[Feature, Feature]] | None = None,
        greater_than: list[tuple[Feature, Feature]] | None = None,
    ):
        # features: already-constructed Feature objects, one per input column
        # target: optional target Feature
        # causal_inc: (cause, effect) pairs - increasing the cause requires
        #   increasing the effect
        # greater_than: (greater, smaller) pairs that must hold after a change
        self.__input_features = features
        self.__target_feature = target
        self.__causal_inc = causal_inc if causal_inc is not None else []
        self.__greater_than = greater_than if greater_than is not None else []

    @classmethod
    def from_data(
        cls,
        X: DataLike,
        y: OneDimData | None = None,
        categ_map: dict[FeatureID, list[CategValue]] | None = None,
        ordered: list[FeatureID] | None = None,
        bounds_map: dict[FeatureID, tuple[int, int]] | None = None,
        discrete: list[FeatureID] | None = None,
        immutable: list[FeatureID] | None = None,
        monotonicity: dict[FeatureID, Monotonicity] | None = None,
        # TODO more general causality
        causal_inc: list[tuple[FeatureID, FeatureID]] | None = None,
        greater_than: list[tuple[FeatureID, FeatureID]] | None = None,
        regression: bool = False,
        feature_names: list[str] | None = None,
        target_name: str | None = None,
    ) -> DataHandler:
        """
        Construct a DataHandler instance from raw data.

        Parameters:
        -----------
        X : array-like (2 dimensional)
            Input features. Shape: (num_samples, num_features)
        y : array-like (1 dimensional)
            Target feature (e.g., labels or regression targets). Shape: (num_samples,)
        categ_map : dictionary
            Dictionary with indices (or column names for DataFrame) of categorical features as keys
            and a list of unique categorical values as values.

            If the list is empty, each unique value of the feature is considered categorical
            If the list is non-empty, but does not cover all values, the feature is considered mixed
        ordered : list
            Feature IDs whose categorical values are ordered.
        bounds_map : dictionary
            Mapping from feature ID to (lower, upper) bounds for contiguous features.
        discrete : list
            Feature IDs taking only discrete values.
        immutable : list
            Feature IDs that must not be modified.
        monotonicity : dictionary
            Mapping from feature ID to the allowed direction of change.
        causal_inc : list of pairs
            (cause, effect) feature ID pairs; increasing the cause requires increasing the effect.
        greater_than : list of pairs
            (greater, smaller) feature ID pairs that must satisfy greater >= smaller.
        regression : bool
            True if the task is regression, False if y is categorical and task is classification.
        feature_names : optional list of strings
            List of feature names, if None it is recovered from column names if X is a DataFrame
        target_name : optional string
            Name of the target feature, if None it is recovered from y if y is a pandas Series
        """
        # Normalize optional collections; None sentinels avoid the
        # shared-mutable-default-argument pitfall of `={}` / `=[]`.
        categ_map = {} if categ_map is None else categ_map
        ordered = [] if ordered is None else ordered
        bounds_map = {} if bounds_map is None else bounds_map
        discrete = [] if discrete is None else discrete
        immutable = [] if immutable is None else immutable
        monotonicity = {} if monotonicity is None else monotonicity
        causal_inc = [] if causal_inc is None else causal_inc
        greater_than = [] if greater_than is None else greater_than

        if isinstance(X, pd.DataFrame):
            if feature_names is None:
                # materialize as a plain list: pd.Index has no list-style
                # .index() method, which the ID->Feature resolution below needs
                feature_names = list(X.columns)
            if target_name is not None and y is None:
                print("Taking target values from the X matrix")
                y = X[target_name]
                X = X.drop(columns=target_name)
            X = X.to_numpy()

        if y is not None:
            if target_name is None:
                if isinstance(y, pd.Series):
                    target_name = y.name
                else:
                    target_name = "target"

            if regression:
                target_feature = Contiguous(y, target_name)
            else:
                # classification: Binary for <=2 distinct labels, else Categorical
                if len(np.unique(y)) > 2:
                    target_feature = Categorical(y, name=target_name)
                else:
                    target_feature = Binary(y, name=target_name)
                    # TODO make the target values specifiable
        else:
            target_feature = None

        n_features = X.shape[1]
        if feature_names is None:
            feature_names = [None] * n_features
        if len(feature_names) != n_features:
            raise ValueError("Incorrect length of list of feature names.")

        input_features: list[Feature] = []
        # stores lists of categorical values of applicable features, used for mapping to integer values
        for feat_i, feat_name in enumerate(feature_names):
            input_features.append(
                make_feature(
                    X[:, feat_i],
                    feat_name,
                    categ_map.get(feat_name, None),
                    bounds_map.get(feat_name, None),
                    feat_name in ordered,
                    feat_name in discrete,
                    monotone=monotonicity.get(feat_name, Monotonicity.NONE),
                    modifiable=feat_name not in immutable,
                )
            )

        # resolve feature IDs to the constructed Feature objects
        causal_inc = [
            (
                input_features[feature_names.index(i)],
                input_features[feature_names.index(j)],
            )
            for i, j in causal_inc
        ]
        greater_than = [
            (
                input_features[feature_names.index(i)],
                input_features[feature_names.index(j)],
            )
            for i, j in greater_than
        ]
        return DataHandler(input_features, target_feature, causal_inc, greater_than)

    @property
    def causal_inc(self) -> list[tuple[Feature, Feature]]:
        """(cause, effect) Feature pairs with a causal-increase constraint"""
        return self.__causal_inc

    @property
    def greater_than(self) -> list[tuple[Feature, Feature]]:
        """(greater, smaller) Feature pairs with an ordering constraint"""
        return self.__greater_than

    @property
    def n_features(self) -> int:
        """Number of features in the input space"""
        return len(self.__input_features)

    @property
    def features(self) -> list[Feature]:
        """List of input features"""
        return self.__input_features

    @property
    def target_feature(self) -> Feature:
        """Target feature"""
        return self.__target_feature

    @property
    def feature_names(self) -> list[str]:
        """List of feature names"""
        return [f.name for f in self.__input_features]

    def encode(
        self, X: DataLike, normalize: bool = True, one_hot: bool = True
    ) -> np.ndarray[np.float64]:
        """
        Encode input features.

        Parameters:
        -----------
        X : array-like
            Input features (data matrix or DataFrame). Shape: (num_samples, num_features)
        normalize : bool, optional
            Whether to normalize the features (default is True).
        one_hot : bool, optional
            Whether to perform one-hot encoding for categorical values (default is True).

        Returns:
        --------
        encoded_X : numpy array
            Encoded input features. Shape: (num_samples, one_hot_features) when one hot encoding is performed, (num_samples, num_features) otherwise
        """
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        if isinstance(X, pd.Series):
            X = X.to_numpy()

        # a single sample: encode it as a 1-row matrix, return the only row
        if len(X.shape) == 1:
            Xmat = X.reshape(1, -1)
            return self.encode(Xmat, normalize=normalize, one_hot=one_hot)[0]

        enc = []
        for feat_i, feature in enumerate(self.__input_features):
            enc.append(
                feature.encode(X[:, feat_i], normalize, one_hot).reshape(X.shape[0], -1)
            )

        return np.concatenate(enc, axis=1).astype(np.float64)

    def encode_y(
        self, y: OneDimData, normalize: bool = True, one_hot: bool = True
    ) -> np.ndarray[np.float64]:
        """
        Encode target feature.

        Parameters:
        -----------
        y : array-like
            Target feature (data matrix or DataFrame of labels or regression targets). Shape: (num_samples,)
        normalize : bool, optional
            Whether to normalize the features (default is True).
        one_hot : bool, optional
            Whether to perform one-hot encoding for categorical values (default is True).

        Returns:
        --------
        encoded_y : numpy array
            Encoded target feature. Shape: (num_samples, num_values) for one hot encoding or (num_samples,) otherwise
        """
        return self.__target_feature.encode(y, normalize, one_hot)

    def encode_all(self, X_all: np.ndarray, normalize: bool, one_hot: bool):
        """Encode a matrix whose last column is the target; returns X and y encodings concatenated."""
        return np.concatenate(
            [
                self.encode(X_all[:, :-1], normalize, one_hot),
                self.encode_y(X_all[:, -1], normalize, one_hot).reshape(-1, 1),
            ],
            axis=1,
        )

    def decode(
        self,
        X: np.ndarray[np.float64],
        denormalize: bool = True,
        encoded_one_hot: bool = True,
        as_dataframe: bool = True,
    ) -> np.ndarray[np.float64]:
        """
        Decode input features.

        Parameters:
        -----------
        X : array-like
            Input data matrix. Shape: (num_samples, num_enc_features)
            where num_enc_features can be higher than num_features, because of one-hot encoding
        denormalize : bool, optional
            Whether to invert the normalization of the features (default is True).
        encoded_one_hot : bool, optional
            Whether the input matrix is one-hot encoded (default is True).
        as_dataframe : bool, optional
            Whether to return a pandas DataFrame or numpy array (default is True - DataFrame).

        Returns:
        --------
        decoded_X : numpy array
            Decoded features in the original format. Shape: (num_samples, num_features)
        """
        if X.shape[0] == 0:
            # empty input: preserve the column structure in the empty result
            if as_dataframe:
                return pd.DataFrame([], columns=[f.name for f in self.__input_features])
            return np.empty((0, self.n_features))
        dec = []
        curr_col = 0
        # walk the encoded columns, slicing out each feature's encoding width
        for feature in self.__input_features:
            w = feature.encoding_width(encoded_one_hot)
            dec.append(
                feature.decode(X[:, curr_col : curr_col + w], denormalize, as_dataframe)
            )
            curr_col += w
        if as_dataframe:
            return pd.concat(dec, axis=1)
        return np.concatenate([x.reshape(X.shape[0], -1) for x in dec], axis=1)

    def decode_y(
        self,
        y: np.ndarray[np.float64],
        denormalize: bool = True,
        as_series: bool = True,
    ) -> np.ndarray[np.float64]:
        """
        Decode target feature.

        Parameters:
        -----------
        y : array-like
            Target feature data. Shape: (num_samples,) for general case
            or (num_samples, num_categorical_values) in case of one-hot encoding
        denormalize : bool, optional
            Whether to invert the normalization of the feature (default is True).
        as_series : bool, optional
            Whether to return a pandas Series or numpy array (default is True - Series).

        Returns:
        --------
        decoded_y : numpy array
            Decoded target feature data. Shape: (num_samples,)
        """
        return self.__target_feature.decode(y, denormalize, as_series)

    def encoding_width(self, one_hot: bool) -> int:
        """Total number of encoded columns across all input features."""
        return sum([f.encoding_width(one_hot) for f in self.__input_features])

    def allowed_changes(self, pre_vals, post_vals):
        """
        Check whether the change from ``pre_vals`` to ``post_vals`` respects all constraints.

        Checks per-feature modifiability/monotonicity, the causal-increase
        pairs, and the greater-than pairs. Returns True iff all constraints hold.
        """
        for f, pre, pos in zip(self.features, pre_vals, post_vals):
            if not f.allowed_change(pre, pos):
                return False

        for cause, effect in self.__causal_inc:
            cause_i = self.features.index(cause)
            pre_cause = cause.encode(pre_vals[cause_i], normalize=False, one_hot=False)
            pos_cause = cause.encode(post_vals[cause_i], normalize=False, one_hot=False)
            if isinstance(cause, Categorical):
                applied = pos_cause in cause.greater_than(pre_cause)
            elif isinstance(cause, Contiguous):
                applied = pos_cause > pre_cause
            else:
                raise ValueError("invalid feature type")
            if applied:
                # the cause increased, so the effect must have increased too
                effect_i = self.features.index(effect)
                pre_effect = effect.encode(
                    pre_vals[effect_i], normalize=False, one_hot=False
                )
                pos_effect = effect.encode(
                    post_vals[effect_i], normalize=False, one_hot=False
                )
                if isinstance(effect, Categorical):
                    if pos_effect not in effect.greater_than(pre_effect):
                        return False
                elif isinstance(effect, Contiguous):
                    if pos_effect <= pre_effect:
                        return False
                else:
                    raise ValueError("invalid feature type")

        for greater, smaller in self.__greater_than:
            if (
                post_vals[self.features.index(smaller)]
                > post_vals[self.features.index(greater)]
            ):
                return False
        return True
|
|
350
|
+
|
|
351
|
+
# TODO: a further layer on top - a data wrapper that remembers whether one-hot encoding, normalization, etc. were applied
|