psyke-0.4.9.dev6-py3-none-any.whl → psyke-1.0.4.dev10-py3-none-any.whl
This diff covers two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
- psyke/__init__.py +231 -85
- psyke/clustering/__init__.py +9 -4
- psyke/clustering/cream/__init__.py +6 -10
- psyke/clustering/exact/__init__.py +17 -11
- psyke/clustering/utils.py +0 -1
- psyke/extraction/__init__.py +25 -0
- psyke/extraction/cart/CartPredictor.py +128 -0
- psyke/extraction/cart/FairTree.py +205 -0
- psyke/extraction/cart/FairTreePredictor.py +56 -0
- psyke/extraction/cart/__init__.py +48 -62
- psyke/extraction/hypercubic/__init__.py +187 -47
- psyke/extraction/hypercubic/cosmik/__init__.py +47 -0
- psyke/extraction/hypercubic/creepy/__init__.py +24 -29
- psyke/extraction/hypercubic/divine/__init__.py +86 -0
- psyke/extraction/hypercubic/ginger/__init__.py +100 -0
- psyke/extraction/hypercubic/gridex/__init__.py +45 -84
- psyke/extraction/hypercubic/gridrex/__init__.py +4 -4
- psyke/extraction/hypercubic/hex/__init__.py +104 -0
- psyke/extraction/hypercubic/hypercube.py +275 -72
- psyke/extraction/hypercubic/iter/__init__.py +45 -46
- psyke/extraction/hypercubic/strategy.py +13 -9
- psyke/extraction/real/__init__.py +24 -29
- psyke/extraction/real/utils.py +2 -2
- psyke/extraction/trepan/__init__.py +24 -19
- psyke/genetic/__init__.py +0 -0
- psyke/genetic/fgin/__init__.py +74 -0
- psyke/genetic/gin/__init__.py +144 -0
- psyke/hypercubepredictor.py +102 -0
- psyke/schema/__init__.py +230 -36
- psyke/tuning/__init__.py +40 -28
- psyke/tuning/crash/__init__.py +33 -64
- psyke/tuning/orchid/__init__.py +21 -23
- psyke/tuning/pedro/__init__.py +70 -56
- psyke/utils/logic.py +8 -8
- psyke/utils/plot.py +79 -3
- {psyke-0.4.9.dev6.dist-info → psyke-1.0.4.dev10.dist-info}/METADATA +42 -22
- psyke-1.0.4.dev10.dist-info/RECORD +46 -0
- {psyke-0.4.9.dev6.dist-info → psyke-1.0.4.dev10.dist-info}/WHEEL +1 -1
- {psyke-0.4.9.dev6.dist-info → psyke-1.0.4.dev10.dist-info/licenses}/LICENSE +2 -1
- psyke/extraction/cart/predictor.py +0 -73
- psyke-0.4.9.dev6.dist-info/RECORD +0 -36
- {psyke-0.4.9.dev6.dist-info → psyke-1.0.4.dev10.dist-info}/top_level.txt +0 -0
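
The most visible API changes are in psyke/__init__.py (hunks below): new Extractor factory methods (DiViNE, COSMiK, HEx, GInGER), a brute_predict() alternative to predict(), and a plot_fairness() helper driven by the extracted theory. A usage sketch follows; the dataset, the choice of black box and the group masks are illustrative only, while the factory signatures mirror the ones shown in the diff.

    import pandas as pd
    from sklearn.datasets import load_iris
    from sklearn.neighbors import KNeighborsClassifier

    from psyke import Extractor

    # Illustrative data: features first, target in the last column.
    data = load_iris(as_frame=True).frame
    predictor = KNeighborsClassifier().fit(data.iloc[:, :-1], data.iloc[:, -1])

    # DiViNE is one of the factory methods added in this release.
    divine = Extractor.divine(predictor, k=5, patience=15, close_to_center=True)
    theory = divine.extract(data)  # extract() now also stores the result in divine.theory

    # brute_predict() is the new alternative to predict(); 'corner' and n=2 are its defaults,
    # and it is assumed here to be supported by hypercubic extractors such as DiViNE.
    brute_labels = divine.brute_predict(data.iloc[:, :-1], criterion='corner', n=2)

    # plot_fairness() shows how each extracted rule impacts the given groups;
    # the boolean masks below are made up for the sake of the example.
    groups = {'short sepal': data.iloc[:, 0] < 5.8, 'long sepal': data.iloc[:, 0] >= 5.8}
    divine.plot_fairness(data.iloc[:, :-1], groups)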
psyke/__init__.py
CHANGED
@@ -5,17 +5,20 @@ from enum import Enum

 import numpy as np
 import pandas as pd
-from
+from matplotlib import pyplot as plt
 from sklearn.linear_model import LinearRegression
 from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, f1_score, accuracy_score, \
     adjusted_rand_score, adjusted_mutual_info_score, v_measure_score, fowlkes_mallows_score
+from tuprolog.solve.prolog import prolog_solver

 from psyke.schema import DiscreteFeature
 from psyke.utils import get_default_random_seed, Target, get_int_precision
-from tuprolog.theory import Theory
+from tuprolog.theory import Theory, mutable_theory
 from typing import Iterable
 import logging

+from psyke.utils.logic import get_in_rule, data_to_struct, get_not_in_rule
+
 logging.basicConfig(level=logging.WARN)
 logger = logging.getLogger('psyke')

@@ -45,28 +48,36 @@ class EvaluableModel(object):
         V = 3,
         FMI = 4

-    def __init__(self, normalization=None):
+    def __init__(self, discretization=None, normalization=None):
+        self.discretization = [] if discretization is None else list(discretization)
         self.normalization = normalization

-    def predict(self, dataframe: pd.DataFrame
+    def predict(self, dataframe: pd.DataFrame) -> Iterable:
         """
         Predicts the output values of every sample in dataset.

-        :param dataframe:
-        :param mapping: for one-hot encoding.
+        :param dataframe: the set of instances to predict.
         :return: a list of predictions.
         """
-
-        if mapping is not None:
-            inverse_mapping = {v: k for k, v in mapping.items()}
-            ys = [inverse_mapping[y] for y in ys]
-        return ys
+        return self.__convert(self._predict(dataframe))

     def _predict(self, dataframe: pd.DataFrame) -> Iterable:
         raise NotImplementedError('predict')

+    def __convert(self, ys: Iterable) -> Iterable:
+        if self.normalization is not None and len(ys) > 0 and not isinstance([p for p in ys if p is not None][0], str):
+            m, s = self.normalization[list(self.normalization.keys())[-1]]
+            ys = [prediction if prediction is None else prediction * s + m for prediction in ys]
+        return ys
+
+    def brute_predict(self, dataframe: pd.DataFrame, criterion: str = 'corner', n: int = 2) -> Iterable:
+        return self.__convert(self._brute_predict(dataframe, criterion, n))
+
+    def _brute_predict(self, dataframe: pd.DataFrame, criterion: str = 'corner', n: int = 2) -> Iterable:
+        raise NotImplementedError('brute_predict')
+
     def unscale(self, values, name):
-        if self.normalization is None or isinstance(values, LinearRegression):
+        if self.normalization is None or name not in self.normalization or isinstance(values, LinearRegression):
             return values
         if isinstance(values, Iterable):
             values = [None if value is None else
@@ -76,9 +87,13 @@ class EvaluableModel(object):
         return values

     def score(self, dataframe: pd.DataFrame, predictor=None, fidelity: bool = False, completeness: bool = True,
+              brute: bool = False, criterion: str = 'corners', n: int = 2,
              task: EvaluableModel.Task = Task.CLASSIFICATION,
-              scoring_function: Iterable[EvaluableModel.Score] =
-        extracted = np.array(
+              scoring_function: Iterable[EvaluableModel.Score] = (ClassificationScore.ACCURACY, )):
+        extracted = np.array(
+            self.predict(dataframe.iloc[:, :-1]) if not brute else
+            self.brute_predict(dataframe.iloc[:, :-1], criterion, n)
+        )
         idx = [prediction is not None for prediction in extracted]
         y_extracted = extracted[idx]
         true = [dataframe.iloc[idx, -1]]
@@ -134,110 +149,263 @@ class Extractor(EvaluableModel, ABC):
     ----------
     predictor : the underling black box predictor.
     discretization : A collection of sets of discretised features.
-
+        Each set corresponds to a set of features derived from a single non-discrete feature.
     """

     def __init__(self, predictor, discretization: Iterable[DiscreteFeature] = None, normalization=None):
-        super().__init__(normalization)
+        super().__init__(discretization, normalization)
         self.predictor = predictor
-        self.
+        self.theory = None

-    def extract(self, dataframe: pd.DataFrame
+    def extract(self, dataframe: pd.DataFrame) -> Theory:
         """
         Extracts rules from the underlying predictor.

-        :param dataframe:
-        :param mapping: for one-hot encoding.
-        :param sort: alphabetically sort the variables of the head of the rules.
+        :param dataframe: the set of instances to be used for the extraction.
         :return: the theory created from the extracted rules.
         """
         raise NotImplementedError('extract')

-    def
+    def predict_why(self, data: dict[str, float], verbose: bool = True):
+        """
+        Provides a prediction and the corresponding explanation.
+        :param data: the instance to predict.
+        :param verbose: if True the explanation is printed.
+        """
+        raise NotImplementedError('predict_why')
+
+    def predict_counter(self, data: dict[str, float], verbose: bool = True, only_first: bool = True):
+        """
+        Provides a prediction and counterfactual explanations.
+        :param data: the instance to predict.
+        :param verbose: if True the counterfactual explanation is printed.
+        :param only_first: if True only the closest counterfactual explanation is provided for each distinct class.
+        """
+        raise NotImplementedError('predict_counter')
+
+    def plot_fairness(self, dataframe: pd.DataFrame, groups: dict[str, list], colormap='seismic_r', filename=None,
+                      figsize=(5, 4)):
+        """
+        Provides a visual estimation of the fairness exhibited by an extractor with respect to the specified groups.
+        :param dataframe: the set of instances to be used for the estimation.
+        :param groups: the set of relevant groups to consider.
+        :param colormap: the colormap to use for the plot.
+        :param filename: if not None, name used to save the plot.
+        :param figsize: size of the plot.
+        """
+        counts = {group: len(dataframe[idx_g]) for group, idx_g in groups.items()}
+        output = {'labels': []}
+        for group in groups:
+            output[group] = []
+        for i, clause in enumerate(self.theory.clauses):
+            if len(dataframe) == 0:
+                break
+            solver = prolog_solver(static_kb=mutable_theory(clause).assertZ(get_in_rule()).assertZ(get_not_in_rule()))
+            idx = np.array([query.is_yes for query in
+                            [solver.solveOnce(data_to_struct(data)) for _, data in dataframe.iterrows()]])
+            # print(f'Rule {i + 1}. Outcome {clause.head.args[-1]}. Affecting', end='')
+            output['labels'].append(str(clause.head.args[-1]))
+            for group, idx_g in groups.items():
+                # print(f' {len(dataframe[idx & idx_g]) / counts[group]:.2f}%{group}', end='')
+                output[group].append(len(dataframe[idx & idx_g]) / counts[group])
+            dataframe = dataframe[~idx]
+            groups = {group: indices[~idx] for group, indices in groups.items()}
+            # print(f'. Left {len(dataframe)} instances')
+
+        binary = len(set(output['labels'])) == 2
+        labels = sorted(set(output['labels']))
+        data = np.vstack([output[group] for group in groups]).T * 100
+        if binary:
+            data[np.array(output['labels']) == labels[0]] *= -1
+
+        plt.figure(figsize=figsize)
+        plt.imshow(data, cmap=colormap, vmin=-100 if binary else 0, vmax=100)
+
+        plt.gca().set_xticks(range(len(groups)), labels=groups.keys())
+        plt.gca().set_yticks(range(len(output['labels'])),
+                             labels=[f'Rule {i + 1}\n{l}' for i, l in enumerate(output['labels'])])
+
+        plt.xlabel('Groups')
+        plt.ylabel('Rules')
+        plt.title("Rule set impact on groups")
+
+        for i in range(len(output['labels'])):
+            for j in range(len(groups)):
+                plt.gca().text(j, i, f'{abs(data[i, j]):.2f}%', ha="center", va="center", color="k")
+
+        plt.gca().set_xticks([i + .5 for i in range(len(groups))], minor=True)
+        plt.gca().set_yticks([i + .5 for i in range(len(output['labels']))], minor=True)
+        plt.gca().grid(which='minor', color='k', linestyle='-', linewidth=.8)
+        plt.gca().tick_params(which='minor', bottom=False, left=False)
+        cbarticks = np.linspace(-100 if binary else 0, 100, 9 if binary else 11, dtype=int)
+        cbar = plt.colorbar(fraction=0.046, label='Affected samples (%)', ticks=cbarticks)
+        if binary:
+            ticklabels = [str(-i) if i < 0 else str(i) for i in cbarticks]
+            ticklabels[0] += f' {labels[0]}'
+            ticklabels[-1] += f' {labels[-1]}'
+            cbar.ax.set_yticklabels(ticklabels)
+
+        plt.tight_layout()
+        if filename is not None:
+            plt.savefig(filename, dpi=500)
+        plt.show()
+
+    def make_fair(self, features: Iterable[str]):
+        raise NotImplementedError(f'Fairness for {type(self).__name__} is not supported at the moment')
+
+    def mae(self, dataframe: pd.DataFrame, predictor=None, brute: bool = False, criterion: str = 'center',
+            n: int = 3) -> float:
         """
         Calculates the predictions' MAE w.r.t. the instances given as input.

-        :param dataframe:
+        :param dataframe: the set of instances to be used to calculate the mean absolute error.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
+        :param brute: if True, a brute prediction is executed.
+        :param criterion: criterion for brute prediction.
+        :param n: number of points for brute prediction with 'perimeter' criterion.
         :return: the mean absolute error (MAE) of the predictions.
         """
-        return self.score(dataframe, predictor, predictor is not None, False,
-                          [Extractor.RegressionScore.MAE])[Extractor.RegressionScore.MAE][-1]
+        return self.score(dataframe, predictor, predictor is not None, False, brute, criterion, n,
+                          Extractor.Task.REGRESSION, [Extractor.RegressionScore.MAE])[Extractor.RegressionScore.MAE][-1]

-    def mse(self, dataframe: pd.DataFrame, predictor=None
+    def mse(self, dataframe: pd.DataFrame, predictor=None, brute: bool = False, criterion: str = 'center',
+            n: int = 3) -> float:
         """
         Calculates the predictions' MSE w.r.t. the instances given as input.

-        :param dataframe:
+        :param dataframe: the set of instances to be used to calculate the mean squared error.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
+        :param brute: if True, a brute prediction is executed.
+        :param criterion: criterion for brute prediction.
+        :param n: number of points for brute prediction with 'perimeter' criterion.
         :return: the mean squared error (MSE) of the predictions.
         """
-        return self.score(dataframe, predictor, predictor is not None, False,
-                          [Extractor.RegressionScore.MSE])[Extractor.RegressionScore.MSE][-1]
+        return self.score(dataframe, predictor, predictor is not None, False, brute, criterion, n,
+                          Extractor.Task.REGRESSION, [Extractor.RegressionScore.MSE])[Extractor.RegressionScore.MSE][-1]

-    def r2(self, dataframe: pd.DataFrame, predictor=None
+    def r2(self, dataframe: pd.DataFrame, predictor=None, brute: bool = False, criterion: str = 'center',
+           n: int = 3) -> float:
         """
         Calculates the predictions' R2 score w.r.t. the instances given as input.

-        :param dataframe:
+        :param dataframe: the set of instances to be used to calculate the R2 score.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
+        :param brute: if True, a brute prediction is executed.
+        :param criterion: criterion for brute prediction.
+        :param n: number of points for brute prediction with 'perimeter' criterion.
         :return: the R2 score of the predictions.
         """
-        return self.score(dataframe, predictor, predictor is not None, False,
+        return self.score(dataframe, predictor, predictor is not None, False, brute, criterion, n,
                           Extractor.Task.REGRESSION, [Extractor.RegressionScore.R2])[Extractor.RegressionScore.R2][-1]

-    def accuracy(self, dataframe: pd.DataFrame, predictor=None
+    def accuracy(self, dataframe: pd.DataFrame, predictor=None, brute: bool = False, criterion: str = 'center',
+                 n: int = 3) -> float:
         """
         Calculates the predictions' accuracy classification score w.r.t. the instances given as input.

-        :param dataframe:
+        :param dataframe: the set of instances to be used to calculate the accuracy classification score.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
+        :param brute: if True, a brute prediction is executed.
+        :param criterion: criterion for brute prediction.
+        :param n: number of points for brute prediction with 'perimeter' criterion.
         :return: the accuracy classification score of the predictions.
         """
-        return self.score(dataframe, predictor, predictor is not None, False,
+        return self.score(dataframe, predictor, predictor is not None, False, brute, criterion, n,
+                          Extractor.Task.CLASSIFICATION,
                           [Extractor.ClassificationScore.ACCURACY])[Extractor.ClassificationScore.ACCURACY][-1]

-    def f1(self, dataframe: pd.DataFrame, predictor=None
+    def f1(self, dataframe: pd.DataFrame, predictor=None, brute: bool = False, criterion: str = 'center',
+           n: int = 3) -> float:
         """
         Calculates the predictions' F1 score w.r.t. the instances given as input.

-        :param dataframe:
+        :param dataframe: the set of instances to be used to calculate the F1 score.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
+        :param brute: if True, a brute prediction is executed.
+        :param criterion: criterion for brute prediction.
+        :param n: number of points for brute prediction with 'perimeter' criterion.
         :return: the F1 score of the predictions.
         """
-        return self.score(dataframe, predictor, predictor is not None, False,
+        return self.score(dataframe, predictor, predictor is not None, False, brute, criterion, n,
+                          Extractor.Task.CLASSIFICATION,
                           [Extractor.ClassificationScore.F1])[Extractor.ClassificationScore.F1][-1]

     @staticmethod
-    def cart(predictor, max_depth: int = 3, max_leaves: int = 3,
+    def cart(predictor, max_depth: int = 3, max_leaves: int = 3, max_features=None,
             discretization: Iterable[DiscreteFeature] = None, normalization=None, simplify: bool = True) -> Extractor:
         """
         Creates a new Cart extractor.
         """
         from psyke.extraction.cart import Cart
-        return Cart(predictor, max_depth, max_leaves,
-                    simplify=simplify)
+        return Cart(predictor, max_depth, max_leaves, max_features,
+                    discretization=discretization, normalization=normalization, simplify=simplify)
+
+    @staticmethod
+    def divine(predictor, k: int = 5, patience: int = 15, close_to_center: bool = True,
+               discretization: Iterable[DiscreteFeature] = None, normalization=None,
+               seed: int = get_default_random_seed()) -> Extractor:
+        """
+        Creates a new DiViNE extractor.
+        """
+        from psyke.extraction.hypercubic.divine import DiViNE
+        return DiViNE(predictor, k=k, patience=patience, close_to_center=close_to_center,
+                      discretization=discretization, normalization=normalization, seed=seed)
+
+    @staticmethod
+    def cosmik(predictor, max_components: int = 4, k: int = 5, patience: int = 15, close_to_center: bool = True,
+               output: Target = Target.CONSTANT, discretization: Iterable[DiscreteFeature] = None, normalization=None,
+               seed: int = get_default_random_seed()) -> Extractor:
+        """
+        Creates a new COSMiK extractor.
+        """
+        from psyke.extraction.hypercubic.cosmik import COSMiK
+        return COSMiK(predictor, max_components=max_components, k=k, patience=patience, close_to_center=close_to_center,
+                      output=output, discretization=discretization, normalization=normalization, seed=seed)

     @staticmethod
     def iter(predictor, min_update: float = 0.1, n_points: int = 1, max_iterations: int = 600, min_examples: int = 250,
-             threshold: float = 0.1, fill_gaps: bool = True,
-
+             threshold: float = 0.1, fill_gaps: bool = True, ignore_dimensions=None,
+             normalization: dict[str, tuple[float, float]] = None, output=None,
+             seed: int = get_default_random_seed()) -> Extractor:
         """
         Creates a new ITER extractor.
         """
         from psyke.extraction.hypercubic.iter import ITER
         return ITER(predictor, min_update, n_points, max_iterations, min_examples, threshold, fill_gaps,
-                    normalization, output, seed)
+                    ignore_dimensions, normalization, output, seed)

     @staticmethod
-    def gridex(predictor, grid, min_examples: int = 250, threshold: float = 0.1,
-               normalization: dict[str, tuple[float, float]] = None,
+    def gridex(predictor, grid, min_examples: int = 250, threshold: float = 0.1, output: Target = Target.CONSTANT,
+               discretization=None, normalization: dict[str, tuple[float, float]] = None,
               seed: int = get_default_random_seed()) -> Extractor:
         """
         Creates a new GridEx extractor.
         """
         from psyke.extraction.hypercubic.gridex import GridEx
-        return GridEx(predictor, grid, min_examples, threshold, normalization, seed)
+        return GridEx(predictor, grid, min_examples, threshold, output, discretization, normalization, seed)
+
+    @staticmethod
+    def hex(predictor, grid, min_examples: int = 250, threshold: float = 0.1, output: Target = Target.CONSTANT,
+            discretization=None, normalization: dict[str, tuple[float, float]] = None,
+            seed: int = get_default_random_seed()) -> Extractor:
+        """
+        Creates a new HEx extractor.
+        """
+        from psyke.extraction.hypercubic.hex import HEx
+        return HEx(predictor, grid, min_examples, threshold, output, discretization, normalization, seed)
+
+    @staticmethod
+    def ginger(predictor, features: Iterable[str], sigmas: Iterable[float], max_slices: int, min_rules: int = 1,
+               max_poly: int = 1, alpha: float = 0.5, indpb: float = 0.5, tournsize: int = 3, metric: str = 'R2',
+               n_gen: int = 50, n_pop: int = 50, threshold=None, valid=None, output=Target.REGRESSION,
+               normalization: dict[str, tuple[float, float]] = None,
+               seed: int = get_default_random_seed()) -> Extractor:
+        """
+        Creates a new GInGER extractor.
+        """
+        from psyke.extraction.hypercubic.ginger import GInGER
+        return GInGER(predictor, features, sigmas, max_slices, min_rules, max_poly, alpha, indpb, tournsize, metric,
+                      n_gen, n_pop, threshold, valid, output, normalization, seed)

     @staticmethod
     def gridrex(predictor, grid, min_examples: int = 250, threshold: float = 0.1,
@@ -250,15 +418,16 @@ class Extractor(EvaluableModel, ABC):
         return GridREx(predictor, grid, min_examples, threshold, normalization, seed)

     @staticmethod
-    def creepy(predictor, clustering, depth: int, error_threshold: float, output
-               ranks: [(str, float)] =
-               normalization: dict[str, tuple[float, float]] = None
+    def creepy(predictor, clustering, depth: int, error_threshold: float, output: Target = Target.CONSTANT,
+               gauss_components: int = 2, ranks: Iterable[(str, float)] = tuple(), ignore_threshold: float = 0.0,
+               discretization=None, normalization: dict[str, tuple[float, float]] = None,
+               seed: int = get_default_random_seed()) -> Extractor:
         """
         Creates a new CReEPy extractor.
         """
         from psyke.extraction.hypercubic.creepy import CReEPy
-        return CReEPy(predictor, depth, error_threshold, output, gauss_components, ranks, ignore_threshold,
-                      normalization,
+        return CReEPy(predictor, clustering, depth, error_threshold, output, gauss_components, ranks, ignore_threshold,
+                      discretization, normalization, seed)

     @staticmethod
     def real(predictor, discretization=None) -> Extractor:
@@ -281,52 +450,29 @@ class Extractor(EvaluableModel, ABC):


 class Clustering(EvaluableModel, ABC):
-    def __init__(self, normalization=None):
-        super().__init__(normalization)
+    def __init__(self, discretization=None, normalization=None):
+        super().__init__(discretization, normalization)

     def fit(self, dataframe: pd.DataFrame):
-        raise NotImplementedError('
+        raise NotImplementedError('fit')

     def explain(self):
-        raise NotImplementedError('
+        raise NotImplementedError('explain')

     @staticmethod
-    def exact(depth: int = 2, error_threshold: float = 0.1, output: Target = Target.CONSTANT,
-
+    def exact(depth: int = 2, error_threshold: float = 0.1, output: Target = Target.CONSTANT, gauss_components: int = 2,
+              discretization=None, normalization=None, seed: int = get_default_random_seed()) -> Clustering:
         """
         Creates a new ExACT instance.
         """
         from psyke.clustering.exact import ExACT
-        return ExACT(depth, error_threshold, output, gauss_components)
+        return ExACT(depth, error_threshold, output, gauss_components, discretization, normalization, seed)

     @staticmethod
-    def cream(depth: int, error_threshold: float, output, gauss_components: int = 2
+    def cream(depth: int = 2, error_threshold: float = 0.1, output: Target = Target.CONSTANT, gauss_components: int = 2,
+              discretization=None, normalization=None, seed: int = get_default_random_seed()) -> Clustering:
         """
         Creates a new CREAM instance.
         """
         from psyke.clustering.cream import CREAM
-        return CREAM(depth, error_threshold, output, gauss_components)
-
-
-class PedagogicalExtractor(Extractor, ABC):
-
-    def __init__(self, predictor, discretization=None, normalization=None):
-        Extractor.__init__(self, predictor=predictor, discretization=discretization, normalization=normalization)
-
-    def extract(self, dataframe: pd.DataFrame, mapping: dict[str: int] = None, sort: bool = True) -> Theory:
-        new_y = self.predictor.predict(dataframe.iloc[:, :-1])
-        if mapping is not None:
-            if hasattr(new_y[0], 'shape'):
-                # One-hot encoding for multi-class tasks
-                if len(new_y[0].shape) > 0 and new_y[0].shape[0] > 1:
-                    new_y = [argmax(y, axis=0) for y in new_y]
-                # One-hot encoding for binary class tasks
-                else:
-                    new_y = [round(y[0]) for y in new_y]
-        new_y = pd.DataFrame(new_y).set_index(dataframe.index)
-        data = dataframe.iloc[:, :-1].copy().join(new_y)
-        data.columns = dataframe.columns
-        return self._extract(data, mapping, sort)
-
-    def _extract(self, dataframe: pd.DataFrame, mapping: dict[str: int] = None, sort: bool = True) -> Theory:
-        raise NotImplementedError('extract')
+        return CREAM(depth, error_threshold, output, gauss_components, discretization, normalization, seed)
psyke/clustering/__init__.py
CHANGED
@@ -2,13 +2,18 @@ from abc import ABC
 from typing import Iterable

 from psyke import Clustering, Target
-from psyke.extraction.hypercubic import HyperCube
+from psyke.extraction.hypercubic import HyperCube
+from psyke.hypercubepredictor import HyperCubePredictor


 class HyperCubeClustering(HyperCubePredictor, Clustering, ABC):

-    def __init__(self, output: Target = Target.CONSTANT, normalization=None):
-        HyperCubePredictor.__init__(self, output=output, normalization=normalization)
+    def __init__(self, output: Target = Target.CONSTANT, discretization=None, normalization=None):
+        HyperCubePredictor.__init__(self, output=output, discretization=discretization, normalization=normalization)
+        self._protected_features = []

     def get_hypercubes(self) -> Iterable[HyperCube]:
-        raise NotImplementedError('
+        raise NotImplementedError('get_hypercubes')
+
+    def make_fair(self, features: Iterable[str]):
+        self._protected_features = features
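
HyperCubeClustering now exposes make_fair(), which records the protected features; ExACT and CREAM (below) drop those columns before fitting the Gaussian mixture. A sketch of the intended call order, on made-up data:

    import numpy as np
    import pandas as pd

    from psyke import Clustering
    from psyke.utils import Target

    # Made-up dataset: 'income' is the (last) target column, 'sex' the protected feature.
    rng = np.random.default_rng(0)
    df = pd.DataFrame({'age': rng.uniform(18, 90, 200),
                       'sex': rng.integers(0, 2, 200),
                       'income': rng.uniform(10, 100, 200)})

    clustering = Clustering.exact(depth=2, error_threshold=0.1, output=Target.CONSTANT)
    clustering.make_fair(['sex'])  # stored in _protected_features, excluded from the GMM step
    clustering.fit(df)
    cubes = clustering.get_hypercubes()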
psyke/clustering/cream/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Iterable
 import numpy as np
 import pandas as pd

-
+from psyke.utils import Target, get_default_random_seed
 from psyke.clustering.exact import ExACT
 from psyke.extraction.hypercubic import Node, HyperCube, ClosedCube
 from psyke.clustering.utils import select_gaussian_mixture
@@ -16,9 +16,9 @@ class CREAM(ExACT):
     Explanator implementing CREAM algorithm.
     """

-    def __init__(self, depth: int, error_threshold: float,
-
-        super().__init__(depth, error_threshold, output, gauss_components)
+    def __init__(self, depth: int, error_threshold: float, output: Target = Target.CONSTANT, gauss_components: int = 5,
+                 discretization=None, normalization=None, seed: int = get_default_random_seed()):
+        super().__init__(depth, error_threshold, output, gauss_components, discretization, normalization, seed)

     def __eligible_cubes(self, gauss_pred: np.ndarray, node: Node, clusters: int):
         cubes = []
@@ -46,11 +46,7 @@ class CREAM(ExACT):
     def _iterate(self, surrounding: Node) -> Iterable[HyperCube]:
         to_split = [(self.error_threshold * 10, 1, 1, surrounding)]
         while len(to_split) > 0:
-
-            (_, depth, _, node) = to_split.pop()
-            data = ExACT._remove_string_label(node.dataframe)
-            gauss_params = select_gaussian_mixture(data, self.gauss_components)
-            gauss_pred = gauss_params[2].predict(data)
+            node, depth, gauss_pred, gauss_params = self._get_gauss_predictions(to_split)
             cubes = self.__eligible_cubes(gauss_pred, node, gauss_params[1])
             if len(cubes) < 1:
                 continue
@@ -65,4 +61,4 @@ class CREAM(ExACT):
                (error, depth + 1, np.random.uniform(), n) for (n, error) in
                zip(node.children, [right[0].diversity, left[0].diversity]) if error > self.error_threshold
             ]
-        return self._node_to_cubes(surrounding)
+        return self._node_to_cubes(surrounding)
psyke/clustering/exact/__init__.py
CHANGED
@@ -13,7 +13,7 @@ from psyke.clustering import HyperCubeClustering
 from psyke.extraction.hypercubic import Node, ClosedCube, HyperCube
 from psyke.clustering.utils import select_gaussian_mixture, select_dbscan_epsilon
 from psyke.extraction.hypercubic.hypercube import ClosedRegressionCube, ClosedClassificationCube
-from psyke.utils import Target
+from psyke.utils import Target, get_default_random_seed


 class ExACT(HyperCubeClustering, ABC):
@@ -22,13 +22,15 @@ class ExACT(HyperCubeClustering, ABC):
     """

     def __init__(self, depth: int = 2, error_threshold: float = 0.1, output: Target = Target.CONSTANT,
-                 gauss_components: int = 2, normalization=None
-
+                 gauss_components: int = 2, discretization=None, normalization=None,
+                 seed: int = get_default_random_seed()):
+        super().__init__(output, discretization, normalization)
         self.depth = depth
         self.error_threshold = error_threshold
         self.gauss_components = gauss_components
         self._predictor = KNeighborsClassifier() if output == Target.CLASSIFICATION else KNeighborsRegressor()
         self._predictor.n_neighbors = 1
+        self.seed = seed

     def __eligible_cubes(self, gauss_pred: np.ndarray, node: Node, clusters: int):
         cubes = []
@@ -52,13 +54,14 @@ class ExACT(HyperCubeClustering, ABC):
         dbscan_pred = DBSCAN(eps=select_dbscan_epsilon(data, clusters)).fit_predict(data.iloc[:, :-1])
         return HyperCube.create_surrounding_cube(
             dataframe.iloc[np.where(dbscan_pred == Counter(dbscan_pred).most_common(1)[0][0])],
-            True, self._output
+            True, self._output, self._protected_features
         )

     def fit(self, dataframe: pd.DataFrame):
+        np.random.seed(self.seed)
         self._predictor.fit(dataframe.iloc[:, :-1], dataframe.iloc[:, -1])
-        self.
-
+        self._surrounding = HyperCube.create_surrounding_cube(dataframe, True, self._output, self._protected_features)
+        self._hypercubes = self._iterate(Node(dataframe, self._surrounding))

     def get_hypercubes(self) -> Iterable[HyperCube]:
         return list(self._hypercubes)
@@ -76,14 +79,17 @@ class ExACT(HyperCubeClustering, ABC):
             enumerate(dataframe.iloc[:, -1].unique())
         ).items()}}) if isinstance(dataframe.iloc[0, -1], str) else dataframe

+    def _get_gauss_predictions(self, to_split):
+        to_split.sort(reverse=True)
+        (_, depth, _, node) = to_split.pop()
+        data = ExACT._remove_string_label(node.dataframe)
+        gauss_params = select_gaussian_mixture(data.drop(self._protected_features, axis=1), self.gauss_components)
+        return node, depth, gauss_params[2].predict(data.drop(self._protected_features, axis=1)), gauss_params
+
     def _iterate(self, surrounding: Node) -> Iterable[HyperCube]:
         to_split = [(self.error_threshold * 10, 1, 1, surrounding)]
         while len(to_split) > 0:
-
-            (_, depth, _, node) = to_split.pop()
-            data = ExACT._remove_string_label(node.dataframe)
-            gauss_params = select_gaussian_mixture(data, self.gauss_components)
-            gauss_pred = gauss_params[2].predict(data)
+            node, depth, gauss_pred, gauss_params = self._get_gauss_predictions(to_split)
             cubes, indices = self.__eligible_cubes(gauss_pred, node, gauss_params[1])
             cubes = [(c.volume(), len(idx), i, idx, c) for i, (c, idx) in enumerate(zip(cubes, indices))
                      if (idx is not None) and (not node.cube.equal(c))]
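
ExACT now takes a seed and calls np.random.seed(seed) at the start of fit(), and its Gaussian-mixture step ignores the features registered via make_fair(). A small reproducibility sketch on synthetic data; whether the extracted cubes coincide exactly also depends on the downstream clustering steps, so treat the final check as a sanity check rather than a guarantee:

    import numpy as np
    import pandas as pd

    from psyke import Clustering

    rng = np.random.default_rng(0)
    df = pd.DataFrame({'x': rng.uniform(0, 1, 300),
                       'y': rng.uniform(0, 1, 300),
                       'z': rng.uniform(0, 1, 300)})  # 'z' acts as the regression target

    # Two instances built with the same seed start fit() from the same global NumPy state.
    a = Clustering.exact(depth=2, error_threshold=0.1, seed=123)
    b = Clustering.exact(depth=2, error_threshold=0.1, seed=123)
    a.fit(df)
    b.fit(df)
    print(len(a.get_hypercubes()), len(b.get_hypercubes()))  # expected to match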
psyke/clustering/utils.py
CHANGED
@@ -11,7 +11,6 @@ def select_gaussian_mixture(data: pd.DataFrame, max_components) -> tuple[float,
     try:
         models = [GaussianMixture(n_components=n).fit(data) for n in components if n <= len(data)]
     except ValueError:
-        print(data)
         print(len(data))
     return min([(m.bic(data) / (i + 2), (i + 2), m) for i, m in enumerate(models)])

psyke/extraction/__init__.py
CHANGED
@@ -0,0 +1,25 @@
+from abc import ABC
+
+import pandas as pd
+from tuprolog.theory import Theory
+
+from psyke import Extractor
+
+
+class PedagogicalExtractor(Extractor, ABC):
+
+    def __init__(self, predictor, discretization=None, normalization=None):
+        Extractor.__init__(self, predictor=predictor, discretization=discretization, normalization=normalization)
+
+    def _substitute_output(self, dataframe: pd.DataFrame) -> pd.DataFrame:
+        new_y = pd.DataFrame(self.predictor.predict(dataframe.iloc[:, :-1])).set_index(dataframe.index)
+        data = dataframe.iloc[:, :-1].copy().join(new_y)
+        data.columns = dataframe.columns
+        return data
+
+    def extract(self, dataframe: pd.DataFrame) -> Theory:
+        self.theory = self._extract(self._substitute_output(dataframe))
+        return self.theory
+
+    def _extract(self, dataframe: pd.DataFrame) -> Theory:
+        raise NotImplementedError('extract')
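
The new psyke/extraction/__init__.py isolates the pedagogical pattern: extract() first replaces the target column with the black box's own predictions (_substitute_output) and then delegates to _extract(), which concrete extractors implement. The relabelling step can be reproduced in isolation with plain pandas and scikit-learn; the toy dataframe below is invented for illustration.

    import pandas as pd
    from sklearn.tree import DecisionTreeClassifier

    # Toy data: two features and a ground-truth label in the last column.
    df = pd.DataFrame({'x1': [0, 1, 2, 3], 'x2': [1, 0, 1, 0], 'y': ['a', 'a', 'b', 'b']})
    black_box = DecisionTreeClassifier().fit(df.iloc[:, :-1], df.iloc[:, -1])

    # Equivalent of PedagogicalExtractor._substitute_output(): keep the features,
    # replace the target with the predictor's outputs, preserve the column names.
    new_y = pd.DataFrame(black_box.predict(df.iloc[:, :-1])).set_index(df.index)
    relabelled = df.iloc[:, :-1].copy().join(new_y)
    relabelled.columns = df.columns

    # 'relabelled' is what a concrete extractor's _extract() receives: rules are learned
    # against the black box's behaviour, not against the original labels.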