psyke 0.8.14.dev6__tar.gz → 0.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of psyke has been flagged as possibly problematic by the registry.
- {psyke-0.8.14.dev6/psyke.egg-info → psyke-0.9.0}/PKG-INFO +1 -1
- psyke-0.9.0/VERSION +1 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/__init__.py +98 -23
- psyke-0.9.0/psyke/extraction/__init__.py +51 -0
- psyke-0.8.14.dev6/psyke/extraction/cart/predictor.py → psyke-0.9.0/psyke/extraction/cart/CartPredictor.py +49 -4
- psyke-0.9.0/psyke/extraction/cart/FairTree.py +196 -0
- psyke-0.9.0/psyke/extraction/cart/FairTreePredictor.py +62 -0
- psyke-0.9.0/psyke/extraction/cart/__init__.py +71 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/extraction/hypercubic/__init__.py +10 -3
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/extraction/hypercubic/creepy/__init__.py +1 -1
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/extraction/hypercubic/gridex/__init__.py +3 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/extraction/hypercubic/iter/__init__.py +5 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/extraction/hypercubic/strategy.py +13 -9
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/tuning/pedro/__init__.py +4 -2
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/utils/logic.py +1 -1
- {psyke-0.8.14.dev6 → psyke-0.9.0/psyke.egg-info}/PKG-INFO +1 -1
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke.egg-info/SOURCES.txt +3 -1
- psyke-0.8.14.dev6/VERSION +0 -1
- psyke-0.8.14.dev6/psyke/extraction/__init__.py +0 -21
- psyke-0.8.14.dev6/psyke/extraction/cart/__init__.py +0 -96
- {psyke-0.8.14.dev6 → psyke-0.9.0}/LICENSE +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/MANIFEST.in +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/README.md +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/clustering/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/clustering/cream/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/clustering/exact/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/clustering/utils.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/extraction/hypercubic/cosmik/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/extraction/hypercubic/divine/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/extraction/hypercubic/gridrex/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/extraction/hypercubic/hex/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/extraction/hypercubic/hypercube.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/extraction/hypercubic/utils.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/extraction/real/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/extraction/real/utils.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/extraction/trepan/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/extraction/trepan/utils.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/hypercubepredictor.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/schema/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/tuning/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/tuning/crash/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/tuning/orchid/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/utils/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/utils/dataframe.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/utils/metrics.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/utils/plot.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/utils/sorted.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke.egg-info/dependency_links.txt +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke.egg-info/not-zip-safe +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke.egg-info/requires.txt +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/psyke.egg-info/top_level.txt +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/pyproject.toml +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/setup.cfg +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/setup.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/psyke/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/psyke/clustering/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/psyke/extraction/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/psyke/extraction/cart/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/psyke/extraction/cart/test_cart.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/psyke/extraction/cart/test_simplified_cart.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/psyke/extraction/hypercubic/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/psyke/extraction/hypercubic/gridex/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/psyke/extraction/hypercubic/gridex/test_gridex.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/psyke/extraction/hypercubic/iter/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/psyke/extraction/hypercubic/iter/test_iter.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/psyke/extraction/hypercubic/test_hypercube.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/psyke/extraction/real/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/psyke/extraction/real/test_real.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/psyke/extraction/real/test_rule.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/psyke/extraction/trepan/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/psyke/extraction/trepan/test_node.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/psyke/extraction/trepan/test_split.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/psyke/extraction/trepan/test_trepan.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/psyke/utils/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/psyke/utils/test_prune.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/psyke/utils/test_simplify.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/psyke/utils/test_simplify_formatter.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/resources/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/resources/datasets/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/resources/predictors/__init__.py +0 -0
- {psyke-0.8.14.dev6 → psyke-0.9.0}/test/resources/tests/__init__.py +0 -0
psyke-0.9.0/VERSION
ADDED
@@ -0,0 +1 @@
+0.9.0

{psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/__init__.py
@@ -5,16 +5,20 @@ from enum import Enum
 
 import numpy as np
 import pandas as pd
+from matplotlib import pyplot as plt
 from sklearn.linear_model import LinearRegression
 from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, f1_score, accuracy_score, \
     adjusted_rand_score, adjusted_mutual_info_score, v_measure_score, fowlkes_mallows_score
+from tuprolog.solve.prolog import prolog_solver
 
 from psyke.schema import DiscreteFeature
 from psyke.utils import get_default_random_seed, Target, get_int_precision
-from tuprolog.theory import Theory
+from tuprolog.theory import Theory, mutable_theory
 from typing import Iterable
 import logging
 
+from psyke.utils.logic import get_in_rule, data_to_struct, get_not_in_rule
+
 logging.basicConfig(level=logging.WARN)
 logger = logging.getLogger('psyke')
 
@@ -52,7 +56,7 @@ class EvaluableModel(object):
         """
         Predicts the output values of every sample in dataset.
 
-        :param dataframe:
+        :param dataframe: the set of instances to predict.
         :return: a list of predictions.
         """
         return self.__convert(self._predict(dataframe))
@@ -85,7 +89,7 @@ class EvaluableModel(object):
     def score(self, dataframe: pd.DataFrame, predictor=None, fidelity: bool = False, completeness: bool = True,
               brute: bool = False, criterion: str = 'corners', n: int = 2,
               task: EvaluableModel.Task = Task.CLASSIFICATION,
-              scoring_function: Iterable[EvaluableModel.Score] =
+              scoring_function: Iterable[EvaluableModel.Score] = (ClassificationScore.ACCURACY, )):
         extracted = np.array(
             self.predict(dataframe.iloc[:, :-1]) if not brute else
             self.brute_predict(dataframe.iloc[:, :-1], criterion, n)
@@ -151,42 +155,113 @@ class Extractor(EvaluableModel, ABC):
     def __init__(self, predictor, discretization: Iterable[DiscreteFeature] = None, normalization=None):
         super().__init__(discretization, normalization)
         self.predictor = predictor
+        self.theory = None
 
     def extract(self, dataframe: pd.DataFrame) -> Theory:
         """
         Extracts rules from the underlying predictor.
 
-        :param dataframe:
+        :param dataframe: the set of instances to be used for the extraction.
         :return: the theory created from the extracted rules.
         """
         raise NotImplementedError('extract')
 
-    def predict_why(self, data: dict[str, float], verbose=True):
+    def predict_why(self, data: dict[str, float], verbose: bool = True):
         """
         Provides a prediction and the corresponding explanation.
 
-        :param data:
-        :param verbose: if the explanation
+        :param data: the instance to predict.
+        :param verbose: if True the explanation is printed.
         """
         raise NotImplementedError('predict_why')
 
-    def predict_counter(self, data: dict[str, float], verbose=True, only_first=True):
+    def predict_counter(self, data: dict[str, float], verbose: bool = True, only_first: bool = True):
         """
         Provides a prediction and counterfactual explanations.
 
-        :param data:
-        :param verbose: if the counterfactual explanation
-        :param only_first: if only the closest counterfactual explanation is provided for each distinct class.
+        :param data: the instance to predict.
+        :param verbose: if True the counterfactual explanation is printed.
+        :param only_first: if True only the closest counterfactual explanation is provided for each distinct class.
         """
         raise NotImplementedError('predict_counter')
 
+    def plot_fairness(self, dataframe: pd.DataFrame, groups: dict[str, list], colormap='seismic_r', filename=None,
+                      figsize=(5, 4)):
+        """
+        Provides a visual estimation of the fairness exhibited by an extractor with respect to the specified groups.
+
+        :param dataframe: the set of instances to be used for the estimation.
+        :param groups: the set of relevant groups to consider.
+        :param colormap: the colormap to use for the plot.
+        :param filename: if not None, name used to save the plot.
+        :param figsize: size of the plot.
+        """
+        counts = {group: len(dataframe[idx_g]) for group, idx_g in groups.items()}
+        output = {'labels': []}
+        for group in groups:
+            output[group] = []
+        for i, clause in enumerate(self.theory.clauses):
+            if len(dataframe) == 0:
+                break
+            solver = prolog_solver(static_kb=mutable_theory(clause).assertZ(get_in_rule()).assertZ(get_not_in_rule()))
+            idx = np.array([query.is_yes for query in
+                            [solver.solveOnce(data_to_struct(data)) for _, data in dataframe.iterrows()]])
+            # print(f'Rule {i + 1}. Outcome {clause.head.args[-1]}. Affecting', end='')
+            output['labels'].append(str(clause.head.args[-1]))
+            for group, idx_g in groups.items():
+                # print(f' {len(dataframe[idx & idx_g]) / counts[group]:.2f}%{group}', end='')
+                output[group].append(len(dataframe[idx & idx_g]) / counts[group])
+            dataframe = dataframe[~idx]
+            groups = {group: indices[~idx] for group, indices in groups.items()}
+            # print(f'. Left {len(dataframe)} instances')
+
+        binary = len(set(output['labels'])) == 2
+        labels = sorted(set(output['labels']))
+        data = np.vstack([output[group] for group in groups]).T * 100
+        if binary:
+            data[np.array(output['labels']) == labels[0]] *= -1
+
+        plt.figure(figsize=figsize)
+        plt.imshow(data, cmap=colormap, vmin=-100 if binary else 0, vmax=100)
+
+        plt.gca().set_xticks(range(len(groups)), labels=groups.keys())
+        plt.gca().set_yticks(range(len(output['labels'])),
+                             labels=[f'Rule {i + 1}\n{l}' for i, l in enumerate(output['labels'])])
+
+        plt.xlabel('Groups')
+        plt.ylabel('Rules')
+        plt.title("Rule set impact on groups")
+
+        for i in range(len(output['labels'])):
+            for j in range(len(groups)):
+                plt.gca().text(j, i, f'{abs(int(data[i, j]))}%', ha="center", va="center", color="k")
+
+        plt.gca().set_xticks([i + .5 for i in range(len(groups))], minor=True)
+        plt.gca().set_yticks([i + .5 for i in range(len(output['labels']))], minor=True)
+        plt.gca().grid(which='minor', color='k', linestyle='-', linewidth=.8)
+        plt.gca().tick_params(which='minor', bottom=False, left=False)
+        cbarticks = np.linspace(-100 if binary else 0, 100, 9 if binary else 11, dtype=int)
+        cbar = plt.colorbar(fraction=0.046, label='Affected samples (%)', ticks=cbarticks)
+        if binary:
+            ticklabels = [str(-i) if i < 0 else str(i) for i in cbarticks]
+            ticklabels[0] += f' {labels[0]}'
+            ticklabels[-1] += f' {labels[-1]}'
+            cbar.ax.set_yticklabels(ticklabels)
+
+        plt.tight_layout()
+        if filename is not None:
+            plt.savefig(filename, dpi=500)
+        plt.show()
+
+    def make_fair(self, features: Iterable[str]):
+        raise NotImplementedError(f'Fairness for {type(self).__name__} is not supported at the moment')
+
     def mae(self, dataframe: pd.DataFrame, predictor=None, brute: bool = False, criterion: str = 'center',
             n: int = 3) -> float:
         """
         Calculates the predictions' MAE w.r.t. the instances given as input.
 
-        :param dataframe:
+        :param dataframe: the set of instances to be used to calculate the mean absolute error.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
         :param brute: if True, a brute prediction is executed.
-        :param criterion:
+        :param criterion: criterion for brute prediction.
         :param n: number of points for brute prediction with 'perimeter' criterion.
         :return: the mean absolute error (MAE) of the predictions.
         """
@@ -198,10 +273,10 @@ class Extractor(EvaluableModel, ABC):
         """
         Calculates the predictions' MSE w.r.t. the instances given as input.
 
-        :param dataframe:
+        :param dataframe: the set of instances to be used to calculate the mean squared error.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
         :param brute: if True, a brute prediction is executed.
-        :param criterion:
+        :param criterion: criterion for brute prediction.
         :param n: number of points for brute prediction with 'perimeter' criterion.
         :return: the mean squared error (MSE) of the predictions.
         """
@@ -213,10 +288,10 @@ class Extractor(EvaluableModel, ABC):
         """
         Calculates the predictions' R2 score w.r.t. the instances given as input.
 
-        :param dataframe:
+        :param dataframe: the set of instances to be used to calculate the R2 score.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
         :param brute: if True, a brute prediction is executed.
-        :param criterion:
+        :param criterion: criterion for brute prediction.
         :param n: number of points for brute prediction with 'perimeter' criterion.
         :return: the R2 score of the predictions.
         """
@@ -224,14 +299,14 @@ class Extractor(EvaluableModel, ABC):
                           Extractor.Task.REGRESSION, [Extractor.RegressionScore.R2])[Extractor.RegressionScore.R2][-1]
 
     def accuracy(self, dataframe: pd.DataFrame, predictor=None, brute: bool = False, criterion: str = 'center',
-
+                 n: int = 3) -> float:
         """
         Calculates the predictions' accuracy classification score w.r.t. the instances given as input.
 
-        :param dataframe:
+        :param dataframe: the set of instances to be used to calculate the accuracy classification score.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
         :param brute: if True, a brute prediction is executed.
-        :param criterion:
+        :param criterion: criterion for brute prediction.
         :param n: number of points for brute prediction with 'perimeter' criterion.
         :return: the accuracy classification score of the predictions.
         """
@@ -244,10 +319,10 @@ class Extractor(EvaluableModel, ABC):
         """
         Calculates the predictions' F1 score w.r.t. the instances given as input.
 
-        :param dataframe:
+        :param dataframe: the set of instances to be used to calculate the F1 score.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
         :param brute: if True, a brute prediction is executed.
-        :param criterion:
+        :param criterion: criterion for brute prediction.
         :param n: number of points for brute prediction with 'perimeter' criterion.
         :return: the F1 score of the predictions.
         """
@@ -331,7 +406,7 @@ class Extractor(EvaluableModel, ABC):
 
     @staticmethod
     def creepy(predictor, clustering, depth: int, error_threshold: float, output: Target = Target.CONSTANT,
-               gauss_components: int = 2, ranks: [(str, float)] =
+               gauss_components: int = 2, ranks: Iterable[(str, float)] = tuple(), ignore_threshold: float = 0.0,
                discretization=None, normalization: dict[str, tuple[float, float]] = None,
                seed: int = get_default_random_seed()) -> Extractor:
         """

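Taken together, these changes give every extractor a stored `theory`, a `make_fair` hook, and a `plot_fairness` diagnostic. A minimal usage sketch follows; the dataset, column names, and the choice of black-box predictor are illustrative, and `Extractor.cart` arguments follow the `Cart` signature introduced in this release:

```python
# Sketch of the new fairness-inspection API (hypothetical 'adult.csv' dataset
# and column names; any fitted scikit-learn classifier can act as predictor).
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from psyke import Extractor

df = pd.read_csv('adult.csv')                      # last column is the class
predictor = RandomForestClassifier().fit(df.iloc[:, :-1], df.iloc[:, -1])

extractor = Extractor.cart(predictor, max_depth=3, max_leaves=8)
theory = extractor.extract(df)                     # also sets extractor.theory

# plot_fairness expects boolean masks over df's rows, as implied by the
# `dataframe[idx_g]` indexing in the method above.
groups = {'male': df['sex'] == 1, 'female': df['sex'] == 0}
extractor.plot_fairness(df, groups, filename='fairness.png')
```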
psyke-0.9.0/psyke/extraction/__init__.py
ADDED
@@ -0,0 +1,51 @@
+from abc import ABC
+from collections import Iterable
+
+import pandas as pd
+from tuprolog.theory import Theory
+
+from psyke import Extractor
+
+
+class PedagogicalExtractor(Extractor, ABC):
+
+    def __init__(self, predictor, discretization=None, normalization=None):
+        Extractor.__init__(self, predictor=predictor, discretization=discretization, normalization=normalization)
+
+    def _substitute_output(self, dataframe: pd.DataFrame) -> pd.DataFrame:
+        new_y = pd.DataFrame(self.predictor.predict(dataframe.iloc[:, :-1])).set_index(dataframe.index)
+        data = dataframe.iloc[:, :-1].copy().join(new_y)
+        data.columns = dataframe.columns
+        return data
+
+    def extract(self, dataframe: pd.DataFrame) -> Theory:
+        self.theory = self._extract(self._substitute_output(dataframe))
+        return self.theory
+
+    def _extract(self, dataframe: pd.DataFrame) -> Theory:
+        raise NotImplementedError('extract')
+
+
+class FairExtractor(PedagogicalExtractor, ABC):
+
+    def __init__(self, extractor: Extractor, features: Iterable):
+        super().__init__(extractor.predictor, extractor.discretization, extractor.normalization)
+        self.features = features
+        self.extractor = extractor
+        # self.make_fair()
+
+    # def extract(self, dataframe: pd.DataFrame) -> Theory:
+    #     self.theory = self.extractor.extract(dataframe)
+    #     return self.theory
+
+    # def predict_why(self, data: dict[str, float], verbose: bool = True):
+    #     self.extractor.predict_why(data, verbose)
+
+    # def predict_counter(self, data: dict[str, float], verbose: bool = True, only_first: bool = True):
+    #     self.extractor.predict_counter(data, verbose, only_first)
+
+    # def _predict(self, dataframe: pd.DataFrame) -> Iterable:
+    #     return self.extractor.predict(dataframe)
+
+    # def _brute_predict(self, dataframe: pd.DataFrame, criterion: str = 'corner', n: int = 2) -> Iterable:
+    #     return self.extractor.brute_predict(dataframe, criterion, n)

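The new `_substitute_output` makes the pedagogical pattern explicit: before rule extraction, the target column is replaced with the black box's own predictions, so rules are learned against the model rather than the ground truth. The step in isolation, on toy data with a stand-in predictor:

```python
# The substitute-output step by itself (toy data; ConstantPredictor is a
# hypothetical stand-in for any trained model with a predict() method).
import pandas as pd

class ConstantPredictor:
    def predict(self, X):
        return ['yes'] * len(X)

df = pd.DataFrame({'x': [1.0, 2.0], 'y': ['no', 'no']})
new_y = pd.DataFrame(ConstantPredictor().predict(df.iloc[:, :-1])).set_index(df.index)
data = df.iloc[:, :-1].copy().join(new_y)
data.columns = df.columns
print(data)  # column 'y' now holds the predictor's outputs ('yes', 'yes')
```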
psyke-0.8.14.dev6/psyke/extraction/cart/predictor.py → psyke-0.9.0/psyke/extraction/cart/CartPredictor.py
@@ -1,11 +1,14 @@
 from collections import Iterable
 from typing import Union, Any
 import numpy as np
+import pandas as pd
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
-from
+from tuprolog.core import clause, Var, Struct
+from tuprolog.theory import Theory, mutable_theory
 
-
-
+from psyke.extraction.cart import LeafConstraints, LeafSequence
+from psyke.schema import LessThan, GreaterThan, SchemaException, DiscreteFeature
+from psyke.utils.logic import create_variable_list, create_head, create_term
 
 
 class CartPredictor:
@@ -14,8 +17,9 @@ class CartPredictor:
     """
 
     def __init__(self, predictor: Union[DecisionTreeClassifier, DecisionTreeRegressor] = DecisionTreeClassifier(),
-                 normalization=None):
+                 discretization=None, normalization=None):
         self._predictor = predictor
+        self.discretization = discretization
         self.normalization = normalization
 
     def __get_constraints(self, nodes: Iterable[(int, bool)]) -> LeafConstraints:
@@ -62,6 +66,47 @@ class CartPredictor:
     def predict(self, data) -> Iterable:
         return self._predictor.predict(data)
 
+    @staticmethod
+    def _simplify_nodes(nodes: list) -> Iterable:
+        simplified = [nodes.pop(0)]
+        while len(nodes) > 0:
+            first_node = nodes[0][0]
+            for k, conditions in first_node.items():
+                for condition in conditions:
+                    if all(k in node[0] and condition in node[0][k] for node in nodes):
+                        [node[0][k].remove(condition) for node in nodes]
+            simplified.append(nodes.pop(0))
+        return [({k: v for k, v in rule.items() if v != []}, prediction) for rule, prediction in simplified]
+
+    def _create_body(self, variables: dict[str, Var], conditions: LeafConstraints) -> Iterable[Struct]:
+        results = []
+        for feature_name, cond_list in conditions.items():
+            for condition in cond_list:
+                feature: DiscreteFeature = [d for d in self.discretization if feature_name in d.admissible_values][0] \
+                    if self.discretization else None
+                results.append(create_term(variables[feature_name], condition) if feature is None else
+                               create_term(variables[feature.name],
+                                           feature.admissible_values[feature_name],
+                                           isinstance(condition, GreaterThan)))
+        return results
+
+    def create_theory(self, data: pd.DataFrame, simplify: True) -> Theory:
+        new_theory = mutable_theory()
+        nodes = [node for node in self]
+        nodes = self._simplify_nodes(nodes) if simplify else nodes
+        for (constraints, prediction) in nodes:
+            if self.normalization is not None and data.columns[-1] in self.normalization:
+                m, s = self.normalization[data.columns[-1]]
+                prediction = prediction * s + m
+            variables = create_variable_list(self.discretization, data)
+            new_theory.assertZ(
+                clause(
+                    create_head(data.columns[-1], list(variables.values()), prediction),
+                    self._create_body(variables, constraints)
+                )
+            )
+        return new_theory
+
     @property
     def predictor(self) -> Union[DecisionTreeClassifier, DecisionTreeRegressor]:
         return self._predictor

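The `_simplify_nodes` logic moved here from `Cart` is an ordered-rule simplification: any condition shared by all not-yet-emitted rules is dropped from them, which is why the last rule of a decision list ends up condition-free. A self-contained rendering of the same algorithm on toy constraints (plain strings stand in for psyke's schema values; the inner loop iterates over a copy of the condition list to avoid mutating it mid-iteration):

```python
def simplify_nodes(nodes):
    # Same algorithm as CartPredictor._simplify_nodes, with strings as values.
    simplified = [nodes.pop(0)]
    while len(nodes) > 0:
        first = nodes[0][0]
        for k, conditions in first.items():
            for condition in list(conditions):
                if all(k in node[0] and condition in node[0][k] for node in nodes):
                    for node in nodes:
                        node[0][k].remove(condition)
        simplified.append(nodes.pop(0))
    return [({k: v for k, v in r.items() if v != []}, p) for r, p in simplified]

rules = [
    ({'petal_length': ['<2.5']}, 'setosa'),
    ({'petal_length': ['>=2.5', '<4.8'], 'petal_width': ['<1.7']}, 'versicolor'),
    ({'petal_length': ['>=2.5', '<4.8'], 'petal_width': ['>=1.7']}, 'virginica'),
]
print(simplify_nodes(rules))
# [({'petal_length': ['<2.5']}, 'setosa'),
#  ({'petal_width': ['<1.7']}, 'versicolor'),
#  ({}, 'virginica')]        <- the final rule becomes the default case
```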
psyke-0.9.0/psyke/extraction/cart/FairTree.py
ADDED
@@ -0,0 +1,196 @@
+import numpy as np
+from collections import Counter
+
+from sklearn.metrics import accuracy_score, r2_score
+
+
+class Node:
+    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
+        self.feature = feature
+        self.threshold = threshold
+        self.left = left
+        self.right = right
+        self.value = value
+
+    def is_leaf_node(self):
+        return self.value is not None
+
+
+class FairTree:
+    def __init__(self, max_depth=3, max_leaves=None, criterion=None, min_samples_split=2, lambda_penalty=0.0,
+                 protected_attr=None):
+        self.max_depth = max_depth
+        self.max_leaves = max_leaves
+        self.min_samples_split = min_samples_split
+        self.lambda_penalty = lambda_penalty
+        self.protected_attr = protected_attr
+        self.criterion = criterion
+        self.root = None
+        self.n_leaves = 0
+        self.quality_function = None
+
+    def fit(self, X, y):
+        self.n_leaves = 0
+        self.root = self._grow_tree(X, y, depth=0)
+        while self.n_leaves > self.max_leaves:
+            self.prune_least_important_leaf(X, y)
+            self.n_leaves -= 1
+        return self
+
+    @staticmethod
+    def _estimate_output(y):
+        raise NotImplementedError
+
+    def score(self, X, y):
+        raise NotImplementedError
+
+    def predict(self, X):
+        return np.array([self._traverse_tree(x, self.root) for _, x in X.iterrows()])
+
+    def _traverse_tree(self, x, node):
+        if node.is_leaf_node():
+            return node.value
+        if x[node.feature] <= node.threshold:
+            return self._traverse_tree(x, node.left)
+        return self._traverse_tree(x, node.right)
+
+    def _grow_tree(self, X, y, depth):
+        if depth >= self.max_depth or X.shape[0] < self.min_samples_split or len(set(y.values.flatten())) == 1 or \
+                (self.max_leaves is not None and self.n_leaves >= self.max_leaves):
+            self.n_leaves += 1
+            return Node(value=self._estimate_output(y))
+
+        best_feature, best_threshold = self._best_split(X, y)
+        if best_feature is None:
+            self.n_leaves += 1
+            return Node(value=self._estimate_output(y))
+
+        left_idxs = X[best_feature] <= best_threshold
+        right_idxs = X[best_feature] > best_threshold
+
+        left = self._grow_tree(X[left_idxs], y[left_idxs], depth + 1)
+        right = self._grow_tree(X[right_idxs], y[right_idxs], depth + 1)
+        return Node(best_feature, best_threshold, left, right)
+
+    def _best_split(self, X, y):
+        best_gain = -float('inf')
+        split_idx, split_threshold = None, None
+
+        for feature in [feature for feature in X.columns if feature not in self.protected_attr]:
+            for threshold in np.unique(np.quantile(X[feature], np.linspace(0, 1, num=25))):
+                left_idxs = X[feature] <= threshold
+                right_idxs = X[feature] > threshold
+
+                if left_idxs.sum() == 0 or right_idxs.sum() == 0:
+                    continue
+
+                gain = self._fair_gain(y, left_idxs, right_idxs, X[self.protected_attr])
+
+                if gain > best_gain:
+                    best_gain = gain
+                    split_idx = feature
+                    split_threshold = threshold
+        return split_idx, split_threshold
+
+    @staticmethod
+    def _disparity(group):
+        counts = Counter(group)
+        if len(counts) <= 1:
+            return 0.0
+        values = np.array(list(counts.values())) / len(group)
+        return np.abs(values[0] - values[1])
+
+    def _fair_gain(self, y, left_idx, right_idx, protected):
+        child = len(y[left_idx]) / len(y) * self.quality_function(y[left_idx]) + \
+                len(y[right_idx]) / len(y) * self.quality_function(y[right_idx])
+        info_gain = self.quality_function(y) - child
+        penalty = self._disparity(protected[left_idx]) + self._disparity(protected[right_idx])
+        return info_gain - self.lambda_penalty * penalty
+
+    @staticmethod
+    def _match_path(x, path):
+        for node, left in path:
+            if left and x[node.feature] > node.threshold:
+                return False
+            if not left and x[node.feature] <= node.threshold:
+                return False
+        return True
+
+    @staticmethod
+    def candidates(node, parent=None, is_left=None, path=[]):
+        if node is None or node.is_leaf_node():
+            return []
+        leaves = []
+        if node.left.is_leaf_node() and node.right.is_leaf_node():
+            leaves.append((node, parent, is_left, path))
+        leaves += FairTreeClassifier.candidates(node.left, node, True, path + [(node, True)])
+        leaves += FairTreeClassifier.candidates(node.right, node, False, path + [(node, False)])
+        return leaves
+
+    def prune_least_important_leaf(self, X, y):
+        best_score = -np.inf
+        best_prune = None
+
+        for node, parent, is_left, path in self.candidates(self.root):
+            original_left = node.left
+            original_right = node.right
+
+            merged_y = y[(X.apply(lambda x: self._match_path(x, path), axis=1))]
+            if len(merged_y) == 0:
+                continue
+            new_value = self._estimate_output(merged_y)
+            node.left = node.right = None
+            node.value = new_value
+
+            score = self.score(X, y)
+            if score >= best_score:
+                best_score = score
+                best_prune = (node, new_value)
+
+            node.left, node.right, node.value = original_left, original_right, None
+
+        if best_prune:
+            best_prune[0].left = best_prune[0].right = None
+            best_prune[0].value = best_prune[1]
+
+
+class FairTreeClassifier(FairTree):
+    def __init__(self, max_depth=3, max_leaves=None, criterion='entropy', min_samples_split=2, lambda_penalty=0.0,
+                 protected_attr=None):
+        super().__init__(max_depth, max_leaves, criterion, min_samples_split, lambda_penalty, protected_attr)
+        self.quality_function = self._gini if self.criterion == 'gini' else self._entropy
+
+    @staticmethod
+    def _estimate_output(y):
+        return Counter(y.values.flatten()).most_common(1)[0][0]
+
+    def score(self, X, y):
+        return accuracy_score(y.values.flatten(), self.predict(X))
+
+    @staticmethod
+    def _entropy(y):
+        ps = np.unique(y, return_counts=True)[1] / len(y)
+        return -np.sum([p * np.log2(p) for p in ps if p > 0])
+
+    @staticmethod
+    def _gini(y):
+        return 1.0 - np.sum(np.unique(y, return_counts=True)[1] / len(y)**2)
+
+
+class FairTreeRegressor(FairTree):
+    def __init__(self, max_depth=3, max_leaves=None, criterion='mse', min_samples_split=2, lambda_penalty=0.0,
+                 protected_attr=None):
+        super().__init__(max_depth, max_leaves, criterion, min_samples_split, lambda_penalty, protected_attr)
+        self.quality_function = self._mse
+
+    @staticmethod
+    def _estimate_output(y):
+        return np.mean(y.values.flatten())
+
+    def score(self, X, y):
+        return r2_score(y.values.flatten(), self.predict(X))
+
+    @staticmethod
+    def _mse(y):
+        y = y.values.flatten().astype(float)
+        return np.mean((y - np.mean(y))**2)

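FairTree grows a CART-like tree that never splits on the protected attributes and subtracts a demographic-disparity penalty (weighted by `lambda_penalty`) from the information gain of every candidate split; `max_leaves` is enforced after growth by `prune_least_important_leaf`. A toy sketch of direct use (column names illustrative; `protected_attr` follows the iterable-of-columns form that `Cart.make_fair` passes in):

```python
# Toy direct use of the new FairTreeClassifier (illustrative data).
import pandas as pd
from psyke.extraction.cart.FairTree import FairTreeClassifier

X = pd.DataFrame({'income': [10., 20., 30., 40.], 'sex': [0, 1, 0, 1]})
y = pd.DataFrame({'hired': ['no', 'no', 'yes', 'yes']})

# Note: max_leaves must be given, since fit() compares n_leaves against it.
tree = FairTreeClassifier(max_depth=2, max_leaves=4, lambda_penalty=0.5,
                          protected_attr=['sex']).fit(X, y)
print(tree.predict(X))      # splits only on 'income', never on 'sex'
print(tree.score(X, y))     # training accuracy
```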
psyke-0.9.0/psyke/extraction/cart/FairTreePredictor.py
ADDED
@@ -0,0 +1,62 @@
+import copy
+from collections import Iterable
+from typing import Union, Any
+import pandas as pd
+from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
+from tuprolog.core import clause, Var, Struct
+from tuprolog.theory import Theory, mutable_theory
+
+from psyke.extraction.cart import FairTreeClassifier, FairTreeRegressor, LeafSequence, LeafConstraints
+from psyke.extraction.cart.CartPredictor import CartPredictor
+from psyke.schema import LessThan, GreaterThan, SchemaException, DiscreteFeature, Value
+from psyke.utils.logic import create_variable_list, create_head, create_term
+
+
+class FairTreePredictor(CartPredictor):
+    """
+    A wrapper for fair decision and regression trees of psyke.
+    """
+
+    def __init__(self, predictor: Union[FairTreeClassifier, FairTreeRegressor] = DecisionTreeClassifier(),
+                 discretization=None, normalization=None):
+        super().__init__(predictor, discretization, normalization)
+
+    def __iter__(self) -> LeafSequence:
+        leaves = [node for node in self.recurse(self._predictor.root, {})]
+        return (leaf for leaf in leaves)
+
+    @staticmethod
+    def merge_constraints(constraints: LeafConstraints, constraint: Value, feature: str):
+        if feature in constraints:
+            try:
+                constraints[feature][-1] *= constraint
+            except SchemaException:
+                constraints[feature].append(constraint)
+        else:
+            constraints[feature] = [constraint]
+        return constraints
+
+    def recurse(self, node, constraints) -> Union[LeafSequence, tuple[LeafConstraints, Any]]:
+        if node.is_leaf_node():
+            return constraints, node.value
+
+        feature = node.feature
+        threshold = node.threshold if self.normalization is None else \
+            (node.threshold * self.normalization[feature][1] + self.normalization[feature][0])
+
+        left = self.recurse(node.left, self.merge_constraints(copy.deepcopy(constraints), LessThan(threshold), feature))
+        right = self.recurse(node.right, self.merge_constraints(copy.deepcopy(constraints),
+                                                                GreaterThan(threshold), feature))
+        return (left if isinstance(left, list) else [left]) + (right if isinstance(right, list) else [right])
+
+    @property
+    def predictor(self) -> Union[FairTreeClassifier, FairTreeRegressor]:
+        return self._predictor
+
+    @property
+    def n_leaves(self) -> int:
+        return self._predictor.n_leaves
+
+    @predictor.setter
+    def predictor(self, predictor: Union[FairTreeClassifier, FairTreeRegressor]):
+        self._predictor = predictor

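`merge_constraints` is how a root-to-leaf path is turned into per-feature intervals: a new threshold on an already-seen feature is multiplied into the last constraint, falling back to appending when the schema rejects the combination. A sketch, assuming (as the `*=` above implies) that psyke's schema values define `*` as interval intersection and raise `SchemaException` for incompatible combinations:

```python
from psyke.extraction.cart.FairTreePredictor import FairTreePredictor
from psyke.schema import GreaterThan, LessThan

constraints = {}
constraints = FairTreePredictor.merge_constraints(constraints, GreaterThan(1.0), 'x')
constraints = FairTreePredictor.merge_constraints(constraints, LessThan(3.0), 'x')
# 'x' now maps to a single combined constraint (roughly 1.0 < x < 3.0), or to
# [GreaterThan(1.0), LessThan(3.0)] if the product raises SchemaException.
print(constraints)
```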
psyke-0.9.0/psyke/extraction/cart/__init__.py
ADDED
@@ -0,0 +1,71 @@
+from abc import ABC
+
+from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
+
+from psyke.extraction import PedagogicalExtractor
+from psyke import get_default_random_seed
+from psyke.extraction.cart.FairTree import FairTreeClassifier, FairTreeRegressor
+from psyke.schema import DiscreteFeature, Value
+from tuprolog.theory import Theory
+from typing import Iterable, Any
+import pandas as pd
+
+
+TREE_SEED = get_default_random_seed()
+
+LeafConstraints = dict[str, list[Value]]
+LeafSequence = Iterable[tuple[LeafConstraints, Any]]
+
+
+class Cart(PedagogicalExtractor, ABC):
+
+    def __init__(self, predictor, max_depth: int = 3, max_leaves: int = None, max_features=None,
+                 discretization: Iterable[DiscreteFeature] = None,
+                 normalization=None, simplify: bool = True):
+        from psyke.extraction.cart.CartPredictor import CartPredictor
+
+        super().__init__(predictor, discretization, normalization)
+        self.is_fair = None
+        self._cart_predictor = CartPredictor(discretization=discretization, normalization=normalization)
+        self.depth = max_depth
+        self.leaves = max_leaves
+        self.max_features = max_features
+        self._simplify = simplify
+
+    def _extract(self, data: pd.DataFrame) -> Theory:
+        from psyke.extraction.cart.FairTreePredictor import FairTreePredictor
+
+        if self.is_fair:
+            self._cart_predictor = FairTreePredictor(discretization=self.discretization,
+                                                     normalization=self.normalization)
+            fair_tree = FairTreeClassifier if isinstance(data.iloc[0, -1], str) else FairTreeRegressor
+            self._cart_predictor.predictor = fair_tree(max_depth=self.depth, max_leaves=self.leaves,
+                                                       protected_attr=self.is_fair)
+        else:
+            tree = DecisionTreeClassifier if isinstance(data.iloc[0, -1], str) else DecisionTreeRegressor
+            self._cart_predictor.predictor = tree(random_state=TREE_SEED, max_depth=self.depth,
+                                                  max_leaf_nodes=self.leaves, max_features=self.max_features)
+        self._cart_predictor.predictor.fit(data.iloc[:, :-1], data.iloc[:, -1])
+        return self._cart_predictor.create_theory(data, self._simplify)
+
+    def make_fair(self, features: Iterable[str]):
+        self.is_fair = features
+
+    def _predict(self, dataframe: pd.DataFrame) -> Iterable:
+        return self._cart_predictor.predict(dataframe)
+
+    def predict_why(self, data: dict[str, float], verbose=True):
+        prediction = None
+        conditions = {}
+        if self.normalization is not None:
+            data = {k: v * self.normalization[k][1] + self.normalization[k][0] if k in self.normalization else v
+                    for k, v in data.items()}
+        for conditions, prediction in self._cart_predictor:
+            if all(all(interval.is_in(data[variable]) for interval in intervals)
+                   for variable, intervals in conditions.items()):
+                break
+        return prediction, conditions
+
+    @property
+    def n_rules(self) -> int:
+        return self._cart_predictor.n_leaves

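In the rewritten `Cart`, fairness is a mode switch: `make_fair` stores the protected columns in `is_fair`, and the next `_extract` builds a FairTree-backed predictor instead of a scikit-learn tree. A sketch, reusing the `df` and `predictor` placeholders from the earlier example:

```python
# Fair extraction via Cart (sketch; df and predictor as defined above).
extractor = Extractor.cart(predictor, max_depth=3, max_leaves=8)
extractor.make_fair(['sex'])        # protected columns -> FairTree's protected_attr
theory = extractor.extract(df)      # now trained via FairTreeClassifier/Regressor
```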
{psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/extraction/hypercubic/__init__.py
@@ -1,7 +1,8 @@
 from __future__ import annotations
 
-import math
 from abc import ABC
+from collections import Iterable
+
 import numpy as np
 import pandas as pd
 from sklearn.base import ClassifierMixin
@@ -13,7 +14,7 @@ from psyke.extraction import PedagogicalExtractor
 from psyke.extraction.hypercubic.hypercube import HyperCube, RegressionCube, ClassificationCube, ClosedCube, Point, \
     GenericCube
 from psyke.hypercubepredictor import HyperCubePredictor
-from psyke.schema import
+from psyke.schema import Value
 from psyke.utils.logic import create_variable_list, create_head, to_var, Simplifier
 from psyke.utils import Target
 from psyke.extraction.hypercubic.strategy import Strategy, FixedStrategy
@@ -209,10 +210,16 @@ class FeatureRanker:
 
 
 class Grid:
-    def __init__(self, iterations: int = 1, strategy: Strategy |
+    def __init__(self, iterations: int = 1, strategy: Strategy | Iterable[Strategy] = FixedStrategy()):
         self.iterations = iterations
         self.strategy = strategy
 
+    def make_fair(self, features: Iterable[str]):
+        if isinstance(self.strategy, Strategy):
+            self.strategy.make_fair(features)
+        elif isinstance(self.strategy, Iterable):
+            [strategy.make_fair(features) for strategy in self.strategy]
+
     def get(self, feature: str, depth: int) -> int:
         if isinstance(self.strategy, list):
            return self.strategy[depth].get(feature)

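`Grid.make_fair` simply forwards to the underlying strategy, or to each strategy in a list. A sketch, assuming `Grid.get` falls through to `strategy.get` for a single strategy (the unchanged branch of the method, not shown in this hunk) and using the `FixedStrategy` semantics from strategy.py below:

```python
from psyke.extraction.hypercubic import Grid
from psyke.extraction.hypercubic.strategy import FixedStrategy

grid = Grid(iterations=2, strategy=FixedStrategy(3))
grid.make_fair(['sex'])
print(grid.get('income', 0))   # 3
print(grid.get('sex', 0))      # 1: the protected feature is never partitioned
```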
{psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/extraction/hypercubic/creepy/__init__.py
@@ -17,7 +17,7 @@ class CReEPy(HyperCubeExtractor):
     """
 
     def __init__(self, predictor, clustering=Clustering.exact, depth: int = 3, error_threshold: float = 0.1,
-                 output: Target = Target.CONSTANT, gauss_components: int = 5, ranks:
+                 output: Target = Target.CONSTANT, gauss_components: int = 5, ranks: Iterable[(str, float)] = tuple(),
                 ignore_threshold: float = 0.0, discretization=None, normalization=None,
                 seed: int = get_default_random_seed()):
         super().__init__(predictor, Target.CLASSIFICATION if isinstance(predictor, ClassifierMixin) else output,

{psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/extraction/hypercubic/iter/__init__.py
@@ -23,6 +23,7 @@ class ITER(HyperCubeExtractor):
             raise NotImplementedError
         self.predictor = predictor
         self.min_update = min_update
+        self._init_points = n_points
         self.n_points = n_points
         self.max_iterations = max_iterations
         self.min_examples = min_examples
@@ -33,6 +34,10 @@ class ITER(HyperCubeExtractor):
         self.seed = seed
         self.ignore_dimensions = ignore_dimensions if ignore_dimensions is not None else []
 
+    def make_fair(self, features: Iterable[str]):
+        self.n_points = self._init_points
+        self.ignore_dimensions += list(features)
+
     def _best_cube(self, dataframe: pd.DataFrame, cube: GenericCube, cubes: Iterable[Expansion]) -> Expansion | None:
         expansions = []
         for limit in cubes:

{psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/extraction/hypercubic/strategy.py
@@ -1,16 +1,20 @@
 from __future__ import annotations
 
 from functools import reduce
-from
+from collections import Iterable
 
 
 class Strategy:
-    def __init__(self):
-        self._partitions =
+    def __init__(self, partitions = None):
+        self._partitions = partitions
+        self._no_features = []
 
     def get(self, feature: str) -> int:
         raise NotImplementedError
 
+    def make_fair(self, features: Iterable[str]):
+        self._no_features = features
+
     def partition_number(self, features: Iterable[str]) -> int:
         return reduce(lambda x, y: x * y, map(self.get, features), 1)
 
@@ -29,23 +33,23 @@ class Strategy:
 
 class FixedStrategy(Strategy):
     def __init__(self, partitions: int = 2):
-        super().__init__()
-        self._partitions = partitions
+        super().__init__(partitions)
 
     def get(self, feature: str) -> int:
-        return self._partitions
+        return 1 if feature in self._no_features else self._partitions
 
     def __str__(self):
         return "Fixed ({})".format(super().__str__())
 
 
 class AdaptiveStrategy(Strategy):
-    def __init__(self, features: Iterable[str], partitions: Iterable[tuple[float, float]] | None = None):
-        super().__init__()
+    def __init__(self, features: Iterable[(str, float)], partitions: Iterable[tuple[float, float]] | None = None):
+        super().__init__(partitions if partitions is not None else [(0.33, 2), (0.67, 3)])
         self.features = features
-        self._partitions = partitions if partitions is not None else [(0.33, 2), (0.67, 3)]
 
     def get(self, feature: str) -> int:
+        if feature in self._no_features:
+            return 1
         importance = next(filter(lambda t: t[0] == feature, self.features))[1]
         n = 1
         for (imp, part) in self._partitions:

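The strategy-level effect is easy to check in isolation: after `make_fair`, a protected feature always gets one partition, so it no longer multiplies `partition_number` and the grid never slices along it:

```python
from psyke.extraction.hypercubic.strategy import FixedStrategy

strategy = FixedStrategy(partitions=2)
print(strategy.partition_number(['income', 'sex']))  # 4 (2 * 2)
strategy.make_fair(['sex'])
print(strategy.get('sex'))                           # 1
print(strategy.partition_number(['income', 'sex']))  # 2 (2 * 1)
```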
{psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/tuning/pedro/__init__.py
@@ -55,8 +55,10 @@ class PEDRO(SKEOptimizer, IterativeOptimizer):
         patience = self.patience
         while patience > 0:
             print("{}. {}. Threshold = {:.2f}. ".format(self.algorithm_name, grid, threshold), end="")
-
-
+            param_dict = dict(min_examples=25, threshold=threshold, normalization=self.normalization)
+            if self.algorithm != Extractor.gridrex:
+                param_dict['output'] = self.output
+            extractor = self.algorithm(self.predictor, grid, **param_dict)
             _ = extractor.extract(self.dataframe)
             error_function = (lambda *x: 1 - extractor.accuracy(*x)) if self.output == Target.CLASSIFICATION \
                 else extractor.mae

{psyke-0.8.14.dev6 → psyke-0.9.0}/psyke/utils/logic.py
@@ -126,7 +126,7 @@ def to_var(name: str) -> Var:
 def create_variable_list(features: list[DiscreteFeature], dataset: pd.DataFrame = None) -> dict[str, Var]:
     dataset = dataset.columns[:-1] if dataset is not None else None
     values = {feature.name: to_var(feature.name) for feature in features} \
-        if
+        if features else {name: to_var(name) for name in dataset}
     return values
 
 

{psyke-0.8.14.dev6 → psyke-0.9.0}/psyke.egg-info/SOURCES.txt
@@ -17,8 +17,10 @@ psyke/clustering/utils.py
 psyke/clustering/cream/__init__.py
 psyke/clustering/exact/__init__.py
 psyke/extraction/__init__.py
+psyke/extraction/cart/CartPredictor.py
+psyke/extraction/cart/FairTree.py
+psyke/extraction/cart/FairTreePredictor.py
 psyke/extraction/cart/__init__.py
-psyke/extraction/cart/predictor.py
 psyke/extraction/hypercubic/__init__.py
 psyke/extraction/hypercubic/hypercube.py
 psyke/extraction/hypercubic/strategy.py

psyke-0.8.14.dev6/VERSION
DELETED
@@ -1 +0,0 @@
-0.8.14.dev6

psyke-0.8.14.dev6/psyke/extraction/__init__.py
DELETED
@@ -1,21 +0,0 @@
-from abc import ABC
-
-import pandas as pd
-from tuprolog.theory import Theory
-
-from psyke import Extractor
-
-
-class PedagogicalExtractor(Extractor, ABC):
-
-    def __init__(self, predictor, discretization=None, normalization=None):
-        Extractor.__init__(self, predictor=predictor, discretization=discretization, normalization=normalization)
-
-    def extract(self, dataframe: pd.DataFrame) -> Theory:
-        new_y = pd.DataFrame(self.predictor.predict(dataframe.iloc[:, :-1])).set_index(dataframe.index)
-        data = dataframe.iloc[:, :-1].copy().join(new_y)
-        data.columns = dataframe.columns
-        return self._extract(data)
-
-    def _extract(self, dataframe: pd.DataFrame) -> Theory:
-        raise NotImplementedError('extract')

psyke-0.8.14.dev6/psyke/extraction/cart/__init__.py
DELETED
@@ -1,96 +0,0 @@
-from abc import ABC
-
-from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
-
-from psyke.extraction import PedagogicalExtractor
-from psyke.extraction.cart.predictor import CartPredictor, LeafConstraints, LeafSequence
-from psyke import get_default_random_seed
-from psyke.schema import GreaterThan, DiscreteFeature
-from psyke.utils.logic import create_variable_list, create_head, create_term
-from tuprolog.core import clause, Var, Struct
-from tuprolog.theory import Theory, mutable_theory
-from typing import Iterable
-import pandas as pd
-
-
-TREE_SEED = get_default_random_seed()
-
-
-class Cart(PedagogicalExtractor, ABC):
-
-    def __init__(self, predictor, max_depth: int = 3, max_leaves: int = None, max_features=None,
-                 discretization: Iterable[DiscreteFeature] = None,
-                 normalization=None, simplify: bool = True):
-        super().__init__(predictor, discretization, normalization)
-        self._cart_predictor = CartPredictor(normalization=normalization)
-        self.depth = max_depth
-        self.leaves = max_leaves
-        self.max_features = max_features
-        self._simplify = simplify
-
-    def _create_body(self, variables: dict[str, Var], conditions: LeafConstraints) -> Iterable[Struct]:
-        results = []
-        for feature_name, cond_list in conditions.items():
-            for condition in cond_list:
-                features = [d for d in self.discretization if feature_name in d.admissible_values]
-                feature: DiscreteFeature = features[0] if len(features) > 0 else None
-                results.append(create_term(variables[feature_name], condition) if feature is None else
-                               create_term(variables[feature.name],
-                                           feature.admissible_values[feature_name],
-                                           isinstance(condition, GreaterThan)))
-        return results
-
-    @staticmethod
-    def _simplify_nodes(nodes: list) -> Iterable:
-        simplified = [nodes.pop(0)]
-        while len(nodes) > 0:
-            first_node = nodes[0][0]
-            for k, conditions in first_node.items():
-                for condition in conditions:
-                    if all(k in node[0] and condition in node[0][k] for node in nodes):
-                        [node[0][k].remove(condition) for node in nodes]
-            simplified.append(nodes.pop(0))
-        return [({k: v for k, v in rule.items() if v != []}, prediction) for rule, prediction in simplified]
-
-    def _create_theory(self, data: pd.DataFrame) -> Theory:
-        new_theory = mutable_theory()
-        nodes = [node for node in self._cart_predictor]
-        nodes = Cart._simplify_nodes(nodes) if self._simplify else nodes
-        for (constraints, prediction) in nodes:
-            if self.normalization is not None and data.columns[-1] in self.normalization:
-                m, s = self.normalization[data.columns[-1]]
-                prediction = prediction * s + m
-            variables = create_variable_list(self.discretization, data)
-            new_theory.assertZ(
-                clause(
-                    create_head(data.columns[-1], list(variables.values()), prediction),
-                    self._create_body(variables, constraints)
-                )
-            )
-        return new_theory
-
-    def _extract(self, data: pd.DataFrame) -> Theory:
-        tree = DecisionTreeClassifier if isinstance(data.iloc[0, -1], str) else DecisionTreeRegressor
-        self._cart_predictor.predictor = tree(random_state=TREE_SEED, max_depth=self.depth,
-                                              max_leaf_nodes=self.leaves, max_features=self.max_features)
-        self._cart_predictor.predictor.fit(data.iloc[:, :-1], data.iloc[:, -1])
-        return self._create_theory(data)
-
-    def _predict(self, dataframe: pd.DataFrame) -> Iterable:
-        return self._cart_predictor.predict(dataframe)
-
-    def predict_why(self, data: dict[str, float], verbose=True):
-        prediction = None
-        conditions = {}
-        if self.normalization is not None:
-            data = {k: v * self.normalization[k][1] + self.normalization[k][0] if k in self.normalization else v
-                    for k, v in data.items()}
-        for conditions, prediction in self._cart_predictor:
-            if all(all(interval.is_in(data[variable]) for interval in intervals)
-                   for variable, intervals in conditions.items()):
-                break
-        return prediction, conditions
-
-    @property
-    def n_rules(self) -> int:
-        return self._cart_predictor.n_leaves