psyke 0.4.9.dev6__py3-none-any.whl → 1.0.4.dev10__py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
Files changed (42)
  1. psyke/__init__.py +231 -85
  2. psyke/clustering/__init__.py +9 -4
  3. psyke/clustering/cream/__init__.py +6 -10
  4. psyke/clustering/exact/__init__.py +17 -11
  5. psyke/clustering/utils.py +0 -1
  6. psyke/extraction/__init__.py +25 -0
  7. psyke/extraction/cart/CartPredictor.py +128 -0
  8. psyke/extraction/cart/FairTree.py +205 -0
  9. psyke/extraction/cart/FairTreePredictor.py +56 -0
  10. psyke/extraction/cart/__init__.py +48 -62
  11. psyke/extraction/hypercubic/__init__.py +187 -47
  12. psyke/extraction/hypercubic/cosmik/__init__.py +47 -0
  13. psyke/extraction/hypercubic/creepy/__init__.py +24 -29
  14. psyke/extraction/hypercubic/divine/__init__.py +86 -0
  15. psyke/extraction/hypercubic/ginger/__init__.py +100 -0
  16. psyke/extraction/hypercubic/gridex/__init__.py +45 -84
  17. psyke/extraction/hypercubic/gridrex/__init__.py +4 -4
  18. psyke/extraction/hypercubic/hex/__init__.py +104 -0
  19. psyke/extraction/hypercubic/hypercube.py +275 -72
  20. psyke/extraction/hypercubic/iter/__init__.py +45 -46
  21. psyke/extraction/hypercubic/strategy.py +13 -9
  22. psyke/extraction/real/__init__.py +24 -29
  23. psyke/extraction/real/utils.py +2 -2
  24. psyke/extraction/trepan/__init__.py +24 -19
  25. psyke/genetic/__init__.py +0 -0
  26. psyke/genetic/fgin/__init__.py +74 -0
  27. psyke/genetic/gin/__init__.py +144 -0
  28. psyke/hypercubepredictor.py +102 -0
  29. psyke/schema/__init__.py +230 -36
  30. psyke/tuning/__init__.py +40 -28
  31. psyke/tuning/crash/__init__.py +33 -64
  32. psyke/tuning/orchid/__init__.py +21 -23
  33. psyke/tuning/pedro/__init__.py +70 -56
  34. psyke/utils/logic.py +8 -8
  35. psyke/utils/plot.py +79 -3
  36. {psyke-0.4.9.dev6.dist-info → psyke-1.0.4.dev10.dist-info}/METADATA +42 -22
  37. psyke-1.0.4.dev10.dist-info/RECORD +46 -0
  38. {psyke-0.4.9.dev6.dist-info → psyke-1.0.4.dev10.dist-info}/WHEEL +1 -1
  39. {psyke-0.4.9.dev6.dist-info → psyke-1.0.4.dev10.dist-info/licenses}/LICENSE +2 -1
  40. psyke/extraction/cart/predictor.py +0 -73
  41. psyke-0.4.9.dev6.dist-info/RECORD +0 -36
  42. {psyke-0.4.9.dev6.dist-info → psyke-1.0.4.dev10.dist-info}/top_level.txt +0 -0
psyke/__init__.py CHANGED
@@ -5,17 +5,20 @@ from enum import Enum
 
 import numpy as np
 import pandas as pd
-from numpy import argmax
+from matplotlib import pyplot as plt
 from sklearn.linear_model import LinearRegression
 from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, f1_score, accuracy_score, \
     adjusted_rand_score, adjusted_mutual_info_score, v_measure_score, fowlkes_mallows_score
+from tuprolog.solve.prolog import prolog_solver
 
 from psyke.schema import DiscreteFeature
 from psyke.utils import get_default_random_seed, Target, get_int_precision
-from tuprolog.theory import Theory
+from tuprolog.theory import Theory, mutable_theory
 from typing import Iterable
 import logging
 
+from psyke.utils.logic import get_in_rule, data_to_struct, get_not_in_rule
+
 logging.basicConfig(level=logging.WARN)
 logger = logging.getLogger('psyke')
 
@@ -45,28 +48,36 @@ class EvaluableModel(object):
         V = 3,
         FMI = 4
 
-    def __init__(self, normalization=None):
+    def __init__(self, discretization=None, normalization=None):
+        self.discretization = [] if discretization is None else list(discretization)
         self.normalization = normalization
 
-    def predict(self, dataframe: pd.DataFrame, mapping: dict[str: int] = None) -> Iterable:
+    def predict(self, dataframe: pd.DataFrame) -> Iterable:
         """
         Predicts the output values of every sample in dataset.
 
-        :param dataframe: is the set of instances to predict.
-        :param mapping: for one-hot encoding.
+        :param dataframe: the set of instances to predict.
         :return: a list of predictions.
         """
-        ys = self._predict(dataframe)
-        if mapping is not None:
-            inverse_mapping = {v: k for k, v in mapping.items()}
-            ys = [inverse_mapping[y] for y in ys]
-        return ys
+        return self.__convert(self._predict(dataframe))
 
     def _predict(self, dataframe: pd.DataFrame) -> Iterable:
         raise NotImplementedError('predict')
 
+    def __convert(self, ys: Iterable) -> Iterable:
+        if self.normalization is not None and len(ys) > 0 and not isinstance([p for p in ys if p is not None][0], str):
+            m, s = self.normalization[list(self.normalization.keys())[-1]]
+            ys = [prediction if prediction is None else prediction * s + m for prediction in ys]
+        return ys
+
+    def brute_predict(self, dataframe: pd.DataFrame, criterion: str = 'corner', n: int = 2) -> Iterable:
+        return self.__convert(self._brute_predict(dataframe, criterion, n))
+
+    def _brute_predict(self, dataframe: pd.DataFrame, criterion: str = 'corner', n: int = 2) -> Iterable:
+        raise NotImplementedError('brute_predict')
+
     def unscale(self, values, name):
-        if self.normalization is None or isinstance(values, LinearRegression):
+        if self.normalization is None or name not in self.normalization or isinstance(values, LinearRegression):
            return values
         if isinstance(values, Iterable):
             values = [None if value is None else
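
The new private `__convert` step replaces the old `mapping`-based post-processing: numeric predictions are de-normalized with the mean and standard deviation stored for the output column (the last entry of the `normalization` dict), while `None` (uncovered samples) and string labels pass through untouched. A minimal sketch of that rescaling, with invented statistics:

```python
# Illustrative only: the normalization dict and its values are made up,
# but the rescaling mirrors EvaluableModel.__convert above.
normalization = {'petal_width': (1.20, 0.76), 'target': (5.84, 0.83)}

m, s = normalization[list(normalization.keys())[-1]]  # stats of the output column
scaled = [0.5, None, -1.2]                            # None marks an uncovered sample
predictions = [y if y is None else y * s + m for y in scaled]
print(predictions)  # [6.255, None, 4.844]
```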
@@ -76,9 +87,13 @@ class EvaluableModel(object):
             return values
 
     def score(self, dataframe: pd.DataFrame, predictor=None, fidelity: bool = False, completeness: bool = True,
+              brute: bool = False, criterion: str = 'corners', n: int = 2,
              task: EvaluableModel.Task = Task.CLASSIFICATION,
-              scoring_function: Iterable[EvaluableModel.Score] = [ClassificationScore.ACCURACY]):
-        extracted = np.array(self.predict(dataframe.iloc[:, :-1]))
+              scoring_function: Iterable[EvaluableModel.Score] = (ClassificationScore.ACCURACY, )):
+        extracted = np.array(
+            self.predict(dataframe.iloc[:, :-1]) if not brute else
+            self.brute_predict(dataframe.iloc[:, :-1], criterion, n)
+        )
         idx = [prediction is not None for prediction in extracted]
         y_extracted = extracted[idx]
         true = [dataframe.iloc[idx, -1]]
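
`score` now threads `brute`, `criterion` and `n` through to `brute_predict` ahead of the positional `task` argument, which is why every metric helper below passes them explicitly. A hypothetical call, where `extractor` is an already-fitted extractor and `test` a dataframe with the target in its last column; note the differing defaults (`'corners'` here versus `'corner'` in `brute_predict`):

```python
from psyke import Extractor

# Hypothetical usage; `extractor` and `test` are placeholders.
scores = extractor.score(
    test, predictor=None, fidelity=False, completeness=True,
    brute=True, criterion='corner', n=2,
    task=Extractor.Task.CLASSIFICATION,
    scoring_function=(Extractor.ClassificationScore.ACCURACY,),
)
accuracy = scores[Extractor.ClassificationScore.ACCURACY][-1]
```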
@@ -134,110 +149,263 @@ class Extractor(EvaluableModel, ABC):
     ----------
     predictor : the underling black box predictor.
     discretization : A collection of sets of discretised features.
-    Each set corresponds to a set of features derived from a single non-discrete feature.
+        Each set corresponds to a set of features derived from a single non-discrete feature.
     """
 
     def __init__(self, predictor, discretization: Iterable[DiscreteFeature] = None, normalization=None):
-        super().__init__(normalization)
+        super().__init__(discretization, normalization)
         self.predictor = predictor
-        self.discretization = [] if discretization is None else list(discretization)
+        self.theory = None
 
-    def extract(self, dataframe: pd.DataFrame, mapping: dict[str: int] = None, sort: bool = True) -> Theory:
+    def extract(self, dataframe: pd.DataFrame) -> Theory:
         """
         Extracts rules from the underlying predictor.
 
-        :param dataframe: is the set of instances to be used for the extraction.
-        :param mapping: for one-hot encoding.
-        :param sort: alphabetically sort the variables of the head of the rules.
+        :param dataframe: the set of instances to be used for the extraction.
         :return: the theory created from the extracted rules.
         """
         raise NotImplementedError('extract')
 
-    def mae(self, dataframe: pd.DataFrame, predictor=None) -> float:
+    def predict_why(self, data: dict[str, float], verbose: bool = True):
+        """
+        Provides a prediction and the corresponding explanation.
+        :param data: the instance to predict.
+        :param verbose: if True the explanation is printed.
+        """
+        raise NotImplementedError('predict_why')
+
+    def predict_counter(self, data: dict[str, float], verbose: bool = True, only_first: bool = True):
+        """
+        Provides a prediction and counterfactual explanations.
+        :param data: the instance to predict.
+        :param verbose: if True the counterfactual explanation is printed.
+        :param only_first: if True only the closest counterfactual explanation is provided for each distinct class.
+        """
+        raise NotImplementedError('predict_counter')
+
+    def plot_fairness(self, dataframe: pd.DataFrame, groups: dict[str, list], colormap='seismic_r', filename=None,
+                      figsize=(5, 4)):
+        """
+        Provides a visual estimation of the fairness exhibited by an extractor with respect to the specified groups.
+        :param dataframe: the set of instances to be used for the estimation.
+        :param groups: the set of relevant groups to consider.
+        :param colormap: the colormap to use for the plot.
+        :param filename: if not None, name used to save the plot.
+        :param figsize: size of the plot.
+        """
+        counts = {group: len(dataframe[idx_g]) for group, idx_g in groups.items()}
+        output = {'labels': []}
+        for group in groups:
+            output[group] = []
+        for i, clause in enumerate(self.theory.clauses):
+            if len(dataframe) == 0:
+                break
+            solver = prolog_solver(static_kb=mutable_theory(clause).assertZ(get_in_rule()).assertZ(get_not_in_rule()))
+            idx = np.array([query.is_yes for query in
+                            [solver.solveOnce(data_to_struct(data)) for _, data in dataframe.iterrows()]])
+            # print(f'Rule {i + 1}. Outcome {clause.head.args[-1]}. Affecting', end='')
+            output['labels'].append(str(clause.head.args[-1]))
+            for group, idx_g in groups.items():
+                # print(f' {len(dataframe[idx & idx_g]) / counts[group]:.2f}%{group}', end='')
+                output[group].append(len(dataframe[idx & idx_g]) / counts[group])
+            dataframe = dataframe[~idx]
+            groups = {group: indices[~idx] for group, indices in groups.items()}
+            # print(f'. Left {len(dataframe)} instances')
+
+        binary = len(set(output['labels'])) == 2
+        labels = sorted(set(output['labels']))
+        data = np.vstack([output[group] for group in groups]).T * 100
+        if binary:
+            data[np.array(output['labels']) == labels[0]] *= -1
+
+        plt.figure(figsize=figsize)
+        plt.imshow(data, cmap=colormap, vmin=-100 if binary else 0, vmax=100)
+
+        plt.gca().set_xticks(range(len(groups)), labels=groups.keys())
+        plt.gca().set_yticks(range(len(output['labels'])),
+                             labels=[f'Rule {i + 1}\n{l}' for i, l in enumerate(output['labels'])])
+
+        plt.xlabel('Groups')
+        plt.ylabel('Rules')
+        plt.title("Rule set impact on groups")
+
+        for i in range(len(output['labels'])):
+            for j in range(len(groups)):
+                plt.gca().text(j, i, f'{abs(data[i, j]):.2f}%', ha="center", va="center", color="k")
+
+        plt.gca().set_xticks([i + .5 for i in range(len(groups))], minor=True)
+        plt.gca().set_yticks([i + .5 for i in range(len(output['labels']))], minor=True)
+        plt.gca().grid(which='minor', color='k', linestyle='-', linewidth=.8)
+        plt.gca().tick_params(which='minor', bottom=False, left=False)
+        cbarticks = np.linspace(-100 if binary else 0, 100, 9 if binary else 11, dtype=int)
+        cbar = plt.colorbar(fraction=0.046, label='Affected samples (%)', ticks=cbarticks)
+        if binary:
+            ticklabels = [str(-i) if i < 0 else str(i) for i in cbarticks]
+            ticklabels[0] += f' {labels[0]}'
+            ticklabels[-1] += f' {labels[-1]}'
+            cbar.ax.set_yticklabels(ticklabels)
+
+        plt.tight_layout()
+        if filename is not None:
+            plt.savefig(filename, dpi=500)
+        plt.show()
+
+    def make_fair(self, features: Iterable[str]):
+        raise NotImplementedError(f'Fairness for {type(self).__name__} is not supported at the moment')
+
+    def mae(self, dataframe: pd.DataFrame, predictor=None, brute: bool = False, criterion: str = 'center',
+            n: int = 3) -> float:
         """
         Calculates the predictions' MAE w.r.t. the instances given as input.
 
-        :param dataframe: is the set of instances to be used to calculate the mean absolute error.
+        :param dataframe: the set of instances to be used to calculate the mean absolute error.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
+        :param brute: if True, a brute prediction is executed.
+        :param criterion: criterion for brute prediction.
+        :param n: number of points for brute prediction with 'perimeter' criterion.
         :return: the mean absolute error (MAE) of the predictions.
         """
-        return self.score(dataframe, predictor, predictor is not None, False, Extractor.Task.REGRESSION,
-                          [Extractor.RegressionScore.MAE])[Extractor.RegressionScore.MAE][-1]
+        return self.score(dataframe, predictor, predictor is not None, False, brute, criterion, n,
+                          Extractor.Task.REGRESSION, [Extractor.RegressionScore.MAE])[Extractor.RegressionScore.MAE][-1]
 
-    def mse(self, dataframe: pd.DataFrame, predictor=None) -> float:
+    def mse(self, dataframe: pd.DataFrame, predictor=None, brute: bool = False, criterion: str = 'center',
+            n: int = 3) -> float:
         """
         Calculates the predictions' MSE w.r.t. the instances given as input.
 
-        :param dataframe: is the set of instances to be used to calculate the mean squared error.
+        :param dataframe: the set of instances to be used to calculate the mean squared error.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
+        :param brute: if True, a brute prediction is executed.
+        :param criterion: criterion for brute prediction.
+        :param n: number of points for brute prediction with 'perimeter' criterion.
         :return: the mean squared error (MSE) of the predictions.
         """
-        return self.score(dataframe, predictor, predictor is not None, False, Extractor.Task.REGRESSION,
-                          [Extractor.RegressionScore.MSE])[Extractor.RegressionScore.MSE][-1]
+        return self.score(dataframe, predictor, predictor is not None, False, brute, criterion, n,
+                          Extractor.Task.REGRESSION, [Extractor.RegressionScore.MSE])[Extractor.RegressionScore.MSE][-1]
 
-    def r2(self, dataframe: pd.DataFrame, predictor=None) -> float:
+    def r2(self, dataframe: pd.DataFrame, predictor=None, brute: bool = False, criterion: str = 'center',
+           n: int = 3) -> float:
         """
         Calculates the predictions' R2 score w.r.t. the instances given as input.
 
-        :param dataframe: is the set of instances to be used to calculate the R2 score.
+        :param dataframe: the set of instances to be used to calculate the R2 score.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
+        :param brute: if True, a brute prediction is executed.
+        :param criterion: criterion for brute prediction.
+        :param n: number of points for brute prediction with 'perimeter' criterion.
         :return: the R2 score of the predictions.
         """
-        return self.score(dataframe, predictor, predictor is not None, False,
+        return self.score(dataframe, predictor, predictor is not None, False, brute, criterion, n,
                           Extractor.Task.REGRESSION, [Extractor.RegressionScore.R2])[Extractor.RegressionScore.R2][-1]
 
-    def accuracy(self, dataframe: pd.DataFrame, predictor=None) -> float:
+    def accuracy(self, dataframe: pd.DataFrame, predictor=None, brute: bool = False, criterion: str = 'center',
+                 n: int = 3) -> float:
         """
         Calculates the predictions' accuracy classification score w.r.t. the instances given as input.
 
-        :param dataframe: is the set of instances to be used to calculate the accuracy classification score.
+        :param dataframe: the set of instances to be used to calculate the accuracy classification score.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
+        :param brute: if True, a brute prediction is executed.
+        :param criterion: criterion for brute prediction.
+        :param n: number of points for brute prediction with 'perimeter' criterion.
         :return: the accuracy classification score of the predictions.
         """
-        return self.score(dataframe, predictor, predictor is not None, False, Extractor.Task.CLASSIFICATION,
+        return self.score(dataframe, predictor, predictor is not None, False, brute, criterion, n,
+                          Extractor.Task.CLASSIFICATION,
                           [Extractor.ClassificationScore.ACCURACY])[Extractor.ClassificationScore.ACCURACY][-1]
 
-    def f1(self, dataframe: pd.DataFrame, predictor=None) -> float:
+    def f1(self, dataframe: pd.DataFrame, predictor=None, brute: bool = False, criterion: str = 'center',
+           n: int = 3) -> float:
         """
         Calculates the predictions' F1 score w.r.t. the instances given as input.
 
-        :param dataframe: is the set of instances to be used to calculate the F1 score.
+        :param dataframe: the set of instances to be used to calculate the F1 score.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
+        :param brute: if True, a brute prediction is executed.
+        :param criterion: criterion for brute prediction.
+        :param n: number of points for brute prediction with 'perimeter' criterion.
         :return: the F1 score of the predictions.
         """
-        return self.score(dataframe, predictor, predictor is not None, False, Extractor.Task.CLASSIFICATION,
+        return self.score(dataframe, predictor, predictor is not None, False, brute, criterion, n,
+                          Extractor.Task.CLASSIFICATION,
                           [Extractor.ClassificationScore.F1])[Extractor.ClassificationScore.F1][-1]
 
     @staticmethod
-    def cart(predictor, max_depth: int = 3, max_leaves: int = 3,
+    def cart(predictor, max_depth: int = 3, max_leaves: int = 3, max_features=None,
             discretization: Iterable[DiscreteFeature] = None, normalization=None, simplify: bool = True) -> Extractor:
         """
         Creates a new Cart extractor.
         """
         from psyke.extraction.cart import Cart
-        return Cart(predictor, max_depth, max_leaves, discretization=discretization, normalization=normalization,
-                    simplify=simplify)
+        return Cart(predictor, max_depth, max_leaves, max_features,
+                    discretization=discretization, normalization=normalization, simplify=simplify)
+
+    @staticmethod
+    def divine(predictor, k: int = 5, patience: int = 15, close_to_center: bool = True,
+               discretization: Iterable[DiscreteFeature] = None, normalization=None,
+               seed: int = get_default_random_seed()) -> Extractor:
+        """
+        Creates a new DiViNE extractor.
+        """
+        from psyke.extraction.hypercubic.divine import DiViNE
+        return DiViNE(predictor, k=k, patience=patience, close_to_center=close_to_center,
+                      discretization=discretization, normalization=normalization, seed=seed)
+
+    @staticmethod
+    def cosmik(predictor, max_components: int = 4, k: int = 5, patience: int = 15, close_to_center: bool = True,
+               output: Target = Target.CONSTANT, discretization: Iterable[DiscreteFeature] = None, normalization=None,
+               seed: int = get_default_random_seed()) -> Extractor:
+        """
+        Creates a new COSMiK extractor.
+        """
+        from psyke.extraction.hypercubic.cosmik import COSMiK
+        return COSMiK(predictor, max_components=max_components, k=k, patience=patience, close_to_center=close_to_center,
+                      output=output, discretization=discretization, normalization=normalization, seed=seed)
 
     @staticmethod
     def iter(predictor, min_update: float = 0.1, n_points: int = 1, max_iterations: int = 600, min_examples: int = 250,
-             threshold: float = 0.1, fill_gaps: bool = True, normalization: dict[str, tuple[float, float]] = None,
-             output=None, seed: int = get_default_random_seed()) -> Extractor:
+             threshold: float = 0.1, fill_gaps: bool = True, ignore_dimensions=None,
+             normalization: dict[str, tuple[float, float]] = None, output=None,
+             seed: int = get_default_random_seed()) -> Extractor:
         """
         Creates a new ITER extractor.
         """
         from psyke.extraction.hypercubic.iter import ITER
         return ITER(predictor, min_update, n_points, max_iterations, min_examples, threshold, fill_gaps,
-                    normalization, output, seed)
+                    ignore_dimensions, normalization, output, seed)
 
     @staticmethod
-    def gridex(predictor, grid, min_examples: int = 250, threshold: float = 0.1,
-               normalization: dict[str, tuple[float, float]] = None,
+    def gridex(predictor, grid, min_examples: int = 250, threshold: float = 0.1, output: Target = Target.CONSTANT,
+               discretization=None, normalization: dict[str, tuple[float, float]] = None,
               seed: int = get_default_random_seed()) -> Extractor:
         """
         Creates a new GridEx extractor.
         """
         from psyke.extraction.hypercubic.gridex import GridEx
-        return GridEx(predictor, grid, min_examples, threshold, normalization, seed)
+        return GridEx(predictor, grid, min_examples, threshold, output, discretization, normalization, seed)
+
+    @staticmethod
+    def hex(predictor, grid, min_examples: int = 250, threshold: float = 0.1, output: Target = Target.CONSTANT,
+            discretization=None, normalization: dict[str, tuple[float, float]] = None,
+            seed: int = get_default_random_seed()) -> Extractor:
+        """
+        Creates a new HEx extractor.
+        """
+        from psyke.extraction.hypercubic.hex import HEx
+        return HEx(predictor, grid, min_examples, threshold, output, discretization, normalization, seed)
+
+    @staticmethod
+    def ginger(predictor, features: Iterable[str], sigmas: Iterable[float], max_slices: int, min_rules: int = 1,
+               max_poly: int = 1, alpha: float = 0.5, indpb: float = 0.5, tournsize: int = 3, metric: str = 'R2',
+               n_gen: int = 50, n_pop: int = 50, threshold=None, valid=None, output=Target.REGRESSION,
+               normalization: dict[str, tuple[float, float]] = None,
+               seed: int = get_default_random_seed()) -> Extractor:
+        """
+        Creates a new GInGER extractor.
+        """
+        from psyke.extraction.hypercubic.ginger import GInGER
+        return GInGER(predictor, features, sigmas, max_slices, min_rules, max_poly, alpha, indpb, tournsize, metric,
+                      n_gen, n_pop, threshold, valid, output, normalization, seed)
 
     @staticmethod
     def gridrex(predictor, grid, min_examples: int = 250, threshold: float = 0.1,
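
Four factory methods are new here (`divine`, `cosmik`, `hex`, `ginger`) and `cart`, `iter`, `gridex` grow extra parameters. A hedged sketch of the updated entry points; `model`, `grid` and `train` are placeholders for a fitted black box, a psyke grid object and a training dataframe:

```python
from psyke import Extractor
from psyke.utils import Target

# Placeholders: `model` (fitted predictor), `grid`, `train` (dataframe).
gridex = Extractor.gridex(model, grid, min_examples=250, threshold=0.1,
                          output=Target.CONSTANT)        # new output/discretization params
divine = Extractor.divine(model, k=5, patience=15)       # new DiViNE extractor
it = Extractor.iter(model, ignore_dimensions=['sex'])    # new ignore_dimensions param
theory = gridex.extract(train)                           # extract() no longer takes mapping/sort
```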
@@ -250,15 +418,16 @@ class Extractor(EvaluableModel, ABC):
         return GridREx(predictor, grid, min_examples, threshold, normalization, seed)
 
     @staticmethod
-    def creepy(predictor, clustering, depth: int, error_threshold: float, output, gauss_components: int = 2,
-               ranks: [(str, float)] = [], ignore_threshold: float = 0.0,
-               normalization: dict[str, tuple[float, float]] = None) -> Extractor:
+    def creepy(predictor, clustering, depth: int, error_threshold: float, output: Target = Target.CONSTANT,
+               gauss_components: int = 2, ranks: Iterable[(str, float)] = tuple(), ignore_threshold: float = 0.0,
+               discretization=None, normalization: dict[str, tuple[float, float]] = None,
+               seed: int = get_default_random_seed()) -> Extractor:
         """
         Creates a new CReEPy extractor.
         """
         from psyke.extraction.hypercubic.creepy import CReEPy
-        return CReEPy(predictor, depth, error_threshold, output, gauss_components, ranks, ignore_threshold,
-                      normalization, clustering)
+        return CReEPy(predictor, clustering, depth, error_threshold, output, gauss_components, ranks, ignore_threshold,
+                      discretization, normalization, seed)
 
     @staticmethod
     def real(predictor, discretization=None) -> Extractor:
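
Note that the call into `CReEPy` now passes `clustering` as the second positional argument (it used to come last), and `discretization` plus a `seed` are threaded through. A hypothetical call under the new signature; `model` and `clustering` are placeholders, the latter assumed to be one of psyke's clustering implementations:

```python
from psyke import Extractor
from psyke.utils import Target

# Assumed usage of the reordered signature; untested sketch.
creepy = Extractor.creepy(model, clustering, depth=2, error_threshold=0.1,
                          output=Target.CONSTANT, gauss_components=2, seed=123)
```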
@@ -281,52 +450,29 @@
 
 
 class Clustering(EvaluableModel, ABC):
-    def __init__(self, normalization=None):
-        super().__init__(normalization)
+    def __init__(self, discretization=None, normalization=None):
+        super().__init__(discretization, normalization)
 
     def fit(self, dataframe: pd.DataFrame):
-        raise NotImplementedError('extract')
+        raise NotImplementedError('fit')
 
     def explain(self):
-        raise NotImplementedError('extract')
+        raise NotImplementedError('explain')
 
     @staticmethod
-    def exact(depth: int = 2, error_threshold: float = 0.1, output: Target = Target.CONSTANT,
-              gauss_components: int = 2) -> Clustering:
+    def exact(depth: int = 2, error_threshold: float = 0.1, output: Target = Target.CONSTANT, gauss_components: int = 2,
+              discretization=None, normalization=None, seed: int = get_default_random_seed()) -> Clustering:
         """
         Creates a new ExACT instance.
         """
         from psyke.clustering.exact import ExACT
-        return ExACT(depth, error_threshold, output, gauss_components)
+        return ExACT(depth, error_threshold, output, gauss_components, discretization, normalization, seed)
 
     @staticmethod
-    def cream(depth: int, error_threshold: float, output, gauss_components: int = 2) -> Clustering:
+    def cream(depth: int = 2, error_threshold: float = 0.1, output: Target = Target.CONSTANT, gauss_components: int = 2,
+              discretization=None, normalization=None, seed: int = get_default_random_seed()) -> Clustering:
         """
         Creates a new CREAM instance.
         """
         from psyke.clustering.cream import CREAM
-        return CREAM(depth, error_threshold, output, gauss_components)
-
-
-class PedagogicalExtractor(Extractor, ABC):
-
-    def __init__(self, predictor, discretization=None, normalization=None):
-        Extractor.__init__(self, predictor=predictor, discretization=discretization, normalization=normalization)
-
-    def extract(self, dataframe: pd.DataFrame, mapping: dict[str: int] = None, sort: bool = True) -> Theory:
-        new_y = self.predictor.predict(dataframe.iloc[:, :-1])
-        if mapping is not None:
-            if hasattr(new_y[0], 'shape'):
-                # One-hot encoding for multi-class tasks
-                if len(new_y[0].shape) > 0 and new_y[0].shape[0] > 1:
-                    new_y = [argmax(y, axis=0) for y in new_y]
-                # One-hot encoding for binary class tasks
-                else:
-                    new_y = [round(y[0]) for y in new_y]
-        new_y = pd.DataFrame(new_y).set_index(dataframe.index)
-        data = dataframe.iloc[:, :-1].copy().join(new_y)
-        data.columns = dataframe.columns
-        return self._extract(data, mapping, sort)
-
-    def _extract(self, dataframe: pd.DataFrame, mapping: dict[str: int] = None, sort: bool = True) -> Theory:
-        raise NotImplementedError('extract')
+        return CREAM(depth, error_threshold, output, gauss_components, discretization, normalization, seed)
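
`Clustering.exact` and `Clustering.cream` now share the same defaults and accept `discretization`, `normalization` and a `seed` (used in `ExACT.fit` to seed NumPy), making clustering runs reproducible. A hypothetical end-to-end call; `df` is a placeholder dataframe with the target in its last column:

```python
from psyke import Clustering
from psyke.utils import Target

cream = Clustering.cream(depth=2, error_threshold=0.1,
                         output=Target.CONSTANT, gauss_components=2, seed=123)
cream.fit(df)                        # `df`: features plus target (placeholder)
cubes = cream.get_hypercubes()
```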
psyke/clustering/__init__.py CHANGED
@@ -2,13 +2,18 @@ from abc import ABC
 from typing import Iterable
 
 from psyke import Clustering, Target
-from psyke.extraction.hypercubic import HyperCube, HyperCubePredictor
+from psyke.extraction.hypercubic import HyperCube
+from psyke.hypercubepredictor import HyperCubePredictor
 
 
 class HyperCubeClustering(HyperCubePredictor, Clustering, ABC):
 
-    def __init__(self, output: Target = Target.CONSTANT, normalization=None):
-        HyperCubePredictor.__init__(self, output=output, normalization=normalization)
+    def __init__(self, output: Target = Target.CONSTANT, discretization=None, normalization=None):
+        HyperCubePredictor.__init__(self, output=output, discretization=discretization, normalization=normalization)
+        self._protected_features = []
 
     def get_hypercubes(self) -> Iterable[HyperCube]:
-        raise NotImplementedError('predict')
+        raise NotImplementedError('get_hypercubes')
+
+    def make_fair(self, features: Iterable[str]):
+        self._protected_features = features
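
On the clustering side, `make_fair` just records the protected features; `ExACT` later excludes them from the Gaussian mixture fit (see `_get_gauss_predictions` further down). A sketch of the intended flow, with placeholder names:

```python
from psyke import Clustering

# Placeholder sketch: mark 'sex' as protected before fitting.
clustering = Clustering.exact(depth=2, error_threshold=0.1)
clustering.make_fair(['sex'])   # stored in _protected_features
clustering.fit(df)              # mixture is fitted without the 'sex' column
```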
psyke/clustering/cream/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Iterable
 import numpy as np
 import pandas as pd
 
-import psyke.utils
+from psyke.utils import Target, get_default_random_seed
 from psyke.clustering.exact import ExACT
 from psyke.extraction.hypercubic import Node, HyperCube, ClosedCube
 from psyke.clustering.utils import select_gaussian_mixture
@@ -16,9 +16,9 @@ class CREAM(ExACT):
     Explanator implementing CREAM algorithm.
     """
 
-    def __init__(self, depth: int, error_threshold: float,
-                 output: psyke.utils.Target = psyke.utils.Target.CONSTANT, gauss_components: int = 5):
-        super().__init__(depth, error_threshold, output, gauss_components)
+    def __init__(self, depth: int, error_threshold: float, output: Target = Target.CONSTANT, gauss_components: int = 5,
+                 discretization=None, normalization=None, seed: int = get_default_random_seed()):
+        super().__init__(depth, error_threshold, output, gauss_components, discretization, normalization, seed)
 
     def __eligible_cubes(self, gauss_pred: np.ndarray, node: Node, clusters: int):
         cubes = []
@@ -46,11 +46,7 @@
     def _iterate(self, surrounding: Node) -> Iterable[HyperCube]:
         to_split = [(self.error_threshold * 10, 1, 1, surrounding)]
         while len(to_split) > 0:
-            to_split.sort(reverse=True)
-            (_, depth, _, node) = to_split.pop()
-            data = ExACT._remove_string_label(node.dataframe)
-            gauss_params = select_gaussian_mixture(data, self.gauss_components)
-            gauss_pred = gauss_params[2].predict(data)
+            node, depth, gauss_pred, gauss_params = self._get_gauss_predictions(to_split)
             cubes = self.__eligible_cubes(gauss_pred, node, gauss_params[1])
             if len(cubes) < 1:
                 continue
@@ -65,4 +61,4 @@
                 (error, depth + 1, np.random.uniform(), n) for (n, error) in
                 zip(node.children, [right[0].diversity, left[0].diversity]) if error > self.error_threshold
             ]
-        return self._node_to_cubes(surrounding)
+        return self._node_to_cubes(surrounding)
psyke/clustering/exact/__init__.py CHANGED
@@ -13,7 +13,7 @@ from psyke.clustering import HyperCubeClustering
 from psyke.extraction.hypercubic import Node, ClosedCube, HyperCube
 from psyke.clustering.utils import select_gaussian_mixture, select_dbscan_epsilon
 from psyke.extraction.hypercubic.hypercube import ClosedRegressionCube, ClosedClassificationCube
-from psyke.utils import Target
+from psyke.utils import Target, get_default_random_seed
 
 
 class ExACT(HyperCubeClustering, ABC):
@@ -22,13 +22,15 @@ class ExACT(HyperCubeClustering, ABC):
     """
 
     def __init__(self, depth: int = 2, error_threshold: float = 0.1, output: Target = Target.CONSTANT,
-                 gauss_components: int = 2, normalization=None):
-        super().__init__(output, normalization)
+                 gauss_components: int = 2, discretization=None, normalization=None,
+                 seed: int = get_default_random_seed()):
+        super().__init__(output, discretization, normalization)
         self.depth = depth
         self.error_threshold = error_threshold
         self.gauss_components = gauss_components
         self._predictor = KNeighborsClassifier() if output == Target.CLASSIFICATION else KNeighborsRegressor()
         self._predictor.n_neighbors = 1
+        self.seed = seed
 
     def __eligible_cubes(self, gauss_pred: np.ndarray, node: Node, clusters: int):
         cubes = []
@@ -52,13 +54,14 @@ class ExACT(HyperCubeClustering, ABC):
         dbscan_pred = DBSCAN(eps=select_dbscan_epsilon(data, clusters)).fit_predict(data.iloc[:, :-1])
         return HyperCube.create_surrounding_cube(
             dataframe.iloc[np.where(dbscan_pred == Counter(dbscan_pred).most_common(1)[0][0])],
-            True, self._output
+            True, self._output, self._protected_features
         )
 
     def fit(self, dataframe: pd.DataFrame):
+        np.random.seed(self.seed)
         self._predictor.fit(dataframe.iloc[:, :-1], dataframe.iloc[:, -1])
-        self._hypercubes = \
-            self._iterate(Node(dataframe, HyperCube.create_surrounding_cube(dataframe, True, self._output)))
+        self._surrounding = HyperCube.create_surrounding_cube(dataframe, True, self._output, self._protected_features)
+        self._hypercubes = self._iterate(Node(dataframe, self._surrounding))
 
     def get_hypercubes(self) -> Iterable[HyperCube]:
         return list(self._hypercubes)
@@ -76,14 +79,17 @@ class ExACT(HyperCubeClustering, ABC):
             enumerate(dataframe.iloc[:, -1].unique())
         ).items()}}) if isinstance(dataframe.iloc[0, -1], str) else dataframe
 
+    def _get_gauss_predictions(self, to_split):
+        to_split.sort(reverse=True)
+        (_, depth, _, node) = to_split.pop()
+        data = ExACT._remove_string_label(node.dataframe)
+        gauss_params = select_gaussian_mixture(data.drop(self._protected_features, axis=1), self.gauss_components)
+        return node, depth, gauss_params[2].predict(data.drop(self._protected_features, axis=1)), gauss_params
+
     def _iterate(self, surrounding: Node) -> Iterable[HyperCube]:
         to_split = [(self.error_threshold * 10, 1, 1, surrounding)]
         while len(to_split) > 0:
-            to_split.sort(reverse=True)
-            (_, depth, _, node) = to_split.pop()
-            data = ExACT._remove_string_label(node.dataframe)
-            gauss_params = select_gaussian_mixture(data, self.gauss_components)
-            gauss_pred = gauss_params[2].predict(data)
+            node, depth, gauss_pred, gauss_params = self._get_gauss_predictions(to_split)
             cubes, indices = self.__eligible_cubes(gauss_pred, node, gauss_params[1])
             cubes = [(c.volume(), len(idx), i, idx, c) for i, (c, idx) in enumerate(zip(cubes, indices))
                      if (idx is not None) and (not node.cube.equal(c))]
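
The refactored `_get_gauss_predictions` fits and queries the mixture on a view of the node's data with the protected columns dropped, which is how `make_fair` takes effect. Conceptually (toy data, scikit-learn only):

```python
import pandas as pd
from sklearn.mixture import GaussianMixture

# Toy dataframe; values are invented for illustration.
df = pd.DataFrame({'age': [22, 35, 47, 61], 'sex': [0, 1, 0, 1], 'y': [0, 1, 1, 0]})
protected = ['sex']

view = df.drop(protected, axis=1)                # protected-free view, as in the diff
gmm = GaussianMixture(n_components=2).fit(view)  # mixture never sees 'sex'
pred = gmm.predict(view)                         # cluster labels ignore it entirely
```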
psyke/clustering/utils.py CHANGED
@@ -11,7 +11,6 @@ def select_gaussian_mixture(data: pd.DataFrame, max_components) -> tuple[float,
     try:
         models = [GaussianMixture(n_components=n).fit(data) for n in components if n <= len(data)]
     except ValueError:
-        print(data)
         print(len(data))
     return min([(m.bic(data) / (i + 2), (i + 2), m) for i, m in enumerate(models)])
 
psyke/extraction/__init__.py ADDED
@@ -0,0 +1,25 @@
+from abc import ABC
+
+import pandas as pd
+from tuprolog.theory import Theory
+
+from psyke import Extractor
+
+
+class PedagogicalExtractor(Extractor, ABC):
+
+    def __init__(self, predictor, discretization=None, normalization=None):
+        Extractor.__init__(self, predictor=predictor, discretization=discretization, normalization=normalization)
+
+    def _substitute_output(self, dataframe: pd.DataFrame) -> pd.DataFrame:
+        new_y = pd.DataFrame(self.predictor.predict(dataframe.iloc[:, :-1])).set_index(dataframe.index)
+        data = dataframe.iloc[:, :-1].copy().join(new_y)
+        data.columns = dataframe.columns
+        return data
+
+    def extract(self, dataframe: pd.DataFrame) -> Theory:
+        self.theory = self._extract(self._substitute_output(dataframe))
+        return self.theory
+
+    def _extract(self, dataframe: pd.DataFrame) -> Theory:
+        raise NotImplementedError('extract')
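
`PedagogicalExtractor` moves into `psyke/extraction/__init__.py` and is slimmed down: the one-hot `mapping`/`argmax` handling is gone, relabelling lives in `_substitute_output`, and `extract` now caches the result in `self.theory`. A minimal hypothetical subclass illustrating the contract (the class name and trivial body are invented):

```python
from tuprolog.theory import Theory, mutable_theory

class ToyExtractor(PedagogicalExtractor):
    """Hypothetical subclass: `extract` relabels the data with the black box,
    then hands it to `_extract`."""

    def _extract(self, dataframe) -> Theory:
        # The last column of `dataframe` holds the predictor's outputs,
        # not the original labels; build rules from it here.
        return mutable_theory()
```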