psyke 0.9.1.dev12__py3-none-any.whl → 0.9.1.dev43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of psyke might be problematic.

psyke/__init__.py CHANGED
@@ -65,7 +65,7 @@ class EvaluableModel(object):
         raise NotImplementedError('predict')

     def __convert(self, ys: Iterable) -> Iterable:
-        if self.normalization is not None and not isinstance([p for p in ys if p is not None][0], str):
+        if self.normalization is not None and len(ys) > 0 and not isinstance([p for p in ys if p is not None][0], str):
             m, s = self.normalization[list(self.normalization.keys())[-1]]
             ys = [prediction if prediction is None else prediction * s + m for prediction in ys]
         return ys
@@ -231,7 +231,7 @@ class Extractor(EvaluableModel, ABC):

         for i in range(len(output['labels'])):
             for j in range(len(groups)):
-                plt.gca().text(j, i, f'{abs(int(data[i, j]))}%', ha="center", va="center", color="k")
+                plt.gca().text(j, i, f'{abs(data[i, j]):.2f}%', ha="center", va="center", color="k")

         plt.gca().set_xticks([i + .5 for i in range(len(groups))], minor=True)
         plt.gca().set_yticks([i + .5 for i in range(len(output['labels']))], minor=True)
@@ -394,6 +394,19 @@ class Extractor(EvaluableModel, ABC):
         from psyke.extraction.hypercubic.hex import HEx
         return HEx(predictor, grid, min_examples, threshold, output, discretization, normalization, seed)

+    @staticmethod
+    def ginger(predictor, features: Iterable[str], sigmas: Iterable[float], max_slices: int, min_rules: int = 1,
+               max_poly: int = 1, alpha: float = 0.5, indpb: float = 0.5, tournsize: int = 3, metric: str = 'R2',
+               n_gen: int = 50, n_pop: int = 50, threshold=None, valid=None,
+               normalization: dict[str, tuple[float, float]] = None,
+               seed: int = get_default_random_seed()) -> Extractor:
+        """
+        Creates a new GInGER extractor.
+        """
+        from psyke.extraction.hypercubic.ginger import GInGER
+        return GInGER(predictor, features, sigmas, max_slices, min_rules, max_poly, alpha, indpb, tournsize, metric,
+                      n_gen, n_pop, threshold, valid, normalization, seed)
+
     @staticmethod
     def gridrex(predictor, grid, min_examples: int = 250, threshold: float = 0.1,
                 normalization: dict[str, tuple[float, float]] = None,
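
For orientation, a minimal usage sketch of the new entry point; the random-forest predictor and toy dataframe below are illustrative stand-ins, not part of the diff:

    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestRegressor
    from psyke import Extractor

    # Toy regression data; psyke extractors expect the target as the last column.
    rng = np.random.default_rng(0)
    df = pd.DataFrame({'x1': rng.uniform(0, 1, 200), 'x2': rng.uniform(0, 1, 200)})
    df['y'] = 2 * df['x1'] + np.sin(6 * df['x2'])

    predictor = RandomForestRegressor().fit(df.iloc[:, :-1], df.iloc[:, -1])
    ginger = Extractor.ginger(predictor, features=['x1', 'x2'], sigmas=[0.1, 0.1],
                              max_slices=2, threshold=0.5)
    theory = ginger.extract(df)  # a tuProlog Theory of extracted rules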
psyke/clustering/__init__.py CHANGED
@@ -10,6 +10,10 @@ class HyperCubeClustering(HyperCubePredictor, Clustering, ABC):

     def __init__(self, output: Target = Target.CONSTANT, discretization=None, normalization=None):
         HyperCubePredictor.__init__(self, output=output, discretization=discretization, normalization=normalization)
+        self._protected_features = []

     def get_hypercubes(self) -> Iterable[HyperCube]:
         raise NotImplementedError('get_hypercubes')
+
+    def make_fair(self, features: Iterable[str]):
+        self._protected_features = features
psyke/clustering/cream/__init__.py CHANGED
@@ -46,11 +46,7 @@ class CREAM(ExACT):
     def _iterate(self, surrounding: Node) -> Iterable[HyperCube]:
         to_split = [(self.error_threshold * 10, 1, 1, surrounding)]
         while len(to_split) > 0:
-            to_split.sort(reverse=True)
-            (_, depth, _, node) = to_split.pop()
-            data = ExACT._remove_string_label(node.dataframe)
-            gauss_params = select_gaussian_mixture(data, self.gauss_components)
-            gauss_pred = gauss_params[2].predict(data)
+            node, depth, gauss_pred, gauss_params = self._get_gauss_predictions(to_split)
             cubes = self.__eligible_cubes(gauss_pred, node, gauss_params[1])
             if len(cubes) < 1:
                 continue
@@ -65,4 +61,4 @@ class CREAM(ExACT):
             (error, depth + 1, np.random.uniform(), n) for (n, error) in
             zip(node.children, [right[0].diversity, left[0].diversity]) if error > self.error_threshold
         ]
-        return self._node_to_cubes(surrounding)
+        return self._node_to_cubes(surrounding)
psyke/clustering/exact/__init__.py CHANGED
@@ -54,13 +54,13 @@ class ExACT(HyperCubeClustering, ABC):
         dbscan_pred = DBSCAN(eps=select_dbscan_epsilon(data, clusters)).fit_predict(data.iloc[:, :-1])
         return HyperCube.create_surrounding_cube(
             dataframe.iloc[np.where(dbscan_pred == Counter(dbscan_pred).most_common(1)[0][0])],
-            True, self._output
+            True, self._output, self._protected_features
         )

     def fit(self, dataframe: pd.DataFrame):
         np.random.seed(self.seed)
         self._predictor.fit(dataframe.iloc[:, :-1], dataframe.iloc[:, -1])
-        self._surrounding = HyperCube.create_surrounding_cube(dataframe, True, self._output)
+        self._surrounding = HyperCube.create_surrounding_cube(dataframe, True, self._output, self._protected_features)
         self._hypercubes = self._iterate(Node(dataframe, self._surrounding))

     def get_hypercubes(self) -> Iterable[HyperCube]:
@@ -79,14 +79,17 @@ class ExACT(HyperCubeClustering, ABC):
             enumerate(dataframe.iloc[:, -1].unique())
         ).items()}}) if isinstance(dataframe.iloc[0, -1], str) else dataframe

+    def _get_gauss_predictions(self, to_split):
+        to_split.sort(reverse=True)
+        (_, depth, _, node) = to_split.pop()
+        data = ExACT._remove_string_label(node.dataframe)
+        gauss_params = select_gaussian_mixture(data.drop(self._protected_features, axis=1), self.gauss_components)
+        return node, depth, gauss_params[2].predict(data.drop(self._protected_features, axis=1)), gauss_params
+
     def _iterate(self, surrounding: Node) -> Iterable[HyperCube]:
         to_split = [(self.error_threshold * 10, 1, 1, surrounding)]
         while len(to_split) > 0:
-            to_split.sort(reverse=True)
-            (_, depth, _, node) = to_split.pop()
-            data = ExACT._remove_string_label(node.dataframe)
-            gauss_params = select_gaussian_mixture(data, self.gauss_components)
-            gauss_pred = gauss_params[2].predict(data)
+            node, depth, gauss_pred, gauss_params = self._get_gauss_predictions(to_split)
             cubes, indices = self.__eligible_cubes(gauss_pred, node, gauss_params[1])
             cubes = [(c.volume(), len(idx), i, idx, c) for i, (c, idx) in enumerate(zip(cubes, indices))
                      if (idx is not None) and (not node.cube.equal(c))]
psyke/clustering/utils.py CHANGED
@@ -11,7 +11,6 @@ def select_gaussian_mixture(data: pd.DataFrame, max_components) -> tuple[float,
     try:
         models = [GaussianMixture(n_components=n).fit(data) for n in components if n <= len(data)]
     except ValueError:
-        print(data)
         print(len(data))
     return min([(m.bic(data) / (i + 2), (i + 2), m) for i, m in enumerate(models)])

psyke/extraction/cart/FairTree.py CHANGED
@@ -77,15 +77,16 @@ class FairTree:
         sorted_indices = np.argsort(X)
         X = np.array(X)[sorted_indices]
         y = np.array(y)[sorted_indices]
-        return np.array([(X[i] + X[i - 1]) / 2.0 for i in range(1, len(X)) if y[i] != y[i - 1]])
+        # X = np.array(np.unique(np.unique(list(zip(X, y)), axis=0)[:, 0]), dtype=float)
+        return np.array([(X[:-1][i] + X[1:][i]) / 2.0 for i in range(len(X) - 1) if y[i] != y[i + 1]])

     def _best_split(self, X, y):
         best_gain = -float('inf')
         split_idx, split_threshold = None, None

         for feature in [feature for feature in X.columns if feature not in self.protected_attr]:
-            # for threshold in np.unique(np.quantile(X[feature], np.linspace(0, 1, num=25))):
-            for threshold in self.generate_thresholds(X[feature], y):
+            # for threshold in self.generate_thresholds(X[feature], y):
+            for threshold in np.unique(np.quantile(X[feature], np.linspace(0, 1, num=25))):
                 left_idxs = X[feature] <= threshold
                 right_idxs = X[feature] > threshold

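A quick worked example of the revised threshold rule (note that the patched _best_split now iterates 25 quantile-based thresholds and keeps the generate_thresholds call commented out): with X sorted, a candidate split is the midpoint between consecutive values whose labels differ. The values below are invented; the comprehension mirrors the patch:

    import numpy as np

    X = np.array([1.0, 2.0, 3.0, 4.0])  # already sorted
    y = np.array([0, 0, 1, 1])
    # Same comprehension as the patched generate_thresholds:
    thresholds = np.array([(X[:-1][i] + X[1:][i]) / 2.0
                           for i in range(len(X) - 1) if y[i] != y[i + 1]])
    print(thresholds)  # [2.5] -- the only boundary where the class flips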
psyke/extraction/hypercubic/__init__.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations

 from abc import ABC
 from collections import Iterable
+from itertools import combinations

 import numpy as np
 import pandas as pd
@@ -25,6 +26,7 @@ class HyperCubeExtractor(HyperCubePredictor, PedagogicalExtractor, ABC):
         HyperCubePredictor.__init__(self, output=output, normalization=normalization)
         PedagogicalExtractor.__init__(self, predictor, discretization=discretization, normalization=normalization)
         self._default_surrounding_cube = False
+        self.threshold = None

     def _default_cube(self) -> HyperCube | RegressionCube | ClassificationCube:
         if self._output == Target.CONSTANT:
@@ -33,11 +35,53 @@ class HyperCubeExtractor(HyperCubePredictor, PedagogicalExtractor, ABC):
             return RegressionCube()
         return ClassificationCube()

+    @staticmethod
+    def _find_couples(to_split: Iterable[HyperCube], not_in_cache: set[HyperCube],
+                      adjacent_cache: dict[tuple[HyperCube, HyperCube], str | None]) -> \
+            Iterable[tuple[HyperCube, HyperCube, str]]:
+
+        for cube1, cube2 in combinations(to_split, 2):
+            key = (cube1, cube2) if id(cube1) < id(cube2) else (cube2, cube1)
+
+            if (cube1 in not_in_cache) or (cube2 in not_in_cache):
+                adjacent_cache[key] = cube1.is_adjacent(cube2)
+            feature = adjacent_cache.get(key)
+            if feature is not None:
+                yield cube1, cube2, feature
+
+    def _evaluate_merge(self, not_in_cache: Iterable[HyperCube], dataframe: pd.DataFrame, feature: str,
+                        cube: HyperCube, other_cube: HyperCube,
+                        merge_cache: dict[tuple[HyperCube, HyperCube], HyperCube | None]) -> bool:
+        if (cube in not_in_cache) or (other_cube in not_in_cache):
+            merged_cube = cube.merge_along_dimension(other_cube, feature)
+            merged_cube.update(dataframe, self.predictor)
+            merge_cache[(cube, other_cube)] = merged_cube
+        return cube.output == other_cube.output if self._output == Target.CLASSIFICATION else \
+            merge_cache[(cube, other_cube)].diversity < self.threshold
+
     def _sort_cubes(self):
         cubes = [(cube.diversity, i, cube) for i, cube in enumerate(self._hypercubes)]
         cubes.sort()
         self._hypercubes = [cube[2] for cube in cubes]

+    def _merge(self, to_split: list[HyperCube], dataframe: pd.DataFrame) -> Iterable[HyperCube]:
+        not_in_cache = set(to_split)
+        adjacent_cache = {}
+        merge_cache = {}
+        while True:
+            to_merge = [([cube, other_cube], merge_cache[(cube, other_cube)]) for cube, other_cube, feature in
+                        HyperCubeExtractor._find_couples(to_split, not_in_cache, adjacent_cache) if
+                        self._evaluate_merge(not_in_cache, dataframe, feature, cube, other_cube, merge_cache)]
+
+            if len(to_merge) == 0:
+                break
+            best = min(to_merge, key=lambda c: c[1].diversity)
+            for cube in best[0]:
+                to_split.remove(cube)
+            to_split.append(best[1])
+            not_in_cache = [best[1]]
+        return to_split
+
     def extract(self, dataframe: pd.DataFrame) -> Theory:
         theory = PedagogicalExtractor.extract(self, dataframe)
         self._surrounding = HyperCube.create_surrounding_cube(dataframe, output=self._output)
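
One detail worth noting in the hoisted _find_couples: the adjacency-cache key is now ordered by id(), so each unordered pair of cubes maps to a single cache slot regardless of iteration order. A standalone illustration of the keying trick (the two objects are arbitrary placeholders):

    a, b = object(), object()
    key_ab = (a, b) if id(a) < id(b) else (b, a)
    key_ba = (b, a) if id(b) < id(a) else (a, b)
    assert key_ab == key_ba  # both orderings hit the same cache entry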
psyke/extraction/hypercubic/creepy/__init__.py CHANGED
@@ -1,7 +1,8 @@
 from __future__ import annotations

 from collections import Iterable
-import numpy as np
+from typing import Callable, Any
+
 import pandas as pd
 from sklearn.base import ClassifierMixin
 from tuprolog.theory import Theory
@@ -16,16 +17,23 @@ class CReEPy(HyperCubeExtractor):
     Explanator implementing CReEPy algorithm.
     """

-    def __init__(self, predictor, clustering=Clustering.exact, depth: int = 3, error_threshold: float = 0.1,
-                 output: Target = Target.CONSTANT, gauss_components: int = 5, ranks: Iterable[(str, float)] = tuple(),
-                 ignore_threshold: float = 0.0, discretization=None, normalization=None,
-                 seed: int = get_default_random_seed()):
+    ClusteringType = Callable[[int, float, Target, int, Any, Any, int], HyperCubeClustering]
+
+    def __init__(self, predictor, clustering: ClusteringType = Clustering.exact, depth: int = 3,
+                 error_threshold: float = 0.1, output: Target = Target.CONSTANT, gauss_components: int = 5,
+                 ranks: Iterable[(str, float)] = tuple(), ignore_threshold: float = 0.0, discretization=None,
+                 normalization=None, seed: int = get_default_random_seed()):
         super().__init__(predictor, Target.CLASSIFICATION if isinstance(predictor, ClassifierMixin) else output,
                          discretization, normalization)
         self.clustering = clustering(depth, error_threshold, self._output, gauss_components, discretization,
                                      normalization, seed)
         self._default_surrounding_cube = True
         self._dimensions_to_ignore = set([dimension for dimension, relevance in ranks if relevance < ignore_threshold])
+        self._protected_features = []
+
+    def make_fair(self, features: Iterable[str]):
+        self.clustering.make_fair(features)
+        self._dimensions_to_ignore.update(features)

     def _extract(self, dataframe: pd.DataFrame) -> Theory:
         if not isinstance(self.clustering, HyperCubeClustering):
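
A hedged sketch of the new fairness hook; the classifier, CSV path, and the 'sex' column are placeholders for whatever dataset and protected feature apply:

    import pandas as pd
    from sklearn.tree import DecisionTreeClassifier
    from psyke.extraction.hypercubic.creepy import CReEPy

    df = pd.read_csv('adult.csv')  # placeholder: target in the last column
    predictor = DecisionTreeClassifier().fit(df.iloc[:, :-1], df.iloc[:, -1])

    creepy = CReEPy(predictor, depth=3, error_threshold=0.1)
    creepy.make_fair(['sex'])  # forwarded to clustering and added to the ignored dimensions
    theory = creepy.extract(df)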
psyke/extraction/hypercubic/ginger/__init__.py ADDED
@@ -0,0 +1,98 @@
+import itertools
+from typing import Iterable
+
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import PolynomialFeatures
+from tuprolog.theory import Theory
+
+from psyke import get_default_random_seed
+from psyke.extraction.hypercubic import HyperCubeExtractor, HyperCube, RegressionCube
+
+from deap import base, creator
+
+from psyke.genetic.gin import GIn
+
+
+class GInGER(HyperCubeExtractor):
+    """
+    Explanator implementing GInGER algorithm.
+    """
+
+    def __init__(self, predictor, features, sigmas, max_slices, min_rules=1, max_poly=1, alpha=0.5, indpb=0.5,
+                 tournsize=3, metric='R2', n_gen=50, n_pop=50, threshold=None, valid=None, normalization=None,
+                 seed: int = get_default_random_seed()):
+        super().__init__(predictor, normalization)
+        self.threshold = threshold
+        np.random.seed(seed)
+
+        self.features = features
+        self.max_features = len(features)
+        self.sigmas = sigmas
+        self.max_slices = max_slices
+        self.min_rules = min_rules
+        self.poly = max_poly
+        self.trained_poly = None
+
+        self.alpha = alpha
+        self.indpb = indpb
+        self.tournsize = tournsize
+        self.metric = metric
+
+        self.n_gen = n_gen
+        self.n_pop = n_pop
+        self.valid = valid
+
+        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
+        creator.create("Individual", list, fitness=creator.FitnessMax)
+
+    def __poly_names(self):
+        return [''.join(['' if pp == 0 else f'{n} * ' if pp == 1 else f'{n}**{pp} * '
+                         for pp, n in zip(p, self.trained_poly.feature_names_in_)])[:-3]
+                for p in self.trained_poly.powers_]
+
+    def _predict(self, dataframe: pd.DataFrame) -> Iterable:
+        dataframe = pd.DataFrame(self.trained_poly.fit_transform(dataframe), columns=self.__poly_names())
+        return np.array([self._predict_from_cubes(row.to_dict()) for _, row in dataframe.iterrows()])
+
+    def _extract(self, dataframe: pd.DataFrame) -> Theory:
+        best = {}
+        for poly in range(self.poly):
+            for slices in list(itertools.product(range(1, self.max_slices + 1), repeat=self.max_features)):
+                gr = GIn((dataframe.iloc[:, :-1], dataframe.iloc[:, -1]), self.valid, self.features, self.sigmas,
+                         slices, min_rules=self.min_rules, poly=poly + 1, alpha=self.alpha,
+                         indpb=self.indpb, tournsize=self.tournsize, metric=self.metric, warm=True)
+
+                b, score, _, _ = gr.run(n_gen=self.n_gen, n_pop=self.n_pop)
+                best[(score, poly + 1, slices)] = b
+        m = min(best)
+        poly, slices, best = m[1], m[2], best[m]
+        self.trained_poly = PolynomialFeatures(degree=poly, include_bias=False)
+        transformed = pd.DataFrame(self.trained_poly.fit_transform(dataframe.iloc[:, :-1]), columns=self.__poly_names())
+        transformed[dataframe.columns[-1]] = dataframe.iloc[:, -1].values
+
+        self._surrounding = HyperCube.create_surrounding_cube(transformed, output=self._output)
+
+        cuts = [sorted(best[sum(slices[:i]):sum(slices[:i + 1])]) for i in range(len(slices))]
+
+        intervals = [[(transformed[self.features[i]].min(), cut[0])] +
+                     [(cut[i], cut[i + 1]) for i in range(len(cut) - 1)] +
+                     [(cut[-1], transformed[self.features[i]].max())] for i, cut in enumerate(cuts)]
+
+        hypercubes = [{f: iv for f, iv in zip(self.features, combo)} for combo in itertools.product(*intervals)]
+        mi_ma = {f: (transformed[f].min(), transformed[f].max()) for f in transformed.columns if f not in self.features}
+        self._hypercubes = [RegressionCube({feat: h[feat] if feat in self.features else mi_ma[feat]
+                                            for feat in transformed.columns[:-1]}) for h in hypercubes]
+        self._hypercubes = [c for c in self._hypercubes if c.count(transformed) >= 2]
+        for c in self._hypercubes:
+            for feature in transformed.columns:
+                if feature not in self.features:
+                    for direction in ['+', '-']:
+                        c.set_infinite(feature, direction)
+            c.update(transformed)
+        if self.threshold is not None:
+            self._hypercubes = self._merge(self._hypercubes, transformed)
+        return self._create_theory(transformed)
+
+    def make_fair(self, features: Iterable[str]):
+        self._dimensions_to_ignore.update(features)
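
To see what _extract does with the evolved cut points, here is the cuts-to-grid step in isolation, with invented values (the construction mirrors the intervals comprehension above):

    import itertools

    cuts = [[0.3, 0.7], [0.5]]         # evolved cut points for two features
    bounds = [(0.0, 1.0), (0.0, 1.0)]  # per-feature (min, max) of the data
    intervals = [[(lo, cut[0])] +
                 [(cut[i], cut[i + 1]) for i in range(len(cut) - 1)] +
                 [(cut[-1], hi)]
                 for cut, (lo, hi) in zip(cuts, bounds)]
    cells = list(itertools.product(*intervals))
    print(len(cells))  # 6: a 3 x 2 grid of candidate hypercubes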
psyke/extraction/hypercubic/gridex/__init__.py CHANGED
@@ -59,7 +59,6 @@ class GridEx(HyperCubeExtractor):
     def _iterate(self, dataframe: pd.DataFrame):
         fake = dataframe.copy()
         prev = [self._surrounding]
-        next_iteration = []

         for iteration in self.grid.iterate():
             next_iteration = []
@@ -67,56 +66,12 @@ class GridEx(HyperCubeExtractor):
                 if cube.count(dataframe) == 0:
                     continue
                 if cube.diversity < self.threshold:
-                    self._hypercubes += [cube]
+                    self._hypercubes.append(cube)
                     continue
                 to_split, fake = self._cubes_to_split(cube, iteration, dataframe, fake)
-                next_iteration += [c for c in self._merge(to_split, fake)]
-            prev = next_iteration.copy()
-            self._hypercubes += [cube for cube in next_iteration]
-
-    @staticmethod
-    def _find_couples(to_split: Iterable[HyperCube], not_in_cache: Iterable[HyperCube],
-                      adjacent_cache: dict[tuple[HyperCube, HyperCube], str | None]) -> \
-            Iterable[tuple[HyperCube, HyperCube, str]]:
-        checked = []
-        eligible = []
-        for cube in to_split:
-            checked.append(cube)
-            for other_cube in [c for c in to_split if c not in checked]:
-                if (cube in not_in_cache) or (other_cube in not_in_cache):
-                    adjacent_cache[(cube, other_cube)] = cube.is_adjacent(other_cube)
-                adjacent_feature = adjacent_cache[(cube, other_cube)]
-                eligible.append((cube, other_cube, adjacent_feature))
-        return [couple for couple in eligible if couple[2] is not None]
-
-    def _evaluate_merge(self, not_in_cache: Iterable[HyperCube],
-                        dataframe: pd.DataFrame, feature: str,
-                        cube: HyperCube, other_cube: HyperCube,
-                        merge_cache: dict[(HyperCube, HyperCube), HyperCube | None]) -> bool:
-        if (cube in not_in_cache) or (other_cube in not_in_cache):
-            merged_cube = cube.merge_along_dimension(other_cube, feature)
-            merged_cube.update(dataframe, self.predictor)
-            merge_cache[(cube, other_cube)] = merged_cube
-        return cube.output == other_cube.output if self._output == Target.CLASSIFICATION else \
-            merge_cache[(cube, other_cube)].diversity < self.threshold
-
-    def _merge(self, to_split: Iterable[HyperCube], dataframe: pd.DataFrame) -> Iterable[HyperCube]:
-        not_in_cache = [cube for cube in to_split]
-        adjacent_cache = {}
-        merge_cache = {}
-        cont = True
-        while cont:
-            to_merge = [([cube, other_cube], merge_cache[(cube, other_cube)]) for cube, other_cube, feature in
-                        GridEx._find_couples(to_split, not_in_cache, adjacent_cache) if
-                        self._evaluate_merge(not_in_cache, dataframe, feature, cube, other_cube, merge_cache)]
-            if len(to_merge) == 0:
-                cont = False
-            else:
-                sorted(to_merge, key=lambda c: c[1].diversity)
-                best = to_merge[0]
-                to_split = [cube for cube in to_split if cube not in best[0]] + [best[1]]
-                not_in_cache = [best[1]]
-        return to_split
+                next_iteration.extend(self._merge(to_split, fake))
+            prev = next_iteration
+        self._hypercubes.extend(prev)

     def make_fair(self, features: Iterable[str]):
         self.grid.make_fair(features)
psyke/extraction/hypercubic/hypercube.py CHANGED
@@ -143,10 +143,9 @@ class HyperCube:
         self._default = True

     def set_infinite(self, dimension: str, direction: str):
-        if dimension in self._infinite_dimensions:
-            self._infinite_dimensions[dimension].append(direction)
-        else:
-            self._infinite_dimensions[dimension] = [direction]
+        if dimension not in self._infinite_dimensions:
+            self._infinite_dimensions[dimension] = set()
+        self._infinite_dimensions[dimension].add(direction)

     def copy_infinite_dimensions(self, dimensions: dict[str, str]):
         self._infinite_dimensions = dimensions.copy()
@@ -260,13 +259,15 @@ class HyperCube:
                 if not self.is_default and value is not None]

     @staticmethod
-    def create_surrounding_cube(dataset: pd.DataFrame, closed: bool = False,
-                                output=None) -> GenericCube:
+    def create_surrounding_cube(dataset: pd.DataFrame, closed: bool = False, output=None,
+                                features_to_ignore: Iterable[str] = []) -> GenericCube:
         output = Target.CONSTANT if output is None else output
         dimensions = {
             column: (min(dataset[column]) - HyperCube.EPSILON * 2, max(dataset[column]) + HyperCube.EPSILON * 2)
             for column in dataset.columns[:-1]
         }
+        for column in features_to_ignore:
+            dimensions[column] = (-np.inf, np.inf)
         if closed:
             if output == Target.CONSTANT:
                 return ClosedCube(dimensions)
@@ -432,14 +433,16 @@ class HyperCube:
         else:
             self.update_dimension(feature, (lower, upper))

-    def update(self, dataset: pd.DataFrame, predictor) -> None:
-        filtered = self.filter_dataframe(dataset.iloc[:, :-1])
-        predictions = predictor.predict(filtered)
-        self._output = np.mean(predictions)
-        self._diversity = np.std(predictions)
-        self._error = (abs(predictions - self._output)).mean()
-        means = filtered.describe().loc['mean']
-        self._barycenter = Point(means.index.values, means.values)
+    def update(self, dataset: pd.DataFrame, predictor=None) -> None:
+        idx = self.filter_indices(dataset.iloc[:, :-1])
+        filtered = dataset.iloc[idx, :-1]
+        if len(filtered > 0):
+            predictions = dataset.iloc[idx, -1] if predictor is None else predictor.predict(filtered)
+            self._output = np.mean(predictions)
+            self._diversity = np.std(predictions)
+            self._error = (abs(predictions - self._output)).mean()
+            means = filtered.describe().loc['mean']
+            self._barycenter = Point(means.index.values, means.values)

     # TODO: why this is not a property?
     def init_diversity(self, std: float) -> None:
@@ -450,10 +453,11 @@ class RegressionCube(HyperCube):
     def __init__(self, dimension: dict[str, tuple] = None, limits: set[Limit] = None, output=None):
         super().__init__(dimension=dimension, limits=limits, output=LinearRegression() if output is None else output)

-    def update(self, dataset: pd.DataFrame, predictor) -> None:
-        filtered = self.filter_dataframe(dataset.iloc[:, :-1])
+    def update(self, dataset: pd.DataFrame, predictor=None) -> None:
+        idx = self.filter_indices(dataset.iloc[:, :-1])
+        filtered = dataset.iloc[idx, :-1]
         if len(filtered > 0):
-            predictions = predictor.predict(filtered)
+            predictions = dataset.iloc[idx, -1] if predictor is None else predictor.predict(filtered)
             self._output.fit(filtered, predictions)
             self._diversity = self._error = (abs(self._output.predict(filtered) - predictions)).mean()
             means = filtered.describe().loc['mean']
@@ -489,10 +493,11 @@ class ClassificationCube(HyperCube):
     def __init__(self, dimension: dict[str, tuple] = None, limits: set[Limit] = None, output: str = ""):
         super().__init__(dimension=dimension, limits=limits, output=output)

-    def update(self, dataset: pd.DataFrame, predictor) -> None:
-        filtered = self.filter_dataframe(dataset.iloc[:, :-1])
+    def update(self, dataset: pd.DataFrame, predictor=None) -> None:
+        idx = self.filter_indices(dataset.iloc[:, :-1])
+        filtered = dataset.iloc[idx, :-1]
         if len(filtered > 0):
-            predictions = predictor.predict(filtered)
+            predictions = dataset.iloc[idx, -1] if predictor is None else predictor.predict(filtered)
             self._output = mode(predictions)
             self._diversity = self._error = 1 - sum(p == self.output for p in predictions) / len(predictions)
             means = filtered.describe().loc['mean']
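
Taken together, the two signature changes let a cube be refreshed straight from labelled data; a minimal sketch with a toy dataframe (no fitted model needed, and the column names are invented):

    import pandas as pd
    from psyke.extraction.hypercubic import HyperCube

    df = pd.DataFrame({'a': [0.1, 0.4, 0.9], 'b': [1.0, 2.0, 3.0], 'y': [0.0, 1.0, 1.0]})

    # Ignored features get (-inf, inf) bounds, so the cube never constrains them.
    cube = HyperCube.create_surrounding_cube(df, features_to_ignore=['b'])

    cube.update(df)            # predictor=None: the last column supplies the targets
    # cube.update(df, model)   # or query a fitted predictor, as before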
psyke/genetic/__init__.py ADDED
File without changes
psyke/genetic/gin/__init__.py ADDED
@@ -0,0 +1,106 @@
+import numpy as np
+from deap import base, creator, tools, algorithms
+import random
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import mean_absolute_error, r2_score
+from sklearn.preprocessing import PolynomialFeatures
+
+
+class GIn:
+
+    def __init__(self, train, valid, features, sigmas, slices, min_rules=1, poly=1,
+                 alpha=0.5, indpb=0.5, tournsize=3, metric='R2', warm=False):
+        self.X, self.y = train
+        self.valid = valid
+
+        self.features = features
+        self.sigmas = sigmas
+        self.slices = slices
+        self.min_rules = min_rules
+        self.poly = PolynomialFeatures(degree=poly, include_bias=False)
+
+        self.alpha = alpha
+        self.indpb = indpb
+        self.tournsize = tournsize
+        self.metric = metric
+
+        self.toolbox = None
+        self.stats = None
+        self.hof = None
+
+        self.setup(warm)
+
+    def region(self, X, cuts):
+        indices = [np.searchsorted(np.array(cut), X[f].to_numpy(), side='right')
+                   for cut, f in zip(cuts, self.features)]
+
+        regions = np.zeros(len(X), dtype=int)
+        multiplier = 1
+        for idx, n in zip(reversed(indices), reversed([len(cut) + 1 for cut in cuts])):
+            regions += idx * multiplier
+            multiplier *= n
+
+        return regions
+
+    def evaluate(self, individual):
+        to_pred, true = self.valid or (self.X, self.y)
+        boundaries = np.cumsum([0] + list(self.slices))
+        cuts = [sorted(individual[boundaries[i]:boundaries[i + 1]]) for i in range(len(self.slices))]
+
+        regions = self.region(to_pred, cuts)
+        regionsT = self.region(self.X, cuts)
+
+        y_pred = np.zeros(len(to_pred))
+        valid_regions = 0
+
+        for r in range(np.prod([s + 1 for s in self.slices])):
+            mask = regions == r
+            maskT = regionsT == r
+            if min(mask.sum(), maskT.sum()) < 3:
+                y_pred[mask] = np.mean(self.y)
+                continue
+            y_pred[mask] = LinearRegression().fit(self.poly.fit_transform(self.X)[maskT], self.y[maskT]).predict(
+                self.poly.fit_transform(to_pred)[mask])
+            valid_regions += 1
+
+        if valid_regions < self.min_rules:
+            return -9999,
+
+        return (r2_score(true, y_pred) if self.metric == 'R2' else -mean_absolute_error(true, y_pred)),
+
+    def setup(self, warm=False):
+        if not warm:
+            creator.create("FitnessMax", base.Fitness, weights=(1.0,))
+            creator.create("Individual", list, fitness=creator.FitnessMax)
+
+        self.toolbox = base.Toolbox()
+        for f in self.features:
+            self.toolbox.register(f, random.uniform, self.X[f].min(), self.X[f].max())
+
+        self.toolbox.register("individual", tools.initCycle, creator.Individual,
+                              (sum([[getattr(self.toolbox, f) for i in range(s)]
+                                    for f, s in zip(self.features, self.slices)], [])), n=1)
+
+        self.toolbox.register("population", tools.initRepeat, list, self.toolbox.individual)
+
+        self.toolbox.register("mate", tools.cxBlend, alpha=self.alpha)
+        self.toolbox.register("mutate", tools.mutGaussian, indpb=self.indpb, mu=0,
+                              sigma=sum([[sig] * s for sig, s in zip(self.sigmas, self.slices)], []))
+        self.toolbox.register("select", tools.selTournament, tournsize=self.tournsize)
+        self.toolbox.register("evaluate", self.evaluate)
+
+        self.stats = tools.Statistics(lambda ind: ind.fitness.values[0])
+        self.stats.register("avg", np.mean)
+        # self.stats.register("min", np.min)
+        self.stats.register("max", np.max)
+        # self.stats.register("std", np.std)
+
+        self.hof = tools.HallOfFame(1)
+
+    def run(self, n_pop=30, cxpb=0.8, mutpb=0.5, n_gen=50, seed=123):
+        random.seed(seed)
+        pop = self.toolbox.population(n=n_pop)
+        result, log = algorithms.eaSimple(pop, self.toolbox, cxpb=cxpb, mutpb=mutpb, ngen=n_gen,
+                                          stats=self.stats, halloffame=self.hof, verbose=False)
+        best = tools.selBest(pop, 1)[0]
+        return best, self.evaluate(best)[0], result, log
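
And a corresponding sketch of driving GIn directly (inside the package it is normally constructed by GInGER); the toy data and parameter values are illustrative:

    import numpy as np
    import pandas as pd
    from psyke.genetic.gin import GIn

    rng = np.random.default_rng(0)
    X = pd.DataFrame({'x1': rng.uniform(0, 1, 200), 'x2': rng.uniform(0, 1, 200)})
    y = pd.Series(2 * X['x1'] + np.sin(6 * X['x2']))

    # One evolved cut on x1, two on x2; sigmas drive the Gaussian mutation.
    gin = GIn((X, y), valid=None, features=['x1', 'x2'], sigmas=[0.1, 0.1], slices=(1, 2))
    best, score, _, _ = gin.run(n_gen=10, n_pop=20)
    print(score)  # fitness (R2 by default) of the best individual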
psyke-0.9.1.dev12.dist-info/METADATA → psyke-0.9.1.dev43.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: psyke
-Version: 0.9.1.dev12
+Version: 0.9.1.dev43
 Summary: Python-based implementation of PSyKE, i.e. a Platform for Symbolic Knowledge Extraction
 Home-page: https://github.com/psykei/psyke-python
 Author: Matteo Magnini
psyke-0.9.1.dev12.dist-info/RECORD → psyke-0.9.1.dev43.dist-info/RECORD RENAMED
@@ -1,22 +1,23 @@
-psyke/__init__.py,sha256=J8lzVDJaGAin9TpDB6cXgAQegopVBuqURaWkkxjptR8,22490
+psyke/__init__.py,sha256=4GYagtqZnAOBz3VKyOgEp-yS5-16J7x2J9PoOkdQ9-U,23282
 psyke/hypercubepredictor.py,sha256=Pg8F2R_NHrNgFHx92s32BorYHMVvaxpEh4GtCsoyB2U,4620
-psyke/clustering/__init__.py,sha256=36MokTVwwWR_-o0mesvXHaYEYVTK2pn2m0ZY4G3Y3qU,581
-psyke/clustering/utils.py,sha256=S0YwCKyHVYp9qUAQVzCMrTwcQFPJ5TD14Jwn10DE-Z4,1616
-psyke/clustering/cream/__init__.py,sha256=W6k7vdjuUdA_azYA4vb5JtpWrofhDJ0DbM2jsnRKzfw,2994
-psyke/clustering/exact/__init__.py,sha256=s4MPvGZ6gle3X9WH3YFHOEdinGcXIXh-7EFRcElWzsQ,5275
+psyke/clustering/__init__.py,sha256=LfLZY2UwHY9xlFT4SMGGbyFY5S6sMXndY-UMaJIJtd8,714
+psyke/clustering/utils.py,sha256=BqMPKJ-r6CdxXwyk-2AvkPV4DBnZF5WUNz2fKiXbhlw,1596
+psyke/clustering/cream/__init__.py,sha256=TtUd5IyfavSUZeSuSAr06ZftVhK30ZLZGUUfH3ZQG2w,2799
+psyke/clustering/exact/__init__.py,sha256=OMMxc_lIKHouZTpbLF7dt4dOA3chL4XFpjOCyc6GTTY,5545
 psyke/extraction/__init__.py,sha256=Q0i6wMzCdU7CkxhzWoD8H_a6XId6bfEx6LZbSJmTqm0,936
 psyke/extraction/cart/CartPredictor.py,sha256=YhEuaENLWixu379sIXZkFeCNc8GBnxLnR6TPCQR7sps,5743
-psyke/extraction/cart/FairTree.py,sha256=49ciVmqgSa6hNL2axYi-oN4DSsqrHdewH76TTYiR8x0,7529
+psyke/extraction/cart/FairTree.py,sha256=mccoLDrSNy6iivqFZ23m33hxIB_kPXa3mNL1ukfb5Ls,7624
 psyke/extraction/cart/FairTreePredictor.py,sha256=7z4oLqflkRMqqVW_UIlrGsQrvROM4sXUfY7LPQJ662g,2321
 psyke/extraction/cart/__init__.py,sha256=SsjAJiL4n6q_GNR6H8PNfhTkAZ67Ka7NRvVRxCULBhQ,3191
-psyke/extraction/hypercubic/__init__.py,sha256=SK-I9IPQEdpYVTkFGa8No803QMwYSqgTTzinry4KLew,10896
-psyke/extraction/hypercubic/hypercube.py,sha256=s1fuGOZfN2ZE21C7f6-b1T3Ta_934c4rwDLD_pBWwFk,25847
+psyke/extraction/hypercubic/__init__.py,sha256=AxvPJxEQzL9Diyi7l2kX3zZESZ9xbh6RFp7ffs0w2ic,13112
+psyke/extraction/hypercubic/hypercube.py,sha256=Pz-F6RkAKLT5e86L29khqLjKTJ7k2TZszdRdxddVFtA,26275
 psyke/extraction/hypercubic/strategy.py,sha256=m9BGSrKc-VadgEQTOPow85hBPFqMIt0J99nCFIh4NUs,1839
 psyke/extraction/hypercubic/utils.py,sha256=D2FN5CCm_T3h23DmLFoTnIcFo7LvIq__ktl4hjUqkcA,1525
 psyke/extraction/hypercubic/cosmik/__init__.py,sha256=XQUvOtMFpR0vMHYtwIVl3G626HMqN8Clt6BqNm4nvFs,1880
-psyke/extraction/hypercubic/creepy/__init__.py,sha256=Cglj1mmj6VM_YBKgfNN0uDIbZb2_YS1RtPc75ftXWP8,1744
+psyke/extraction/hypercubic/creepy/__init__.py,sha256=x8a1ftoYHixGpiDfM3u-6QBEDYmaSlPIRIuAOCx573w,2056
 psyke/extraction/hypercubic/divine/__init__.py,sha256=ClO8CITKKXoo7nhlBJagR1yAachsxLHYQlqggl-9eGE,3665
-psyke/extraction/hypercubic/gridex/__init__.py,sha256=_g_JC6eFKLeg_CtkQawsUpVUAVxiVPQFJbfRVhMaBLg,5632
+psyke/extraction/hypercubic/ginger/__init__.py,sha256=GZOKy_iLmYOuXcRrPhNEDP5ZA2Ez2Cjb6GuervSAD4Y,4476
+psyke/extraction/hypercubic/gridex/__init__.py,sha256=tPPLGRJ-7fCt-OB-qq6W7EV0hqEuQVUGlXs2yyABo98,3161
 psyke/extraction/hypercubic/gridrex/__init__.py,sha256=h9usK5tFqd6ngBmRydsgkfQ1jlcQKj2uG72Tr1puFHk,595
 psyke/extraction/hypercubic/hex/__init__.py,sha256=553AZjOT9thfqDGtVDI6WtgYNex2Y6dg53cEyuf7Q80,4805
 psyke/extraction/hypercubic/iter/__init__.py,sha256=bb0neiPcNlyyr-OUUjgw4vdkehnAsoyJzVJ88jAHtQ8,10233
@@ -24,6 +25,8 @@ psyke/extraction/real/__init__.py,sha256=zAE_syurDqmFiopD5oLeIs9bROiuXy06wxoHmVq
 psyke/extraction/real/utils.py,sha256=4NNL15Eu7cmkG9b29GBP6CKgMTV1cmiJVS0k1MbWpIs,2148
 psyke/extraction/trepan/__init__.py,sha256=H8F_wpFLPcfyx2tgOOno8FwUomxfVxVl1vxlb0ClP1g,6931
 psyke/extraction/trepan/utils.py,sha256=iSUJ1ooNQT_VO1KfBZuIUeUsyUbGdQf_pSEE87vMeQg,2320
+psyke/genetic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+psyke/genetic/gin/__init__.py,sha256=liRG5kenjPnMlr4pDGIshLDGmwK-GYrFCKFlWUDk1YU,4179
 psyke/schema/__init__.py,sha256=axv4ejZY0ItUwrC9IXb_yAhaQL5f1vwvXXmaIAHJmt0,26063
 psyke/tuning/__init__.py,sha256=yd_ForFmHeYbtRXltY1fOa-mPJvpE6ijzg50M_8Sdxw,3649
 psyke/tuning/crash/__init__.py,sha256=zIHEF75EFy_mRIieqzP04qKLG3GLsSc_mYZHpPfkzxU,2623
@@ -35,8 +38,8 @@ psyke/utils/logic.py,sha256=ioP25WMTYNYEzaRDNDe3kGNWqZ6DA_63t19d-ky_2kM,12227
 psyke/utils/metrics.py,sha256=Oo5BOonOSfo0qYsXWT5dmypZ7jiStByFC2MKEU0uMHg,2250
 psyke/utils/plot.py,sha256=dE8JJ6tQ0Ezosid-r2jqAisREjFe5LqExRzsVi5Ns-c,7785
 psyke/utils/sorted.py,sha256=C3CPW2JisND30BRk5c1sAAHs3Lb_wsRB2qZrYFuRnfM,678
-psyke-0.9.1.dev12.dist-info/licenses/LICENSE,sha256=G3mPaubObvkBXbsgTTeYGLk_pNEW8tc7HZr4u_wLEpU,11398
-psyke-0.9.1.dev12.dist-info/METADATA,sha256=R8bTf2T9Hf2Qc35iVKAINxAm5kiqRpF79HRUFYPwXcc,8395
-psyke-0.9.1.dev12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-psyke-0.9.1.dev12.dist-info/top_level.txt,sha256=q1HglxOqqoIRukFtyis_ZNHczZg4gANRUPWkD7HAUTU,6
-psyke-0.9.1.dev12.dist-info/RECORD,,
+psyke-0.9.1.dev43.dist-info/licenses/LICENSE,sha256=G3mPaubObvkBXbsgTTeYGLk_pNEW8tc7HZr4u_wLEpU,11398
+psyke-0.9.1.dev43.dist-info/METADATA,sha256=8P6RB9OhVsNHokg8Q1meg5zpigQoaigC58y8EtrQHCo,8395
+psyke-0.9.1.dev43.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+psyke-0.9.1.dev43.dist-info/top_level.txt,sha256=q1HglxOqqoIRukFtyis_ZNHczZg4gANRUPWkD7HAUTU,6
+psyke-0.9.1.dev43.dist-info/RECORD,,