psyke 0.4.9.dev6__py3-none-any.whl → 1.0.4.dev10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- psyke/__init__.py +231 -85
- psyke/clustering/__init__.py +9 -4
- psyke/clustering/cream/__init__.py +6 -10
- psyke/clustering/exact/__init__.py +17 -11
- psyke/clustering/utils.py +0 -1
- psyke/extraction/__init__.py +25 -0
- psyke/extraction/cart/CartPredictor.py +128 -0
- psyke/extraction/cart/FairTree.py +205 -0
- psyke/extraction/cart/FairTreePredictor.py +56 -0
- psyke/extraction/cart/__init__.py +48 -62
- psyke/extraction/hypercubic/__init__.py +187 -47
- psyke/extraction/hypercubic/cosmik/__init__.py +47 -0
- psyke/extraction/hypercubic/creepy/__init__.py +24 -29
- psyke/extraction/hypercubic/divine/__init__.py +86 -0
- psyke/extraction/hypercubic/ginger/__init__.py +100 -0
- psyke/extraction/hypercubic/gridex/__init__.py +45 -84
- psyke/extraction/hypercubic/gridrex/__init__.py +4 -4
- psyke/extraction/hypercubic/hex/__init__.py +104 -0
- psyke/extraction/hypercubic/hypercube.py +275 -72
- psyke/extraction/hypercubic/iter/__init__.py +45 -46
- psyke/extraction/hypercubic/strategy.py +13 -9
- psyke/extraction/real/__init__.py +24 -29
- psyke/extraction/real/utils.py +2 -2
- psyke/extraction/trepan/__init__.py +24 -19
- psyke/genetic/__init__.py +0 -0
- psyke/genetic/fgin/__init__.py +74 -0
- psyke/genetic/gin/__init__.py +144 -0
- psyke/hypercubepredictor.py +102 -0
- psyke/schema/__init__.py +230 -36
- psyke/tuning/__init__.py +40 -28
- psyke/tuning/crash/__init__.py +33 -64
- psyke/tuning/orchid/__init__.py +21 -23
- psyke/tuning/pedro/__init__.py +70 -56
- psyke/utils/logic.py +8 -8
- psyke/utils/plot.py +79 -3
- {psyke-0.4.9.dev6.dist-info → psyke-1.0.4.dev10.dist-info}/METADATA +42 -22
- psyke-1.0.4.dev10.dist-info/RECORD +46 -0
- {psyke-0.4.9.dev6.dist-info → psyke-1.0.4.dev10.dist-info}/WHEEL +1 -1
- {psyke-0.4.9.dev6.dist-info → psyke-1.0.4.dev10.dist-info/licenses}/LICENSE +2 -1
- psyke/extraction/cart/predictor.py +0 -73
- psyke-0.4.9.dev6.dist-info/RECORD +0 -36
- {psyke-0.4.9.dev6.dist-info → psyke-1.0.4.dev10.dist-info}/top_level.txt +0 -0
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
|
-
import random as rnd
|
|
3
2
|
from itertools import product
|
|
4
3
|
from typing import Iterable
|
|
5
4
|
import numpy as np
|
|
6
5
|
import pandas as pd
|
|
6
|
+
from sklearn.base import ClassifierMixin
|
|
7
7
|
from tuprolog.theory import Theory
|
|
8
|
-
from psyke import get_default_random_seed
|
|
8
|
+
from psyke import get_default_random_seed
|
|
9
9
|
from psyke.utils import Target
|
|
10
10
|
from psyke.extraction.hypercubic import HyperCubeExtractor, Grid, HyperCube
|
|
11
11
|
|
|
@@ -15,102 +15,63 @@ class GridEx(HyperCubeExtractor):
|
|
|
15
15
|
Explanator implementing GridEx algorithm, doi:10.1007/978-3-030-82017-6_2.
|
|
16
16
|
"""
|
|
17
17
|
|
|
18
|
-
def __init__(self, predictor, grid: Grid, min_examples: int, threshold: float,
|
|
19
|
-
seed=get_default_random_seed()):
|
|
20
|
-
super().__init__(predictor, Target.
|
|
18
|
+
def __init__(self, predictor, grid: Grid, min_examples: int, threshold: float, output: Target = Target.CONSTANT,
             discretization=None, normalization=None, seed: int = get_default_random_seed()):
    """
    Configure the extractor.

    :param predictor: the model to explain; a scikit-learn ``ClassifierMixin``
        instance forces the target kind to ``Target.CLASSIFICATION``.
    :param grid: the partitioning strategy, stored as ``self.grid``.
    :param min_examples: stored as ``self.min_examples``.
    :param threshold: stored as ``self.threshold``.
    :param output: target kind forwarded to the parent constructor when the
        predictor is not a classifier.
    :param discretization: forwarded unchanged to the parent constructor.
    :param normalization: forwarded unchanged to the parent constructor.
    :param seed: value passed to ``np.random.seed``.
    """
    if isinstance(predictor, ClassifierMixin):
        target = Target.CLASSIFICATION
    else:
        target = output
    super().__init__(predictor, target, discretization, normalization)
    self.grid = grid
    self.min_examples = min_examples
    self.threshold = threshold
    # NOTE: this seeds NumPy's global random state, not a private generator.
    np.random.seed(seed)
|
|
25
26
|
|
|
26
|
-
def _extract(self, dataframe: pd.DataFrame
|
|
27
|
+
def _extract(self, dataframe: pd.DataFrame) -> Theory:
|
|
27
28
|
self._hypercubes = []
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
self._iterate(surrounding, dataframe)
|
|
33
|
-
return self._create_theory(dataframe, sort)
|
|
29
|
+
self._surrounding = HyperCube.create_surrounding_cube(dataframe, output=self._output)
|
|
30
|
+
self._surrounding.init_diversity(2 * self.threshold)
|
|
31
|
+
self._iterate(dataframe)
|
|
32
|
+
return self._create_theory(dataframe)
|
|
34
33
|
|
|
35
|
-
def
|
|
36
|
-
|
|
37
|
-
|
|
34
|
+
def _create_ranges(self, cube, iteration):
|
|
35
|
+
ranges = {}
|
|
36
|
+
for (feature, (a, b)) in cube.dimensions.items():
|
|
37
|
+
n_bins = self.grid.get(feature, iteration)
|
|
38
|
+
if n_bins == 1:
|
|
39
|
+
ranges[feature] = [(a, b)]
|
|
40
|
+
self._dimensions_to_ignore.add(feature)
|
|
41
|
+
else:
|
|
42
|
+
size = (b - a) / n_bins
|
|
43
|
+
ranges[feature] = [(a + size * i, a + size * (i + 1)) for i in range(n_bins)]
|
|
44
|
+
return ranges
|
|
38
45
|
|
|
39
|
-
def
|
|
46
|
+
def _cubes_to_split(self, cube, iteration, dataframe, fake, keep_empty=False):
|
|
47
|
+
to_split = []
|
|
48
|
+
for p in product(*self._create_ranges(cube, iteration).values()):
|
|
49
|
+
cube = self._default_cube()
|
|
50
|
+
for i, f in enumerate(dataframe.columns[:-1]):
|
|
51
|
+
cube.update_dimension(f, p[i])
|
|
52
|
+
n = cube.count(dataframe)
|
|
53
|
+
if n > 0 or keep_empty:
|
|
54
|
+
fake = pd.concat([fake, cube.create_samples(self.min_examples - n)])
|
|
55
|
+
cube.update(fake, self.predictor)
|
|
56
|
+
to_split.append(cube)
|
|
57
|
+
return to_split, fake
|
|
58
|
+
|
|
59
|
+
def _iterate(self, dataframe: pd.DataFrame):
|
|
40
60
|
fake = dataframe.copy()
|
|
41
|
-
prev = [
|
|
42
|
-
next_iteration = []
|
|
61
|
+
prev = [self._surrounding]
|
|
43
62
|
|
|
44
63
|
for iteration in self.grid.iterate():
|
|
45
64
|
next_iteration = []
|
|
46
65
|
for cube in prev:
|
|
47
|
-
to_split = []
|
|
48
66
|
if cube.count(dataframe) == 0:
|
|
49
67
|
continue
|
|
50
68
|
if cube.diversity < self.threshold:
|
|
51
|
-
self._hypercubes
|
|
69
|
+
self._hypercubes.append(cube)
|
|
52
70
|
continue
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
size = (b - a) / n_bins
|
|
58
|
-
for i in range(n_bins):
|
|
59
|
-
bins.append((a + size * i, a + size * (i + 1)))
|
|
60
|
-
ranges[feature] = bins
|
|
61
|
-
for (pn, p) in enumerate(list(product(*ranges.values()))):
|
|
62
|
-
cube = self._default_cube()
|
|
63
|
-
for i, f in enumerate(dataframe.columns[:-1]):
|
|
64
|
-
cube.update_dimension(f, p[i])
|
|
65
|
-
n = cube.count(dataframe)
|
|
66
|
-
if n > 0:
|
|
67
|
-
fake = pd.concat([fake, cube.create_samples(self.min_examples - n, self.__generator)])
|
|
68
|
-
cube.update(fake, self.predictor)
|
|
69
|
-
to_split += [cube]
|
|
70
|
-
to_split = self._merge(to_split, fake)
|
|
71
|
-
next_iteration += [cube for cube in to_split]
|
|
72
|
-
prev = next_iteration.copy()
|
|
73
|
-
self._hypercubes += [cube for cube in next_iteration]
|
|
74
|
-
|
|
75
|
-
@staticmethod
|
|
76
|
-
def _find_couples(to_split: Iterable[HyperCube], not_in_cache: Iterable[HyperCube],
|
|
77
|
-
adjacent_cache: dict[tuple[HyperCube, HyperCube], str | None]) -> \
|
|
78
|
-
Iterable[tuple[HyperCube, HyperCube, str]]:
|
|
79
|
-
checked = []
|
|
80
|
-
eligible = []
|
|
81
|
-
for cube in to_split:
|
|
82
|
-
checked.append(cube)
|
|
83
|
-
for other_cube in [c for c in to_split if c not in checked]:
|
|
84
|
-
if (cube in not_in_cache) or (other_cube in not_in_cache):
|
|
85
|
-
adjacent_cache[(cube, other_cube)] = cube.is_adjacent(other_cube)
|
|
86
|
-
adjacent_feature = adjacent_cache[(cube, other_cube)]
|
|
87
|
-
eligible.append((cube, other_cube, adjacent_feature))
|
|
88
|
-
return [couple for couple in eligible if couple[2] is not None]
|
|
89
|
-
|
|
90
|
-
def _evaluate_merge(self, not_in_cache: Iterable[HyperCube],
|
|
91
|
-
dataframe: pd.DataFrame, feature: str,
|
|
92
|
-
cube: HyperCube, other_cube: HyperCube,
|
|
93
|
-
merge_cache: dict[(HyperCube, HyperCube), HyperCube | None]) -> bool:
|
|
94
|
-
if (cube in not_in_cache) or (other_cube in not_in_cache):
|
|
95
|
-
merged_cube = cube.merge_along_dimension(other_cube, feature)
|
|
96
|
-
merged_cube.update(dataframe, self.predictor)
|
|
97
|
-
merge_cache[(cube, other_cube)] = merged_cube
|
|
98
|
-
return cube.output == other_cube.output if self._output == Target.CLASSIFICATION else \
|
|
99
|
-
merge_cache[(cube, other_cube)].diversity < self.threshold
|
|
71
|
+
to_split, fake = self._cubes_to_split(cube, iteration, dataframe, fake)
|
|
72
|
+
next_iteration.extend(self._merge(to_split, fake))
|
|
73
|
+
prev = next_iteration
|
|
74
|
+
self._hypercubes.extend(prev)
|
|
100
75
|
|
|
101
|
-
def
|
|
102
|
-
|
|
103
|
-
adjacent_cache = {}
|
|
104
|
-
merge_cache = {}
|
|
105
|
-
# TODO: refactor this. A while true with a break is as ugly as hunger.
|
|
106
|
-
while True:
|
|
107
|
-
to_merge = [([cube, other_cube], merge_cache[(cube, other_cube)]) for cube, other_cube, feature in
|
|
108
|
-
GridEx._find_couples(to_split, not_in_cache, adjacent_cache) if
|
|
109
|
-
self._evaluate_merge(not_in_cache, dataframe, feature, cube, other_cube, merge_cache)]
|
|
110
|
-
if len(to_merge) == 0:
|
|
111
|
-
break
|
|
112
|
-
sorted(to_merge, key=lambda c: c[1].diversity)
|
|
113
|
-
best = to_merge[0]
|
|
114
|
-
to_split = [cube for cube in to_split if cube not in best[0]] + [best[1]]
|
|
115
|
-
not_in_cache = [best[1]]
|
|
116
|
-
return to_split
|
|
76
|
+
def make_fair(self, features: Iterable[str]):
    """
    Forward ``features`` to ``self.grid.make_fair``.

    :param features: feature names handed unchanged to the grid.
    """
    grid = self.grid
    grid.make_fair(features)
|
|
@@ -1,16 +1,16 @@
|
|
|
1
|
-
from psyke import get_default_random_seed
|
|
1
|
+
from psyke import get_default_random_seed, Target
|
|
2
2
|
from psyke.extraction.hypercubic import Grid, RegressionCube
|
|
3
3
|
from psyke.extraction.hypercubic.gridex import GridEx
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
class GridREx(GridEx):
|
|
7
7
|
"""
|
|
8
|
-
Explanator implementing GridREx algorithm.
|
|
8
|
+
Explanator implementing GridREx algorithm, doi:10.24963/kr.2022/57.
|
|
9
9
|
"""
|
|
10
10
|
|
|
11
11
|
def __init__(self, predictor, grid: Grid, min_examples: int, threshold: float, normalization,
|
|
12
12
|
seed=get_default_random_seed()):
|
|
13
|
-
super().__init__(predictor, grid, min_examples, threshold, normalization, seed)
|
|
13
|
+
super().__init__(predictor, grid, min_examples, threshold, Target.REGRESSION, None, normalization, seed)
|
|
14
14
|
|
|
15
|
-
def _default_cube(self) -> RegressionCube:
|
|
15
|
+
def _default_cube(self, dimensions=None) -> RegressionCube:
|
|
16
16
|
return RegressionCube()
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Iterable
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from psyke import get_default_random_seed, Target
|
|
9
|
+
from psyke.extraction.hypercubic import Grid, HyperCube, GenericCube, ClassificationCube
|
|
10
|
+
from psyke.extraction.hypercubic.gridex import GridEx
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class HEx(GridEx):
    """
    Explanator implementing HEx algorithm.

    The cubes produced by the GridEx partitioning are arranged in a tree of
    ``HEx.Node`` objects; only nodes flagged as a "gain" with respect to their
    ancestors end up in ``self._hypercubes``.
    """

    class Node:
        """
        Tree node wrapping one cube.

        ``gain`` is ``True`` for the root (no parent); for every other node it
        is the result of ``check()``.
        """

        def __init__(self, cube: GenericCube, parent: HEx.Node = None, threshold: float = None):
            self.cube = cube
            self.parent = parent
            self.threshold = threshold
            self.children: Iterable[HEx.Node] = []
            # check() reads cube, parent and threshold, so it must run last.
            self.gain = self.check() if parent is not None else True

        def check(self) -> bool:
            """
            Compare this node with its closest ancestor whose ``gain`` is true.

            For a ``ClassificationCube`` ancestor the node is a gain when the
            outputs differ; otherwise when the ancestor's error exceeds this
            node's error by more than ``threshold * .6``.
            """
            ancestor = self.parent
            try:
                while not ancestor.gain:
                    ancestor = ancestor.parent
            except AttributeError:
                # Walked past the top of the tree without finding a gaining ancestor.
                return True
            reference = ancestor.cube
            if not isinstance(reference, ClassificationCube):
                return reference.error - self.cube.error > self.threshold * .6
            return reference.output != self.cube.output

        def indices(self, dataframe: pd.DataFrame):
            """Return ``cube.filter_indices`` over all columns but the last."""
            features = dataframe.iloc[:, :-1]
            return self.cube.filter_indices(features)

        def eligible_children(self, dataframe) -> Iterable[HEx.Node]:
            """Return the children whose cube counts at least one row of ``dataframe``."""
            populated = []
            for child in self.children:
                if child.cube.count(dataframe) > 0:
                    populated.append(child)
            return populated

        def permanent_children(self, dataframe) -> Iterable[HEx.Node]:
            """Return the eligible children whose ``gain`` flag is true."""
            return list(filter(lambda child: child.gain, self.eligible_children(dataframe)))

        def permanent_indices(self, dataframe):
            """Return the element-wise OR of the permanent children's index masks."""
            features = dataframe.iloc[:, :-1]
            masks = [child.cube.filter_indices(features) for child in self.permanent_children(dataframe)]
            return np.any(masks, axis=0)

        def update(self, dataframe: pd.DataFrame, predictor, recursive=False):
            """
            Refresh this node's cube on the rows not claimed by permanent children.

            With ``recursive`` set, every child is updated first. The cube is
            updated only if at least one eligible child is a gain, this node
            selects more rows than its permanent children do, and this node is
            itself a gain.

            :return: list of ``(cube, gain)`` pairs, one per eligible child.
            """
            if recursive:
                for child in self.children:
                    child.update(dataframe, predictor, recursive)
            cleaned = [(child.cube, child.gain) for child in self.eligible_children(dataframe)]
            permanent = self.permanent_indices(dataframe)

            has_gaining_child = sum(flag for _, flag in cleaned) > 0
            if has_gaining_child and sum(self.indices(dataframe)) > sum(permanent) and self.gain:
                own_rows = self.indices(dataframe) & ~permanent
                self.cube.update(dataframe[own_rows], predictor)
            return cleaned

        def linearize(self, dataframe, depth=1):
            """
            Flatten the permanent descendants into ``(node, depth)`` pairs.

            Pairs of deeper descendants come first, followed by this node's own
            permanent children tagged with ``depth``.
            """
            kept = self.permanent_children(dataframe)
            flattened = []
            for child in kept:
                flattened.extend(child.linearize(dataframe, depth + 1))
            return flattened + [(child, depth) for child in kept]

    def __init__(self, predictor, grid: Grid, min_examples: int, threshold: float, output: Target = Target.CONSTANT,
                 discretization=None, normalization=None, seed: int = get_default_random_seed()):
        """Forward every argument to ``GridEx`` and set ``_default_surrounding_cube``."""
        super().__init__(predictor, grid, min_examples, threshold, output=output,
                         discretization=discretization, normalization=normalization, seed=seed)
        self._default_surrounding_cube = True

    def _gain(self, parent_cube: GenericCube, new_cube: GenericCube) -> float:
        """
        Tell whether ``new_cube`` improves on ``parent_cube``.

        Same criterion as ``Node.check``; the returned value is the boolean
        result of a comparison.
        """
        if not isinstance(parent_cube, ClassificationCube):
            return parent_cube.error - new_cube.error > self.threshold * .6
        return parent_cube.output != new_cube.output

    def _iterate(self, dataframe: pd.DataFrame):
        """
        Grow the cube tree, then collect the resulting cubes in ``self._hypercubes``.

        At each grid iteration every frontier node whose cube diversity is not
        below ``self.threshold`` is split (empty sub-cubes are kept), updated,
        and its surviving sub-cubes are merged into its new children. Finally
        the permanent nodes are merged depth by depth, deepest first, and the
        surrounding cube is appended when the result is empty or does not
        cover every row of ``dataframe``.
        """
        fake = dataframe.copy()
        self._surrounding.update(dataframe, self.predictor)
        root = HEx.Node(self._surrounding, threshold=self.threshold)
        frontier = [root]

        for iteration in self.grid.iterate():
            upcoming = []
            for node in frontier:
                if node.cube.diversity < self.threshold:
                    continue
                split, fake = self._cubes_to_split(node.cube, iteration, dataframe, fake, True)
                # Temporary children: needed so that node.update() can filter them.
                node.children = [HEx.Node(cube, node, threshold=self.threshold) for cube in split]
                survivors = [cube for cube, _ in node.update(fake, self.predictor, False)]
                merged = self._merge(survivors, fake)
                node.children = [HEx.Node(cube, node, threshold=self.threshold) for cube in merged]
                upcoming.extend(node.children)

            frontier = upcoming
        root.update(fake, self.predictor, True)
        self._hypercubes = []
        linearized = root.linearize(fake)
        depths = sorted(np.unique([level for _, level in linearized]), reverse=True)
        for depth in depths:
            same_level = [node.cube for node, level in linearized if level == depth]
            self._hypercubes += self._merge(same_level, fake)

        if not self._hypercubes:
            self._hypercubes = [self._surrounding]
            return
        features = dataframe.iloc[:, :-1]
        covered = np.any([cube.filter_indices(features) for cube in self._hypercubes], axis=0)
        if not min(covered):
            # At least one row is outside every cube: fall back to the surrounding cube.
            self._hypercubes = self._hypercubes + [self._surrounding]
|