psyke 0.4.9.dev6__py3-none-any.whl → 1.0.4.dev10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- psyke/__init__.py +231 -85
- psyke/clustering/__init__.py +9 -4
- psyke/clustering/cream/__init__.py +6 -10
- psyke/clustering/exact/__init__.py +17 -11
- psyke/clustering/utils.py +0 -1
- psyke/extraction/__init__.py +25 -0
- psyke/extraction/cart/CartPredictor.py +128 -0
- psyke/extraction/cart/FairTree.py +205 -0
- psyke/extraction/cart/FairTreePredictor.py +56 -0
- psyke/extraction/cart/__init__.py +48 -62
- psyke/extraction/hypercubic/__init__.py +187 -47
- psyke/extraction/hypercubic/cosmik/__init__.py +47 -0
- psyke/extraction/hypercubic/creepy/__init__.py +24 -29
- psyke/extraction/hypercubic/divine/__init__.py +86 -0
- psyke/extraction/hypercubic/ginger/__init__.py +100 -0
- psyke/extraction/hypercubic/gridex/__init__.py +45 -84
- psyke/extraction/hypercubic/gridrex/__init__.py +4 -4
- psyke/extraction/hypercubic/hex/__init__.py +104 -0
- psyke/extraction/hypercubic/hypercube.py +275 -72
- psyke/extraction/hypercubic/iter/__init__.py +45 -46
- psyke/extraction/hypercubic/strategy.py +13 -9
- psyke/extraction/real/__init__.py +24 -29
- psyke/extraction/real/utils.py +2 -2
- psyke/extraction/trepan/__init__.py +24 -19
- psyke/genetic/__init__.py +0 -0
- psyke/genetic/fgin/__init__.py +74 -0
- psyke/genetic/gin/__init__.py +144 -0
- psyke/hypercubepredictor.py +102 -0
- psyke/schema/__init__.py +230 -36
- psyke/tuning/__init__.py +40 -28
- psyke/tuning/crash/__init__.py +33 -64
- psyke/tuning/orchid/__init__.py +21 -23
- psyke/tuning/pedro/__init__.py +70 -56
- psyke/utils/logic.py +8 -8
- psyke/utils/plot.py +79 -3
- {psyke-0.4.9.dev6.dist-info → psyke-1.0.4.dev10.dist-info}/METADATA +42 -22
- psyke-1.0.4.dev10.dist-info/RECORD +46 -0
- {psyke-0.4.9.dev6.dist-info → psyke-1.0.4.dev10.dist-info}/WHEEL +1 -1
- {psyke-0.4.9.dev6.dist-info → psyke-1.0.4.dev10.dist-info/licenses}/LICENSE +2 -1
- psyke/extraction/cart/predictor.py +0 -73
- psyke-0.4.9.dev6.dist-info/RECORD +0 -36
- {psyke-0.4.9.dev6.dist-info → psyke-1.0.4.dev10.dist-info}/top_level.txt +0 -0
psyke/extraction/hypercubic/iter/__init__.py
CHANGED

@@ -1,18 +1,14 @@
 from __future__ import annotations
-from random import Random
 from typing import Iterable
 import numpy as np
 import pandas as pd
 from sklearn.base import ClassifierMixin
 from tuprolog.theory import Theory
-from psyke import PedagogicalExtractor
 from psyke.extraction.hypercubic import HyperCube, HyperCubeExtractor
 from psyke.extraction.hypercubic.hypercube import GenericCube
 from psyke.extraction.hypercubic.utils import MinUpdate, Expansion
 from psyke.utils import get_default_random_seed, Target

-DomainProperties = (Iterable[MinUpdate], GenericCube)
-

 class ITER(HyperCubeExtractor):
     """
@@ -20,12 +16,14 @@ class ITER(HyperCubeExtractor):
     """

     def __init__(self, predictor, min_update, n_points, max_iterations, min_examples, threshold, fill_gaps,
-                 normalization, output: Target = Target.CONSTANT,
-
+                 ignore_dimensions: Iterable, normalization, output: Target = Target.CONSTANT,
+                 seed=get_default_random_seed()):
+        super().__init__(predictor, output, normalization=normalization)
         if output is Target.REGRESSION:
             raise NotImplementedError
         self.predictor = predictor
         self.min_update = min_update
+        self._init_points = n_points
         self.n_points = n_points
         self.max_iterations = max_iterations
         self.min_examples = min_examples
@@ -33,13 +31,18 @@ class ITER(HyperCubeExtractor):
         self.fill_gaps = fill_gaps
         self._output = Target.CLASSIFICATION if isinstance(predictor, ClassifierMixin) else \
             output if output is not None else Target.CONSTANT
-        self.
+        self.seed = seed
+        self.ignore_dimensions = ignore_dimensions if ignore_dimensions is not None else []
+
+    def make_fair(self, features: Iterable[str]):
+        self.n_points = self._init_points
+        self.ignore_dimensions += list(features)

     def _best_cube(self, dataframe: pd.DataFrame, cube: GenericCube, cubes: Iterable[Expansion]) -> Expansion | None:
         expansions = []
         for limit in cubes:
             count = limit.cube.count(dataframe)
-            dataframe = pd.concat([dataframe, limit.cube.create_samples(self.min_examples - count
+            dataframe = pd.concat([dataframe, limit.cube.create_samples(self.min_examples - count)])
             limit.cube.update(dataframe, self.predictor)
             expansions.append(Expansion(
                 limit.cube, limit.feature, limit.direction,
@@ -50,24 +53,21 @@ class ITER(HyperCubeExtractor):
             return sorted(expansions, key=lambda e: e.distance)[0]
         return None

-    def _calculate_min_updates(self
+    def _calculate_min_updates(self) -> Iterable[MinUpdate]:
         return [MinUpdate(name, (interval[1] - interval[0]) * self.min_update) for (name, interval) in
-
+                self._surrounding.dimensions.items()]

-
-    def _create_range(cube: GenericCube, domain: DomainProperties, feature: str, direction: str)\
+    def _create_range(self, cube: GenericCube, min_updates: Iterable[MinUpdate], feature: str, direction: str)\
             -> tuple[GenericCube, tuple[float, float]]:
-        min_updates, surrounding = domain
         a, b = cube[feature]
         size = [min_update for min_update in min_updates if min_update.name == feature][0].value
-        return (cube.copy(), (max(a - size,
-            if direction == '-' else (b, min(b + size,
+        return (cube.copy(), (max(a - size, self._surrounding.get_first(feature)), a)
+                if direction == '-' else (b, min(b + size, self._surrounding.get_second(feature))))

-
-    def _create_temp_cube(cube: GenericCube, domain: DomainProperties,
+    def _create_temp_cube(self, cube: GenericCube, min_updates: Iterable[MinUpdate],
                           hypercubes: Iterable[GenericCube], feature: str,
                           direction: str) -> Iterable[Expansion]:
-        temp_cube, values =
+        temp_cube, values = self._create_range(cube, min_updates, feature, direction)
         temp_cube.update_dimension(feature, values)
         overlap = temp_cube.overlap(hypercubes)
         while (overlap is not None) & (temp_cube.has_volume()):
@@ -77,23 +77,24 @@ class ITER(HyperCubeExtractor):
         else:
             cube.add_limit(feature, direction)

-
-    def _create_temp_cubes(cube: GenericCube, domain: DomainProperties,
+    def _create_temp_cubes(self, cube: GenericCube, min_updates: Iterable[MinUpdate],
                            hypercubes: Iterable[GenericCube]) -> Iterable[Expansion]:
         tmp_cubes = []
-        for feature in
+        for feature in self._surrounding.dimensions.keys():
+            if feature in self.ignore_dimensions:
+                continue
             limit = cube.check_limits(feature)
             if limit == '*':
                 continue
             for x in {'-', '+'} - {limit}:
-                tmp_cubes +=
+                tmp_cubes += self._create_temp_cube(cube, min_updates, hypercubes, feature, x)
         return tmp_cubes

     def _cubes_to_update(self, dataframe: pd.DataFrame, to_expand: Iterable[GenericCube],
-                         hypercubes: Iterable[GenericCube],
+                         hypercubes: Iterable[GenericCube], min_updates: Iterable[MinUpdate]) \
             -> Iterable[tuple[GenericCube, Expansion]]:
         results = [(hypercube, self._best_cube(dataframe, hypercube, self._create_temp_cubes(
-            hypercube,
+            hypercube, min_updates, hypercubes))) for hypercube in to_expand]
         return sorted([result for result in results if result[1] is not None], key=lambda x: x[1].distance)

     def _expand_or_create(self, cube: GenericCube, expansion: Expansion, hypercubes: Iterable[GenericCube]) -> None:
@@ -103,7 +104,7 @@ class ITER(HyperCubeExtractor):
         cube.expand(expansion, hypercubes)

     @staticmethod
-    def _find_closer_sample(dataframe: pd.DataFrame, output: float | str) -> dict[str,
+    def _find_closer_sample(dataframe: pd.DataFrame, output: float | str) -> dict[str, float]:
         if isinstance(output, str):
             close_sample = dataframe[dataframe.iloc[:, -1] == output].iloc[0].to_dict()
         else:
@@ -126,36 +127,34 @@ class ITER(HyperCubeExtractor):
         return [HyperCube.cube_from_point(ITER._find_closer_sample(dataframe, point), output=self._output)
                 for point in points]

-    def _initialize(self, dataframe: pd.DataFrame) ->
+    def _initialize(self, dataframe: pd.DataFrame) -> Iterable[MinUpdate]:
         self._fake_dataframe = dataframe.copy()
-
-        min_updates = self._calculate_min_updates(
-        self.
+        self._surrounding = HyperCube.create_surrounding_cube(dataframe, output=self._output)
+        min_updates = self._calculate_min_updates()
+        self._init_hypercubes(dataframe, min_updates)
         for hypercube in self._hypercubes:
             hypercube.update(dataframe, self.predictor)
-        return
-
-    def _init_hypercubes(
-            self,
-            dataframe: pd.DataFrame,
-            min_updates: Iterable[MinUpdate],
-            surrounding: GenericCube
-    ) -> Iterable[GenericCube]:
+        return min_updates
+
+    def _init_hypercubes(self, dataframe: pd.DataFrame, min_updates: Iterable[MinUpdate]):
         while True:
             hypercubes = self._generate_starting_points(dataframe)
             for hypercube in hypercubes:
-                hypercube.expand_all(min_updates,
+                hypercube.expand_all(min_updates, self._surrounding)
+                for d in self.ignore_dimensions:
+                    hypercube[d] = self._surrounding[d]
             self.n_points = self.n_points - 1
             if not HyperCube.check_overlap(hypercubes, hypercubes):
                 break
-
+        self._hypercubes = hypercubes

-    def _iterate(self, dataframe: pd.DataFrame, hypercubes: Iterable[GenericCube],
+    def _iterate(self, dataframe: pd.DataFrame, hypercubes: Iterable[GenericCube], min_updates: Iterable[MinUpdate],
                  left_iteration: int) -> int:
+        np.random.seed(self.seed)
         iterations = 0
         to_expand = [cube for cube in hypercubes if cube.limit_count < (len(dataframe.columns) - 1) * 2]
         while (len(to_expand) > 0) and (iterations < left_iteration):
-            updates = list(self._cubes_to_update(dataframe, to_expand, hypercubes,
+            updates = list(self._cubes_to_update(dataframe, to_expand, hypercubes, min_updates))
             if len(updates) > 0:
                 self._expand_or_create(updates[0][0], updates[0][1], hypercubes)
                 iterations += 1
@@ -170,13 +169,13 @@ class ITER(HyperCubeExtractor):
                              min(overlapping_cube.get_first(feature), b) if direction == '+' else b)
         return cube.overlap(hypercubes)

-    def _extract(self, dataframe: pd.DataFrame
-
+    def _extract(self, dataframe: pd.DataFrame) -> Theory:
+        min_updates = self._initialize(dataframe)
         temp_train = dataframe.copy()
         fake = dataframe.copy()
         iterations = 0
         while temp_train.shape[0] > 0:
-            iterations += self._iterate(fake, self._hypercubes,
+            iterations += self._iterate(fake, self._hypercubes, min_updates, self.max_iterations - iterations)
             if (iterations >= self.max_iterations) or (not self.fill_gaps):
                 break
             temp_train = temp_train.iloc[[p is None for p in self.predict(temp_train.iloc[:, :-1])]]
@@ -188,9 +187,9 @@ class ITER(HyperCubeExtractor):
                 if not new_cube.has_volume():
                     break
                 new_cube = HyperCube.cube_from_point(point, self._output)
-                new_cube.expand_all(
+                new_cube.expand_all(min_updates, self._surrounding, ratio)
                 overlap = new_cube.overlap(self._hypercubes)
                 ratio *= 2
             if new_cube.has_volume():
                 self._hypercubes += [new_cube]
-        return self._create_theory(dataframe
+        return self._create_theory(dataframe)
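The headline change in this file is the new fairness hook: make_fair(features) resets n_points and appends the given features to ignore_dimensions, and _init_hypercubes then stretches every starting cube to the full surrounding interval on those dimensions, so no extracted rule can condition on them. A minimal usage sketch (the dataset and hyper-parameter values are illustrative, not defaults taken from this diff):

    import pandas as pd
    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier
    from psyke.extraction.hypercubic.iter import ITER
    from psyke.utils import Target

    iris = load_iris(as_frame=True)
    train = pd.concat([iris.data, iris.target], axis=1)
    predictor = RandomForestClassifier().fit(iris.data, iris.target)
    # Hypothetical hyper-parameters; only the signature comes from this diff.
    extractor = ITER(predictor, min_update=0.1, n_points=1, max_iterations=600,
                     min_examples=100, threshold=0.1, fill_gaps=True,
                     ignore_dimensions=None, normalization=None,
                     output=Target.CLASSIFICATION)
    extractor.make_fair(['sepal width (cm)'])  # this dimension is never split on
    theory = extractor.extract(train)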
psyke/extraction/hypercubic/strategy.py
CHANGED

@@ -1,16 +1,20 @@
 from __future__ import annotations

 from functools import reduce
-from
+from collections.abc import Iterable


 class Strategy:
-    def __init__(self):
-        self._partitions =
+    def __init__(self, partitions = None):
+        self._partitions = partitions
+        self._no_features = []

     def get(self, feature: str) -> int:
         raise NotImplementedError

+    def make_fair(self, features: Iterable[str]):
+        self._no_features = features
+
     def partition_number(self, features: Iterable[str]) -> int:
         return reduce(lambda x, y: x * y, map(self.get, features), 1)

@@ -29,23 +33,23 @@ class Strategy:

 class FixedStrategy(Strategy):
     def __init__(self, partitions: int = 2):
-        super().__init__()
-        self._partitions = partitions
+        super().__init__(partitions)

     def get(self, feature: str) -> int:
-        return self._partitions
+        return 1 if feature in self._no_features else self._partitions

     def __str__(self):
         return "Fixed ({})".format(super().__str__())


 class AdaptiveStrategy(Strategy):
-    def __init__(self, features: Iterable[str], partitions: Iterable[tuple[float, float]] | None = None):
-        super().__init__()
+    def __init__(self, features: Iterable[(str, float)], partitions: Iterable[tuple[float, float]] | None = None):
+        super().__init__(partitions if partitions is not None else [(0.33, 2), (0.67, 3)])
         self.features = features
-        self._partitions = partitions if partitions is not None else [(0.33, 2), (0.67, 3)]

     def get(self, feature: str) -> int:
+        if feature in self._no_features:
+            return 1
         importance = next(filter(lambda t: t[0] == feature, self.features))[1]
         n = 1
         for (imp, part) in self._partitions:
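Because get now returns 1 for excluded features, partition_number (the product of the per-feature partition counts) shrinks accordingly. A small sketch of the intended arithmetic, using the classes above:

    from psyke.extraction.hypercubic.strategy import FixedStrategy

    strategy = FixedStrategy(3)
    strategy.partition_number(['x', 'y'])  # 3 * 3 = 9 grid cells
    strategy.make_fair(['y'])
    strategy.partition_number(['x', 'y'])  # 3 * 1 = 3: 'y' is never partitioned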
psyke/extraction/real/__init__.py
CHANGED

@@ -1,6 +1,6 @@
 from functools import lru_cache
+from psyke.extraction import PedagogicalExtractor
 from psyke.extraction.real.utils import Rule, IndexedRuleSet
-from psyke import PedagogicalExtractor
 from psyke.schema import DiscreteFeature
 from psyke.utils.dataframe import HashableDataFrame
 from psyke.utils.logic import create_term, create_head, create_variable_list
@@ -15,13 +15,12 @@ class REAL(PedagogicalExtractor):
     """
     Explanator implementing Rule Extraction As Learning (REAL) algorithm, doi:10.1016/B978-1-55860-335-6.50013-1.
     The algorithm is sensible to features' order in the provided dataset during extraction.
-    To make it reproducible the features are internally sorted (alphabetically).
     """

     def __init__(self, predictor, discretization: Iterable[DiscreteFeature]):
         super().__init__(predictor, discretization)
+        self._ignore_feature = []
         self._ruleset: IndexedRuleSet = IndexedRuleSet()
-        self._output_mapping = {}

     @property
     def n_rules(self):
@@ -31,7 +30,7 @@ class REAL(PedagogicalExtractor):
         new_rule = self._rule_from_example(sample)
         return any([new_rule in rule for rule in rules])

-    def
+    def _body(self, variables: dict[str, Var], rule: Rule) -> list[Struct]:
         result = []
         for predicates, truth_value in zip(rule.to_lists(), [True, False]):
             for predicate in predicates:
@@ -40,28 +39,25 @@ class REAL(PedagogicalExtractor):
         return result

     def _create_clause(self, dataset: pd.DataFrame, variables: dict[str, Var], key: int, rule: Rule) -> Clause:
-
-            sorted(list(variables.values())),
-            str(sorted(list(set(dataset.iloc[:, -1])))[key]))
-        return clause(head, self._create_body(variables, rule))
+        return clause(create_head(dataset.columns[-1], list(variables.values()), key), self._body(variables, rule))

     def _create_new_rule(self, sample: pd.Series) -> Rule:
         rule = self._rule_from_example(sample)
         return self._generalise(rule, sample)

     def _create_ruleset(self, dataset: pd.DataFrame) -> IndexedRuleSet:
-        ruleset = IndexedRuleSet.create_indexed_ruleset(dataset)
-        for
+        ruleset = IndexedRuleSet.create_indexed_ruleset(sorted(set(dataset.iloc[:, -1])))
+        for _, sample in dataset.iloc[:, :-1].iterrows():
             prediction = list(self.predictor.predict(sample.to_frame().transpose()))[0]
-            rules = ruleset.get(
+            rules = ruleset.get(prediction)
             if not self._covers(sample, rules):
                 rules.append(self._create_new_rule(sample))
         return ruleset.optimize()

-    def _create_theory(self, dataset: pd.DataFrame
+    def _create_theory(self, dataset: pd.DataFrame) -> MutableTheory:
         theory = mutable_theory()
-        for key, rule in
-            variables = create_variable_list(self.discretization
+        for key, rule in self._ruleset.flatten():
+            variables = create_variable_list(self.discretization)
             theory.assertZ(self._create_clause(dataset, variables, key, rule))
         return theory

@@ -92,16 +88,22 @@ class REAL(PedagogicalExtractor):
         return self._create_ruleset(dataset)

     def _internal_predict(self, sample: pd.Series):
-        x = [index for index, rule in self._ruleset.flatten() if
-
-        return reverse_mapping[x[0]] if len(x) > 0 else None
+        x = [index for index, rule in self._ruleset.flatten() if self._rule_from_example(sample) in rule]
+        return x[0] if x else None

-
-
+    def make_fair(self, features: Iterable[str]):
+        self._ignore_feature = [list(i.admissible_values.keys()) for i in self.discretization if i.name in features] \
+            if self.discretization else [features]
+        self._ignore_feature = [feature for features in self._ignore_feature for feature in features]
+        self._get_or_set.cache_clear()
+
+    def _rule_from_example(self, sample: pd.Series) -> Rule:
         true_predicates, false_predicates = [], []
         for feature, value in sample.items():
+            if feature in self._ignore_feature:
+                continue
             true_predicates.append(str(feature)) if value == 1 else false_predicates.append(str(feature))
-        return Rule(
+        return Rule(true_predicates, false_predicates)

     def _subset(self, samples: pd.DataFrame, predicate: str) -> (pd.DataFrame, bool):
         samples_0 = samples.copy()
@@ -111,16 +113,9 @@ class REAL(PedagogicalExtractor):
         samples_all = samples_0.append(samples_1)
         return samples_all, len(set(self.predictor.predict(samples_all))) == 1

-    def _extract(self, dataframe: pd.DataFrame
-        # Order the dataset by column to preserve reproducibility.
-        dataframe = dataframe.sort_values(by=list(dataframe.columns.values), ascending=False)
-        # Always perform output mapping in the same (sorted) way to preserve reproducibility.
-        if mapping is None:
-            self._output_mapping = {value: index for index, value in enumerate(sorted(set(dataframe.iloc[:, -1])))}
-        else:
-            self._output_mapping = {value: index for index, value in enumerate(sorted(set(mapping[dataframe.iloc[:, -1]])))}
+    def _extract(self, dataframe: pd.DataFrame) -> Theory:
         self._ruleset = self._get_or_set(HashableDataFrame(dataframe))
-        return self._create_theory(dataframe
+        return self._create_theory(dataframe)

     def _predict(self, dataframe) -> Iterable:
         return np.array([self._internal_predict(data.transpose()) for _, data in dataframe.iterrows()])
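REAL's make_fair first translates protected feature names into the one-hot columns produced by discretisation so that _rule_from_example can skip them; the second comprehension flattens the per-feature lists, and the lru_cache behind _get_or_set is cleared so a previously memoised ruleset is not reused. A sketch of the flattening step, with hypothetical column names:

    ignore = [['age_0', 'age_1', 'age_2']]             # hypothetical one-hot columns for 'age'
    ignore = [col for cols in ignore for col in cols]  # -> ['age_0', 'age_1', 'age_2']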
psyke/extraction/real/utils.py
CHANGED

@@ -49,5 +49,5 @@ class IndexedRuleSet(dict[int, list[Rule]]):
         ]

     @staticmethod
-    def create_indexed_ruleset(
-        return IndexedRuleSet({
+    def create_indexed_ruleset(indices: Iterable) -> IndexedRuleSet:
+        return IndexedRuleSet({i: [] for i in indices})
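With the new signature the caller supplies the output labels directly instead of the whole dataset, e.g. (assuming a dataframe df whose last column is the target):

    ruleset = IndexedRuleSet.create_indexed_ruleset(sorted(set(df.iloc[:, -1])))
    # -> one empty rule list per class label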
psyke/extraction/trepan/__init__.py
CHANGED

@@ -1,6 +1,7 @@
 import numpy as np
+from psyke.extraction import PedagogicalExtractor
 from psyke.extraction.trepan.utils import Node, Split, SplitLogic
-from psyke import DiscreteFeature
+from psyke import DiscreteFeature
 from psyke.utils.logic import create_term, create_variable_list, create_head
 from psyke.utils.sorted import SortedList
 from tuprolog.core import Var, Struct, clause
@@ -14,11 +15,17 @@ class Trepan(PedagogicalExtractor):
     def __init__(self, predictor, discretization: Iterable[DiscreteFeature], min_examples: int = 0, max_depth: int = 3,
                  split_logic: SplitLogic = SplitLogic.DEFAULT):
         super().__init__(predictor, discretization)
+        self._ignore_feature = []
         self.min_examples = min_examples
         self.max_depth = max_depth
         self.split_logic = split_logic
         self._root: Node

+    def make_fair(self, features: Iterable[str]):
+        self._ignore_feature = [list(i.admissible_values.keys()) for i in self.discretization if i.name in features] \
+            if self.discretization else [features]
+        self._ignore_feature = [feature for features in self._ignore_feature for feature in features]
+
     @property
     def n_rules(self):
         return sum(1 for _ in self._root)
@@ -28,7 +35,7 @@ class Trepan(PedagogicalExtractor):
             raise NotImplementedError()
         if node.n_classes == 1:
             return None
-        splits =
+        splits = self._create_splits(node, names)
         return None if len(splits) == 0 or splits[0].children[0].depth > self.max_depth else splits[0].children

     def _compact(self):
@@ -54,28 +61,26 @@ class Trepan(PedagogicalExtractor):
     def _create_split(node: Node, column: str) -> Union[Split, None]:
         true_examples = Trepan._create_samples(node, column, 1.0)
         false_examples = Trepan._create_samples(node, column, 0.0)
-
-
-        true_node = Node(true_examples, node.n_examples,
+        true_constraints = list(node.constraints) + [(column, 1.0)]
+        false_constraints = list(node.constraints) + [(column, 0.0)]
+        true_node = Node(true_examples, node.n_examples, true_constraints, depth=node.depth + 1) \
             if true_examples.shape[0] > 0 else None
-        false_node = Node(false_examples, node.n_examples,
+        false_node = Node(false_examples, node.n_examples, false_constraints, depth=node.depth + 1) \
             if false_examples.shape[0] > 0 else None
         return None if true_node is None or false_node is None else Split(node, (true_node, false_node))

-
-
-
-
-        if
-
-        if split is not None:
-            splits.add(split)
+    def _create_splits(self, node: Node, names: Iterable[str]) -> SortedList[Split]:
+        splits, constraints = Trepan._init_splits(node)
+        for column in [column for column in names if column not in list(constraints) + self._ignore_feature]:
+            split = Trepan._create_split(node, column)
+            if split is not None:
+                splits.add(split)
         return splits

-    def _create_theory(self, name: str
+    def _create_theory(self, name: str) -> MutableTheory:
         theory = mutable_theory()
         for node in self._root:
-            variables = create_variable_list(self.discretization
+            variables = create_variable_list(self.discretization)
             theory.assertZ(
                 clause(
                     create_head(name, list(variables.values()), str(node.dominant)),
@@ -116,7 +121,7 @@ class Trepan(PedagogicalExtractor):
                     continue
             if not skip:
                 return Trepan._internal_predict(x, child, categories)
-            return node.dominant
+        return node.dominant

     def _optimize(self) -> None:
         n, nodes = 0, [self._root]
@@ -135,7 +140,7 @@ class Trepan(PedagogicalExtractor):
                 nodes.append(child)
         return len(to_remove)

-    def _extract(self, dataframe: pd.DataFrame
+    def _extract(self, dataframe: pd.DataFrame) -> Theory:
         queue = self._init(dataframe)
         while len(queue) > 0:
             node = queue.pop()
@@ -148,7 +153,7 @@ class Trepan(PedagogicalExtractor):
             queue.add_all(best)
             node.children += list(best)
         self._optimize()
-        return self._create_theory(dataframe.columns[-1]
+        return self._create_theory(dataframe.columns[-1])

     def _predict(self, dataframe: pd.DataFrame) -> Iterable:
         return np.array(
psyke/genetic/__init__.py
File without changes

psyke/genetic/fgin/__init__.py
ADDED

@@ -0,0 +1,74 @@
+import numpy as np
+import pandas as pd
+
+from psyke import Target
+from psyke.genetic.gin import GIn
+
+import skfuzzy as skf
+
+
+class FGIn(GIn):
+
+    def __init__(self, train, valid, features, sigmas, slices, min_rules=1, poly=1, alpha=0.5, indpb=0.5, tournsize=3,
+                 metric='R2', output=Target.REGRESSION, warm=False):
+        super().__init__(train, valid, features, sigmas, slices, min_rules, poly, alpha, indpb, tournsize,
+                         metric, output, warm)
+        self.feature_to_idx = {f: i for i, f in enumerate(self.X.columns)}
+
+    def _evaluate(self, individual=None):
+        y_pred, valid_regions = self.__predict(individual or self.best, self.X if self.valid is None else self.valid[0])
+        if valid_regions < self.min_rules:
+            return -9999,
+        return self._score(self.y if self.valid is None else self.valid[1], y_pred),
+
+    @staticmethod
+    def __generate_membership(var, domain, thresholds, shape='tri'):
+        th = [var.min()] + [min(max(t, var.min()), var.max()) for t in thresholds] + [var.max()]
+
+        if shape == 'tri':
+            mid = [(x1 + x2) / 2 for x1, x2 in zip(th[:-1], th[1:])]
+            return [skf.trapmf(domain, [domain.min()] * 2 + mid[:2])] + \
+                   [skf.trimf(domain, [x1, x2, x3]) for x1, x2, x3 in zip(mid[:-2], mid[1:-1], mid[2:])] + \
+                   [skf.trapmf(domain, mid[-2:] + [domain.max()] * 2)]
+        if shape == 'trap':
+            beg = [None, domain.min()] + [(3 * x1 + x2) / 4 for x1, x2 in zip(th[1:-1], th[2:])] + [domain.max()]
+            end = [domain.min()] + [(x1 + 3 * x2) / 4 for x1, x2 in zip(th[:-2], th[1:-1])] + [domain.max()]
+            return [skf.trapmf(domain, [end[i - 1], beg[i], end[i], beg[i + 1]]) for i in range(1, len(th))]
+        raise ValueError('Supported shape values are only \'tri\' and \'trap\'')
+
+    @staticmethod
+    def __extend_domain(x, q_low=0.05, q_high=0.95, p=0.05, k_sigma=2.0, abs_min_margin=0.0):
+        ql, qh = np.quantile(x, [q_low, q_high])
+        margin = max(p * (qh - ql), k_sigma * np.std(x), abs_min_margin)
+        return np.array([ql - margin, qh + margin])
+
+    def __get_activations(self, x, functions_domains, valid_masks):
+        levels = [np.array([skf.interp_membership(domain, mf, x[index]) for mf in mfs])
+                  for mfs, domain, index in functions_domains.values()]
+        return np.prod(np.meshgrid(*levels, indexing='ij'), axis=0).ravel()[valid_masks]
+
+    def __fuzzify(self, cuts):
+        cuts = dict(zip(self.features, cuts))
+        doms = {c: FGIn.__extend_domain(self.X[c]) for c in self.features}
+        return {c: (FGIn.__generate_membership(self.X[c], doms[c], cuts[c], 'trap'), doms[c],
+                    self.feature_to_idx[c]) for c in self.features}
+
+    def __predict(self, individual=None, to_pred=None):
+        cuts = self._get_cuts(individual or self.best)
+        masks = np.array([self._region(to_pred, cuts) == r for r in range(np.prod([s + 1 for s in self.slices]))])
+        valid_masks = masks.sum(axis=1) >= 3
+
+        masks = [mask for mask in masks if mask.sum() >= 3]
+        functions_domains = self.__fuzzify(cuts)
+
+        pred = np.array([self._output_estimation(mask, to_pred) for mask in masks]).T
+        activations = np.array([self.__get_activations(x, functions_domains, valid_masks) for x in to_pred.values])
+
+        if self.output == Target.CLASSIFICATION:
+            classes, idx = np.unique(pred, return_inverse=True)
+            pred = classes[np.argmax(np.vstack([activations[:, idx == i].sum(axis=1) for i, c in enumerate(classes)]),
+                                     axis=0)]
+        else:
+            pred = (pred * activations).sum(axis=1)
+
+        return pd.DataFrame(pred, index=to_pred.index), len(masks)
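FGIn softens GIn's hard region assignment: each feature receives a trapezoidal fuzzy partition over a quantile-extended domain, per-sample activations come from skf.interp_membership, and the outer product of the per-feature activations weights each region's estimate. A self-contained sketch of that membership machinery (the domain and breakpoints are illustrative):

    import numpy as np
    import skfuzzy as skf

    domain = np.linspace(0.0, 10.0, 101)               # extended feature domain
    low = skf.trapmf(domain, [0.0, 0.0, 2.0, 4.0])     # 'low' region membership
    high = skf.trapmf(domain, [2.0, 4.0, 10.0, 10.0])  # 'high' region membership
    acts = [skf.interp_membership(domain, mf, 3.0) for mf in (low, high)]
    # acts == [0.5, 0.5]: the sample contributes to both regions; across several
    # features the activations are combined via np.meshgrid + np.prod, as in
    # __get_activations above.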