psyke 0.8.9.dev48__py3-none-any.whl → 1.0.4.dev10__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- psyke/__init__.py +112 -24
- psyke/clustering/__init__.py +4 -0
- psyke/clustering/cream/__init__.py +2 -6
- psyke/clustering/exact/__init__.py +10 -7
- psyke/clustering/utils.py +0 -1
- psyke/extraction/__init__.py +6 -2
- psyke/extraction/cart/{predictor.py → CartPredictor.py} +52 -7
- psyke/extraction/cart/FairTree.py +205 -0
- psyke/extraction/cart/FairTreePredictor.py +56 -0
- psyke/extraction/cart/__init__.py +27 -52
- psyke/extraction/hypercubic/__init__.py +58 -7
- psyke/extraction/hypercubic/creepy/__init__.py +14 -6
- psyke/extraction/hypercubic/ginger/__init__.py +100 -0
- psyke/extraction/hypercubic/gridex/__init__.py +6 -48
- psyke/extraction/hypercubic/gridrex/__init__.py +2 -2
- psyke/extraction/hypercubic/hypercube.py +33 -26
- psyke/extraction/hypercubic/iter/__init__.py +5 -0
- psyke/extraction/hypercubic/strategy.py +13 -9
- psyke/extraction/real/__init__.py +21 -22
- psyke/extraction/real/utils.py +2 -2
- psyke/extraction/trepan/__init__.py +19 -15
- psyke/genetic/__init__.py +0 -0
- psyke/genetic/fgin/__init__.py +74 -0
- psyke/genetic/gin/__init__.py +144 -0
- psyke/hypercubepredictor.py +4 -2
- psyke/tuning/pedro/__init__.py +4 -2
- psyke/utils/logic.py +4 -8
- {psyke-0.8.9.dev48.dist-info → psyke-1.0.4.dev10.dist-info}/METADATA +39 -19
- psyke-1.0.4.dev10.dist-info/RECORD +46 -0
- {psyke-0.8.9.dev48.dist-info → psyke-1.0.4.dev10.dist-info}/WHEEL +1 -1
- {psyke-0.8.9.dev48.dist-info → psyke-1.0.4.dev10.dist-info/licenses}/LICENSE +2 -1
- psyke-0.8.9.dev48.dist-info/RECORD +0 -40
- {psyke-0.8.9.dev48.dist-info → psyke-1.0.4.dev10.dist-info}/top_level.txt +0 -0
psyke/extraction/cart/FairTreePredictor.py (new file)
@@ -0,0 +1,56 @@
+import copy
+from typing import Union, Any
+
+from psyke.extraction.cart import FairTreeClassifier, FairTreeRegressor, LeafSequence, LeafConstraints
+from psyke.extraction.cart.CartPredictor import CartPredictor
+from psyke.schema import LessThan, GreaterThan, SchemaException, Value
+
+
+class FairTreePredictor(CartPredictor):
+    """
+    A wrapper for fair decision and regression trees of psyke.
+    """
+
+    def __init__(self, predictor: Union[FairTreeClassifier, FairTreeRegressor] = FairTreeClassifier(),
+                 discretization=None, normalization=None):
+        super().__init__(predictor, discretization, normalization)
+
+    def __iter__(self) -> LeafSequence:
+        leaves = [node for node in self.recurse(self._predictor.root, {})]
+        return (leaf for leaf in leaves)
+
+    @staticmethod
+    def merge_constraints(constraints: LeafConstraints, constraint: Value, feature: str):
+        if feature in constraints:
+            try:
+                constraints[feature][-1] *= constraint
+            except SchemaException:
+                constraints[feature].append(constraint)
+        else:
+            constraints[feature] = [constraint]
+        return constraints
+
+    def recurse(self, node, constraints) -> Union[LeafSequence, tuple[LeafConstraints, Any]]:
+        if node.is_leaf_node():
+            return constraints, node.value
+
+        feature = node.feature
+        threshold = node.threshold if self.normalization is None else \
+            (node.threshold * self.normalization[feature][1] + self.normalization[feature][0])
+
+        left = self.recurse(node.left, self.merge_constraints(copy.deepcopy(constraints), LessThan(threshold), feature))
+        right = self.recurse(node.right, self.merge_constraints(copy.deepcopy(constraints),
+                                                                GreaterThan(threshold), feature))
+        return (left if isinstance(left, list) else [left]) + (right if isinstance(right, list) else [right])
+
+    @property
+    def predictor(self) -> Union[FairTreeClassifier, FairTreeRegressor]:
+        return self._predictor
+
+    @property
+    def n_leaves(self) -> int:
+        return self._predictor.n_leaves
+
+    @predictor.setter
+    def predictor(self, predictor: Union[FairTreeClassifier, FairTreeRegressor]):
+        self._predictor = predictor
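Note: `recurse` deep-copies the running constraint map at every split and merges same-feature constraints through the schema's `*` operator (an interval intersection), appending a fresh constraint only when `SchemaException` signals the two cannot be combined. Below is a minimal, self-contained sketch of this path-constraint accumulation; the `Node` class and the `(op, threshold)` tuples are illustrative stand-ins, not psyke API.

```python
import copy
from dataclasses import dataclass
from typing import Any, Optional

@dataclass
class Node:
    feature: Optional[str] = None
    threshold: Optional[float] = None
    left: Optional["Node"] = None
    right: Optional["Node"] = None
    value: Any = None  # set only on leaves

    def is_leaf(self) -> bool:
        return self.value is not None

def merge(constraints: dict, constraint: tuple, feature: str) -> dict:
    # psyke tightens the last same-feature constraint in place and appends on
    # SchemaException; this sketch simply appends every constraint.
    constraints.setdefault(feature, []).append(constraint)
    return constraints

def collect_leaves(node: Node, constraints: dict) -> list:
    """Return [(path_constraints, prediction)] for every leaf below `node`."""
    if node.is_leaf():
        return [(constraints, node.value)]
    left = collect_leaves(node.left, merge(copy.deepcopy(constraints), ('<', node.threshold), node.feature))
    right = collect_leaves(node.right, merge(copy.deepcopy(constraints), ('>', node.threshold), node.feature))
    return left + right

tree = Node('age', 30.0,
            left=Node(value='deny'),
            right=Node('income', 50.0, left=Node(value='deny'), right=Node(value='grant')))
for path, label in collect_leaves(tree, {}):
    print(path, '->', label)
```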
psyke/extraction/cart/__init__.py
@@ -3,78 +3,53 @@ from abc import ABC
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 
 from psyke.extraction import PedagogicalExtractor
-from psyke.extraction.cart.predictor import CartPredictor, LeafConstraints, LeafSequence
 from psyke import get_default_random_seed
-from psyke.
-from psyke.
-from tuprolog.
-from
-from typing import Iterable
+from psyke.extraction.cart.FairTree import FairTreeClassifier, FairTreeRegressor
+from psyke.schema import DiscreteFeature, Value
+from tuprolog.theory import Theory
+from typing import Iterable, Any
 import pandas as pd
 
 
 TREE_SEED = get_default_random_seed()
 
+LeafConstraints = dict[str, list[Value]]
+LeafSequence = Iterable[tuple[LeafConstraints, Any]]
+
 
 class Cart(PedagogicalExtractor, ABC):
 
     def __init__(self, predictor, max_depth: int = 3, max_leaves: int = None, max_features=None,
                  discretization: Iterable[DiscreteFeature] = None,
                  normalization=None, simplify: bool = True):
+        from psyke.extraction.cart.CartPredictor import CartPredictor
+
         super().__init__(predictor, discretization, normalization)
-        self.
+        self.is_fair = None
+        self._cart_predictor = CartPredictor(discretization=discretization, normalization=normalization)
         self.depth = max_depth
         self.leaves = max_leaves
         self.max_features = max_features
         self._simplify = simplify
 
-    def _create_body(self, variables: dict[str, Var], conditions: LeafConstraints) -> Iterable[Struct]:
-        results = []
-        for feature_name, cond_list in conditions.items():
-            for condition in cond_list:
-                features = [d for d in self.discretization if feature_name in d.admissible_values]
-                feature: DiscreteFeature = features[0] if len(features) > 0 else None
-                results.append(create_term(variables[feature_name], condition) if feature is None else
-                               create_term(variables[feature.name],
-                                           feature.admissible_values[feature_name],
-                                           isinstance(condition, GreaterThan)))
-        return results
-
-    @staticmethod
-    def _simplify_nodes(nodes: list) -> Iterable:
-        simplified = [nodes.pop(0)]
-        while len(nodes) > 0:
-            first_node = nodes[0][0]
-            for k, conditions in first_node.items():
-                for condition in conditions:
-                    if all(k in node[0] and condition in node[0][k] for node in nodes):
-                        [node[0][k].remove(condition) for node in nodes]
-            simplified.append(nodes.pop(0))
-        return [({k: v for k, v in rule.items() if v != []}, prediction) for rule, prediction in simplified]
-
-    def _create_theory(self, data: pd.DataFrame) -> Theory:
-        new_theory = mutable_theory()
-        nodes = [node for node in self._cart_predictor]
-        nodes = Cart._simplify_nodes(nodes) if self._simplify else nodes
-        for (constraints, prediction) in nodes:
-            if self.normalization is not None and data.columns[-1] in self.normalization:
-                m, s = self.normalization[data.columns[-1]]
-                prediction = prediction * s + m
-            variables = create_variable_list(self.discretization, data)
-            new_theory.assertZ(
-                clause(
-                    create_head(data.columns[-1], list(variables.values()), prediction),
-                    self._create_body(variables, constraints)
-                )
-            )
-        return new_theory
-
     def _extract(self, data: pd.DataFrame) -> Theory:
-
-
-
+        from psyke.extraction.cart.FairTreePredictor import FairTreePredictor
+
+        if self.is_fair:
+            self._cart_predictor = FairTreePredictor(discretization=self.discretization,
+                                                     normalization=self.normalization)
+            fair_tree = FairTreeClassifier if isinstance(data.iloc[0, -1], str) else FairTreeRegressor
+            self._cart_predictor.predictor = fair_tree(max_depth=self.depth, max_leaves=self.leaves,
+                                                       protected_attr=self.is_fair)
+        else:
+            tree = DecisionTreeClassifier if isinstance(data.iloc[0, -1], str) else DecisionTreeRegressor
+            self._cart_predictor.predictor = tree(random_state=TREE_SEED, max_depth=self.depth,
+                                                  max_leaf_nodes=self.leaves, max_features=self.max_features)
         self._cart_predictor.predictor.fit(data.iloc[:, :-1], data.iloc[:, -1])
-        return self.
+        return self._cart_predictor.create_theory(data, self._simplify)
+
+    def make_fair(self, features: Iterable[str]):
+        self.is_fair = features
 
     def _predict(self, dataframe: pd.DataFrame) -> Iterable:
        return self._cart_predictor.predict(dataframe)
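Note: `make_fair(features)` stores the protected features in `is_fair`, which then doubles as the fairness switch (any truthy iterable) and as the `protected_attr` argument for the fair tree. `_extract` picks the tree family by sniffing the target column: a string-valued first label yields a classifier, anything else a regressor. A minimal sketch of that dispatch outside psyke (toy data, sklearn only):

```python
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

data = pd.DataFrame({'x': [0.1, 0.4, 0.8, 0.9], 'y': ['a', 'a', 'b', 'b']})
# String-valued target -> classification tree; numeric target -> regression tree.
tree_class = DecisionTreeClassifier if isinstance(data.iloc[0, -1], str) else DecisionTreeRegressor
model = tree_class(max_depth=3, random_state=0)
model.fit(data.iloc[:, :-1], data.iloc[:, -1])
print(type(model).__name__, model.predict(data.iloc[:, :-1]))
```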
psyke/extraction/hypercubic/__init__.py
@@ -1,7 +1,9 @@
 from __future__ import annotations
 
-import math
 from abc import ABC
+from collections.abc import Iterable
+from itertools import combinations
+
 import numpy as np
 import pandas as pd
 from sklearn.base import ClassifierMixin
@@ -13,7 +15,7 @@ from psyke.extraction import PedagogicalExtractor
 from psyke.extraction.hypercubic.hypercube import HyperCube, RegressionCube, ClassificationCube, ClosedCube, Point, \
     GenericCube
 from psyke.hypercubepredictor import HyperCubePredictor
-from psyke.schema import
+from psyke.schema import Value
 from psyke.utils.logic import create_variable_list, create_head, to_var, Simplifier
 from psyke.utils import Target
 from psyke.extraction.hypercubic.strategy import Strategy, FixedStrategy
@@ -24,19 +26,62 @@ class HyperCubeExtractor(HyperCubePredictor, PedagogicalExtractor, ABC):
         HyperCubePredictor.__init__(self, output=output, normalization=normalization)
         PedagogicalExtractor.__init__(self, predictor, discretization=discretization, normalization=normalization)
         self._default_surrounding_cube = False
+        self.threshold = None
 
-    def _default_cube(self) -> HyperCube | RegressionCube | ClassificationCube:
+    def _default_cube(self, dimensions=None) -> HyperCube | RegressionCube | ClassificationCube:
         if self._output == Target.CONSTANT:
-            return HyperCube()
+            return HyperCube(dimensions)
         if self._output == Target.REGRESSION:
-            return RegressionCube()
-        return ClassificationCube()
+            return RegressionCube(dimensions)
+        return ClassificationCube(dimensions)
+
+    @staticmethod
+    def _find_couples(to_split: Iterable[HyperCube], not_in_cache: set[HyperCube],
+                      adjacent_cache: dict[tuple[HyperCube, HyperCube], str | None]) -> \
+            Iterable[tuple[HyperCube, HyperCube, str]]:
+
+        for cube1, cube2 in combinations(to_split, 2):
+            key = (cube1, cube2) if id(cube1) < id(cube2) else (cube2, cube1)
+
+            if (cube1 in not_in_cache) or (cube2 in not_in_cache):
+                adjacent_cache[key] = cube1.is_adjacent(cube2)
+            feature = adjacent_cache.get(key)
+            if feature is not None:
+                yield cube1, cube2, feature
+
+    def _evaluate_merge(self, not_in_cache: Iterable[HyperCube], dataframe: pd.DataFrame, feature: str,
+                        cube: HyperCube, other_cube: HyperCube,
+                        merge_cache: dict[tuple[HyperCube, HyperCube], HyperCube | None]) -> bool:
+        if (cube in not_in_cache) or (other_cube in not_in_cache):
+            merged_cube = cube.merge_along_dimension(other_cube, feature)
+            merged_cube.update(dataframe, self.predictor)
+            merge_cache[(cube, other_cube)] = merged_cube
+        return cube.output == other_cube.output if self._output == Target.CLASSIFICATION else \
+            merge_cache[(cube, other_cube)].diversity < self.threshold
 
     def _sort_cubes(self):
         cubes = [(cube.diversity, i, cube) for i, cube in enumerate(self._hypercubes)]
         cubes.sort()
         self._hypercubes = [cube[2] for cube in cubes]
 
+    def _merge(self, to_split: list[HyperCube], dataframe: pd.DataFrame) -> Iterable[HyperCube]:
+        not_in_cache = set(to_split)
+        adjacent_cache = {}
+        merge_cache = {}
+        while True:
+            to_merge = [([cube, other_cube], merge_cache[(cube, other_cube)]) for cube, other_cube, feature in
+                        HyperCubeExtractor._find_couples(to_split, not_in_cache, adjacent_cache) if
+                        self._evaluate_merge(not_in_cache, dataframe, feature, cube, other_cube, merge_cache)]
+
+            if len(to_merge) == 0:
+                break
+            best = min(to_merge, key=lambda c: c[1].diversity)
+            for cube in best[0]:
+                to_split.remove(cube)
+            to_split.append(best[1])
+            not_in_cache = [best[1]]
+        return to_split
+
     def extract(self, dataframe: pd.DataFrame) -> Theory:
         theory = PedagogicalExtractor.extract(self, dataframe)
         self._surrounding = HyperCube.create_surrounding_cube(dataframe, output=self._output)
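Note: this `_merge` machinery, hoisted from GridEx into the shared base class, greedily fuses the adjacent pair of cubes whose merged result has the lowest diversity, caching adjacency checks (keyed by an `id()`-ordered pair so each unordered couple is stored once) and merge results, and recomputing only pairs that involve the freshly created cube. A reduced, self-contained sketch of the greedy loop over 1-D intervals (all names illustrative, no caching shown):

```python
from itertools import combinations

def adjacent(a, b):
    return a[1] == b[0] or b[1] == a[0]   # 1-D stand-in for HyperCube.is_adjacent

def merged(a, b):
    return (min(a[0], b[0]), max(a[1], b[1]))

def diversity(c):
    return c[1] - c[0]                    # stand-in for the prediction spread

def merge_all(cubes, threshold):
    while True:
        # Candidate merges: adjacent pairs whose fused cube stays under threshold.
        candidates = [(merged(a, b), a, b) for a, b in combinations(cubes, 2)
                      if adjacent(a, b) and diversity(merged(a, b)) < threshold]
        if not candidates:
            return cubes
        # Greedily apply the merge with the lowest resulting diversity.
        best, a, b = min(candidates, key=lambda t: diversity(t[0]))
        cubes = [c for c in cubes if c not in (a, b)] + [best]

print(merge_all([(0, 1), (1, 2), (2, 5)], threshold=2.5))  # (0,1)+(1,2) fuse first
```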
psyke/extraction/hypercubic/__init__.py
@@ -209,10 +254,16 @@ class FeatureRanker:
 
 
 class Grid:
-    def __init__(self, iterations: int = 1, strategy: Strategy |
+    def __init__(self, iterations: int = 1, strategy: Strategy | Iterable[Strategy] = FixedStrategy()):
         self.iterations = iterations
         self.strategy = strategy
 
+    def make_fair(self, features: Iterable[str]):
+        if isinstance(self.strategy, Strategy):
+            self.strategy.make_fair(features)
+        elif isinstance(self.strategy, Iterable):
+            [strategy.make_fair(features) for strategy in self.strategy]
+
     def get(self, feature: str, depth: int) -> int:
         if isinstance(self.strategy, list):
             return self.strategy[depth].get(feature)
psyke/extraction/hypercubic/creepy/__init__.py
@@ -1,7 +1,8 @@
 from __future__ import annotations
 
-from collections import Iterable
-import
+from collections.abc import Iterable
+from typing import Callable, Any
+
 import pandas as pd
 from sklearn.base import ClassifierMixin
 from tuprolog.theory import Theory
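Worth noting: `collections.Iterable` was a deprecated alias since Python 3.3 and was removed outright in Python 3.10, so this one-line switch to `collections.abc.Iterable` is what keeps the module importable on current interpreters.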
psyke/extraction/hypercubic/creepy/__init__.py
@@ -16,16 +17,23 @@ class CReEPy(HyperCubeExtractor):
     Explanator implementing CReEPy algorithm.
     """
 
-
-
-
-
+    ClusteringType = Callable[[int, float, Target, int, Any, Any, int], HyperCubeClustering]
+
+    def __init__(self, predictor, clustering: ClusteringType = Clustering.exact, depth: int = 3,
+                 error_threshold: float = 0.1, output: Target = Target.CONSTANT, gauss_components: int = 5,
+                 ranks: Iterable[(str, float)] = tuple(), ignore_threshold: float = 0.0, discretization=None,
+                 normalization=None, seed: int = get_default_random_seed()):
         super().__init__(predictor, Target.CLASSIFICATION if isinstance(predictor, ClassifierMixin) else output,
                          discretization, normalization)
         self.clustering = clustering(depth, error_threshold, self._output, gauss_components, discretization,
                                      normalization, seed)
         self._default_surrounding_cube = True
         self._dimensions_to_ignore = set([dimension for dimension, relevance in ranks if relevance < ignore_threshold])
+        self._protected_features = []
+
+    def make_fair(self, features: Iterable[str]):
+        self.clustering.make_fair(features)
+        self._dimensions_to_ignore.update(features)
 
     def _extract(self, dataframe: pd.DataFrame) -> Theory:
         if not isinstance(self.clustering, HyperCubeClustering):
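Note: CReEPy already ignored low-relevance dimensions via `ranks` and `ignore_threshold`; `make_fair` now feeds protected features into that same set and forwards them to the clusterer. A tiny sketch of how the set is assembled (feature names and scores are illustrative):

```python
# Build the ignored-dimension set from feature ranks, then extend it the way
# make_fair(['gender']) would.
ranks = [('age', 0.90), ('zip', 0.01), ('income', 0.70)]
ignore_threshold = 0.05
dimensions_to_ignore = {dim for dim, relevance in ranks if relevance < ignore_threshold}
dimensions_to_ignore.update(['gender'])
print(dimensions_to_ignore)  # {'zip', 'gender'} (set order may vary)
```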
psyke/extraction/hypercubic/ginger/__init__.py (new file)
@@ -0,0 +1,100 @@
+import itertools
+from typing import Iterable
+
+import numpy as np
+import pandas as pd
+from sklearn.base import ClassifierMixin
+from sklearn.preprocessing import PolynomialFeatures
+from tuprolog.theory import Theory
+
+from psyke import get_default_random_seed, Target
+from psyke.extraction.hypercubic import HyperCubeExtractor, HyperCube, RegressionCube
+
+from deap import base, creator
+
+from psyke.genetic.gin import GIn
+
+
+class GInGER(HyperCubeExtractor):
+    """
+    Explanator implementing GInGER algorithm.
+    """
+
+    def __init__(self, predictor, features, sigmas, max_slices, min_rules=1, max_poly=1, alpha=0.5, indpb=0.5,
+                 tournsize=3, metric='R2', n_gen=50, n_pop=50, threshold=None, valid=None,
+                 output: Target = Target.REGRESSION, normalization=None, seed: int = get_default_random_seed()):
+        super().__init__(predictor, output=Target.CLASSIFICATION if isinstance(predictor, ClassifierMixin) else output,
+                         normalization=normalization)
+        self.threshold = threshold
+        np.random.seed(seed)
+
+        self.features = features
+        self.max_features = len(features)
+        self.sigmas = sigmas
+        self.max_slices = max_slices
+        self.min_rules = min_rules
+        self.poly = max_poly
+        self.trained_poly = None
+
+        self.alpha = alpha
+        self.indpb = indpb
+        self.tournsize = tournsize
+        self.metric = metric
+
+        self.n_gen = n_gen
+        self.n_pop = n_pop
+        self.valid = valid
+
+        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
+        creator.create("Individual", list, fitness=creator.FitnessMax)
+
+    def __poly_names(self):
+        return [''.join(['' if pp == 0 else f'{n} * ' if pp == 1 else f'{n}**{pp} * '
+                         for pp, n in zip(p, self.trained_poly.feature_names_in_)])[:-3]
+                for p in self.trained_poly.powers_]
+
+    def _predict(self, dataframe: pd.DataFrame) -> Iterable:
+        dataframe = pd.DataFrame(self.trained_poly.fit_transform(dataframe), columns=self.__poly_names())
+        return np.array([self._predict_from_cubes(row.to_dict()) for _, row in dataframe.iterrows()])
+
+    def _extract(self, dataframe: pd.DataFrame) -> Theory:
+        best = {}
+        for poly in range(self.poly):
+            for slices in list(itertools.product(range(1, self.max_slices + 1), repeat=self.max_features)):
+                gr = GIn((dataframe.iloc[:, :-1], dataframe.iloc[:, -1]), self.valid, self.features, self.sigmas,
+                         slices, min_rules=self.min_rules, poly=poly + 1, alpha=self.alpha, indpb=self.indpb,
+                         tournsize=self.tournsize, metric=self.metric, output=self._output, warm=True)
+
+                b, score, _, _ = gr.run(n_gen=self.n_gen, n_pop=self.n_pop)
+                best[(score, poly + 1, slices)] = b
+        m = min(best)
+        poly, slices, best = m[1], m[2], best[m]
+        self.trained_poly = PolynomialFeatures(degree=poly, include_bias=False)
+        transformed = pd.DataFrame(self.trained_poly.fit_transform(dataframe.iloc[:, :-1]), columns=self.__poly_names())
+        transformed[dataframe.columns[-1]] = dataframe.iloc[:, -1].values
+
+        self._surrounding = HyperCube.create_surrounding_cube(transformed, output=self._output)
+
+        cuts = [sorted(best[sum(slices[:i]):sum(slices[:i + 1])]) for i in range(len(slices))]
+
+        intervals = [[(transformed[self.features[i]].min(), cut[0])] +
+                     [(cut[i], cut[i + 1]) for i in range(len(cut) - 1)] +
+                     [(cut[-1], transformed[self.features[i]].max())] for i, cut in enumerate(cuts)]
+
+        hypercubes = [{f: iv for f, iv in zip(self.features, combo)} for combo in itertools.product(*intervals)]
+        mi_ma = {f: (transformed[f].min(), transformed[f].max()) for f in transformed.columns if f not in self.features}
+        self._hypercubes = [self._default_cube({feat: h[feat] if feat in self.features else mi_ma[feat]
+                                                for feat in transformed.columns[:-1]}) for h in hypercubes]
+        self._hypercubes = [c for c in self._hypercubes if c.count(transformed) >= 2]
+        for c in self._hypercubes:
+            for feature in transformed.columns:
+                if feature not in self.features:
+                    for direction in ['+', '-']:
+                        c.set_infinite(feature, direction)
+            c.update(transformed)
+        if self.threshold is not None:
+            self._hypercubes = self._merge(self._hypercubes, transformed)
+        return self._create_theory(transformed)
+
+    def make_fair(self, features: Iterable[str]):
+        self._dimensions_to_ignore.update(features)
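Note: GInGER's grid search stores one evolved individual per `(score, poly, slices)` key, and `min(best)` then compares those tuples lexicographically: lowest score first, ties broken by lower polynomial degree and then by the slice tuple. A stand-alone sketch of that selection step (values illustrative):

```python
# Candidates keyed by (score, polynomial_degree, slices_per_feature).
best = {
    (0.21, 2, (3, 1)): 'individual-a',
    (0.18, 1, (2, 2)): 'individual-b',
    (0.18, 2, (1, 1)): 'individual-c',
}
m = min(best)                        # lexicographic minimum over the keys
poly, slices, winner = m[1], m[2], best[m]
print(poly, slices, winner)          # 1 (2, 2) individual-b
```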
psyke/extraction/hypercubic/gridex/__init__.py
@@ -59,7 +59,6 @@ class GridEx(HyperCubeExtractor):
     def _iterate(self, dataframe: pd.DataFrame):
         fake = dataframe.copy()
         prev = [self._surrounding]
-        next_iteration = []
 
         for iteration in self.grid.iterate():
             next_iteration = []
@@ -67,53 +66,12 @@ class GridEx(HyperCubeExtractor):
                 if cube.count(dataframe) == 0:
                     continue
                 if cube.diversity < self.threshold:
-                    self._hypercubes
+                    self._hypercubes.append(cube)
                     continue
                 to_split, fake = self._cubes_to_split(cube, iteration, dataframe, fake)
-                next_iteration
-            prev = next_iteration
-        self._hypercubes
+                next_iteration.extend(self._merge(to_split, fake))
+            prev = next_iteration
+        self._hypercubes.extend(prev)
 
-
-
-                      adjacent_cache: dict[tuple[HyperCube, HyperCube], str | None]) -> \
-            Iterable[tuple[HyperCube, HyperCube, str]]:
-        checked = []
-        eligible = []
-        for cube in to_split:
-            checked.append(cube)
-            for other_cube in [c for c in to_split if c not in checked]:
-                if (cube in not_in_cache) or (other_cube in not_in_cache):
-                    adjacent_cache[(cube, other_cube)] = cube.is_adjacent(other_cube)
-                adjacent_feature = adjacent_cache[(cube, other_cube)]
-                eligible.append((cube, other_cube, adjacent_feature))
-        return [couple for couple in eligible if couple[2] is not None]
-
-    def _evaluate_merge(self, not_in_cache: Iterable[HyperCube],
-                        dataframe: pd.DataFrame, feature: str,
-                        cube: HyperCube, other_cube: HyperCube,
-                        merge_cache: dict[(HyperCube, HyperCube), HyperCube | None]) -> bool:
-        if (cube in not_in_cache) or (other_cube in not_in_cache):
-            merged_cube = cube.merge_along_dimension(other_cube, feature)
-            merged_cube.update(dataframe, self.predictor)
-            merge_cache[(cube, other_cube)] = merged_cube
-        return cube.output == other_cube.output if self._output == Target.CLASSIFICATION else \
-            merge_cache[(cube, other_cube)].diversity < self.threshold
-
-    def _merge(self, to_split: Iterable[HyperCube], dataframe: pd.DataFrame) -> Iterable[HyperCube]:
-        not_in_cache = [cube for cube in to_split]
-        adjacent_cache = {}
-        merge_cache = {}
-        cont = True
-        while cont:
-            to_merge = [([cube, other_cube], merge_cache[(cube, other_cube)]) for cube, other_cube, feature in
-                        GridEx._find_couples(to_split, not_in_cache, adjacent_cache) if
-                        self._evaluate_merge(not_in_cache, dataframe, feature, cube, other_cube, merge_cache)]
-            if len(to_merge) == 0:
-                cont = False
-            else:
-                sorted(to_merge, key=lambda c: c[1].diversity)
-                best = to_merge[0]
-                to_split = [cube for cube in to_split if cube not in best[0]] + [best[1]]
-                not_in_cache = [best[1]]
-        return to_split
+    def make_fair(self, features: Iterable[str]):
+        self.grid.make_fair(features)
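Note: moving `_merge` into `HyperCubeExtractor` also fixes a subtle bug in the deleted GridEx version: `sorted(to_merge, key=...)` returns a new list, so its result was silently discarded and `to_merge[0]` was whatever candidate happened to come first. The shared implementation selects with `min(...)` instead:

```python
to_merge = [('b', 3), ('a', 1), ('c', 2)]
sorted(to_merge, key=lambda c: c[1])        # old code: result discarded, a no-op
print(to_merge[0])                          # ('b', 3) -- not the best candidate
print(min(to_merge, key=lambda c: c[1]))    # ('a', 1) -- what the new code picks
```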
psyke/extraction/hypercubic/gridrex/__init__.py
@@ -5,12 +5,12 @@ from psyke.extraction.hypercubic.gridex import GridEx
 
 class GridREx(GridEx):
     """
-    Explanator implementing GridREx algorithm.
+    Explanator implementing GridREx algorithm, doi:10.24963/kr.2022/57.
     """
 
     def __init__(self, predictor, grid: Grid, min_examples: int, threshold: float, normalization,
                  seed=get_default_random_seed()):
         super().__init__(predictor, grid, min_examples, threshold, Target.REGRESSION, None, normalization, seed)
 
-    def _default_cube(self) -> RegressionCube:
+    def _default_cube(self, dimensions=None) -> RegressionCube:
         return RegressionCube()
psyke/extraction/hypercubic/hypercube.py
@@ -143,10 +143,9 @@ class HyperCube:
         self._default = True
 
     def set_infinite(self, dimension: str, direction: str):
-        if dimension in self._infinite_dimensions:
-            self._infinite_dimensions[dimension]
-
-            self._infinite_dimensions[dimension] = [direction]
+        if dimension not in self._infinite_dimensions:
+            self._infinite_dimensions[dimension] = set()
+        self._infinite_dimensions[dimension].add(direction)
 
     def copy_infinite_dimensions(self, dimensions: dict[str, str]):
         self._infinite_dimensions = dimensions.copy()
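Note: `set_infinite` now keeps a set of directions per dimension, so repeated calls accumulate instead of depending on list state. The new semantics in isolation:

```python
infinite = {}
for direction in ('+', '-'):
    if 'age' not in infinite:
        infinite['age'] = set()
    infinite['age'].add(direction)
print(infinite)  # {'age': {'+', '-'}}
```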
psyke/extraction/hypercubic/hypercube.py
@@ -260,13 +259,15 @@ class HyperCube:
                 if not self.is_default and value is not None]
 
     @staticmethod
-    def create_surrounding_cube(dataset: pd.DataFrame, closed: bool = False,
-
+    def create_surrounding_cube(dataset: pd.DataFrame, closed: bool = False, output=None,
+                                features_to_ignore: Iterable[str] = []) -> GenericCube:
         output = Target.CONSTANT if output is None else output
         dimensions = {
             column: (min(dataset[column]) - HyperCube.EPSILON * 2, max(dataset[column]) + HyperCube.EPSILON * 2)
             for column in dataset.columns[:-1]
         }
+        for column in features_to_ignore:
+            dimensions[column] = (-np.inf, np.inf)
         if closed:
             if output == Target.CONSTANT:
                 return ClosedCube(dimensions)
psyke/extraction/hypercubic/hypercube.py
@@ -432,14 +433,16 @@ class HyperCube:
         else:
             self.update_dimension(feature, (lower, upper))
 
-    def update(self, dataset: pd.DataFrame, predictor) -> None:
-
-
-
-
-
-
-
+    def update(self, dataset: pd.DataFrame, predictor=None) -> None:
+        idx = self.filter_indices(dataset.iloc[:, :-1])
+        filtered = dataset.iloc[idx, :-1]
+        if len(filtered > 0):
+            predictions = dataset.iloc[idx, -1] if predictor is None else predictor.predict(filtered)
+            self._output = np.mean(predictions)
+            self._diversity = np.std(predictions)
+            self._error = (abs(predictions - self._output)).mean()
+            means = filtered.describe().loc['mean']
+            self._barycenter = Point(means.index.values, means.values)
 
     # TODO: why this is not a property?
     def init_diversity(self, std: float) -> None:
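Note: with the new `predictor=None` default, `update` can fill a cube's statistics straight from the dataset's own labels. One reading caveat carried over unchanged in all three `update` variants: `if len(filtered > 0):` takes the length of a boolean frame, which is truthy whenever the dataset has rows at all; `len(filtered) > 0` was presumably intended. A sketch of the predictor-less path (toy data, a boolean mask standing in for `filter_indices`):

```python
import numpy as np
import pandas as pd

dataset = pd.DataFrame({'x': [0.1, 0.5, 0.9], 'y': [1.0, 2.0, 10.0]})
idx = dataset['x'] < 0.6                     # rows falling inside the cube
predictions = dataset.loc[idx, 'y']          # ground-truth labels, no predictor
output = np.mean(predictions)                # cube output: 1.5
diversity = np.std(predictions)              # cube diversity: 0.5
error = (abs(predictions - output)).mean()   # cube error: 0.5
print(output, diversity, error)
```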
psyke/extraction/hypercubic/hypercube.py
@@ -450,10 +453,11 @@ class RegressionCube(HyperCube):
     def __init__(self, dimension: dict[str, tuple] = None, limits: set[Limit] = None, output=None):
         super().__init__(dimension=dimension, limits=limits, output=LinearRegression() if output is None else output)
 
-    def update(self, dataset: pd.DataFrame, predictor) -> None:
-
+    def update(self, dataset: pd.DataFrame, predictor=None) -> None:
+        idx = self.filter_indices(dataset.iloc[:, :-1])
+        filtered = dataset.iloc[idx, :-1]
         if len(filtered > 0):
-            predictions = predictor.predict(filtered)
+            predictions = dataset.iloc[idx, -1] if predictor is None else predictor.predict(filtered)
             self._output.fit(filtered, predictions)
             self._diversity = self._error = (abs(self._output.predict(filtered) - predictions)).mean()
             means = filtered.describe().loc['mean']
psyke/extraction/hypercubic/hypercube.py
@@ -471,12 +475,14 @@ class RegressionCube(HyperCube):
         return new_cube
 
     def body(self, variables: dict[str, Var], ignore: list[str], unscale=None, normalization=None) -> Iterable[Struct]:
-        intercept = self.output.intercept_
-
-
-
-
-
+        intercept = self.output.intercept_
+        intercept = np.array(intercept).flatten()[0] if isinstance(intercept, Iterable) else intercept
+        intercept = intercept if normalization is None else unscale(sum(
+            [-self.output.coef_.flatten()[i] * normalization[name][0] / normalization[name][1] for i, name in
+             enumerate(self.dimensions.keys())], intercept), list(normalization.keys())[-1])
+        coefs = self.output.coef_.flatten() if normalization is None else [
+            self.output.coef_.flatten()[i] / normalization[name][1] * normalization[list(normalization.keys())[-1]][1]
+            for i, name in enumerate(self.dimensions.keys())
         ]
         return list(super().body(variables, ignore, unscale, normalization)) + [linear_function_creator(
             list(variables.values()), [to_rounded_real(v) for v in coefs], to_rounded_real(intercept)
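Note: the new `body` arithmetic undoes feature/target standardization. If each feature was scaled as x' = (x - m) / s and the target as y' = (y - m_y) / s_y, then a scaled-space model y' = b + sum(c_i * x'_i) becomes, in raw units, coefficients c_i * s_y / s_i and intercept s_y * (b - sum(c_i * m_i / s_i)) + m_y, which is exactly the `sum(..., intercept)` plus `unscale` expression above. A numeric check (toy data; the fit happens in scaled space and the recovered line must match the raw one):

```python
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
x = rng.normal(5.0, 2.0, size=(100, 1))
y = 3.0 * x[:, 0] + 1.0                       # raw-space line: coef 3, intercept 1
mx, sx, my, sy = x.mean(), x.std(), y.mean(), y.std()

model = LinearRegression().fit((x - mx) / sx, (y - my) / sy)
c, b = model.coef_[0], model.intercept_
coef_raw = c / sx * sy                        # c_i / s_i * s_y
intercept_raw = (b - c * mx / sx) * sy + my   # unscale(b + sum(-c_i * m_i / s_i))
print(round(coef_raw, 6), round(intercept_raw, 6))   # 3.0 1.0
```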
psyke/extraction/hypercubic/hypercube.py
@@ -487,10 +493,11 @@ class ClassificationCube(HyperCube):
     def __init__(self, dimension: dict[str, tuple] = None, limits: set[Limit] = None, output: str = ""):
         super().__init__(dimension=dimension, limits=limits, output=output)
 
-    def update(self, dataset: pd.DataFrame, predictor) -> None:
-
+    def update(self, dataset: pd.DataFrame, predictor=None) -> None:
+        idx = self.filter_indices(dataset.iloc[:, :-1])
+        filtered = dataset.iloc[idx, :-1]
         if len(filtered > 0):
-            predictions = predictor.predict(filtered)
+            predictions = dataset.iloc[idx, -1] if predictor is None else predictor.predict(filtered)
             self._output = mode(predictions)
             self._diversity = self._error = 1 - sum(p == self.output for p in predictions) / len(predictions)
             means = filtered.describe().loc['mean']
psyke/extraction/hypercubic/iter/__init__.py
@@ -23,6 +23,7 @@ class ITER(HyperCubeExtractor):
             raise NotImplementedError
         self.predictor = predictor
         self.min_update = min_update
+        self._init_points = n_points
        self.n_points = n_points
         self.max_iterations = max_iterations
         self.min_examples = min_examples
@@ -33,6 +34,10 @@ class ITER(HyperCubeExtractor):
         self.seed = seed
         self.ignore_dimensions = ignore_dimensions if ignore_dimensions is not None else []
 
+    def make_fair(self, features: Iterable[str]):
+        self.n_points = self._init_points
+        self.ignore_dimensions += list(features)
+
     def _best_cube(self, dataframe: pd.DataFrame, cube: GenericCube, cubes: Iterable[Expansion]) -> Expansion | None:
         expansions = []
         for limit in cubes: