psyke 0.8.9.dev48__py3-none-any.whl → 1.0.4.dev10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- psyke/__init__.py +112 -24
- psyke/clustering/__init__.py +4 -0
- psyke/clustering/cream/__init__.py +2 -6
- psyke/clustering/exact/__init__.py +10 -7
- psyke/clustering/utils.py +0 -1
- psyke/extraction/__init__.py +6 -2
- psyke/extraction/cart/{predictor.py → CartPredictor.py} +52 -7
- psyke/extraction/cart/FairTree.py +205 -0
- psyke/extraction/cart/FairTreePredictor.py +56 -0
- psyke/extraction/cart/__init__.py +27 -52
- psyke/extraction/hypercubic/__init__.py +58 -7
- psyke/extraction/hypercubic/creepy/__init__.py +14 -6
- psyke/extraction/hypercubic/ginger/__init__.py +100 -0
- psyke/extraction/hypercubic/gridex/__init__.py +6 -48
- psyke/extraction/hypercubic/gridrex/__init__.py +2 -2
- psyke/extraction/hypercubic/hypercube.py +33 -26
- psyke/extraction/hypercubic/iter/__init__.py +5 -0
- psyke/extraction/hypercubic/strategy.py +13 -9
- psyke/extraction/real/__init__.py +21 -22
- psyke/extraction/real/utils.py +2 -2
- psyke/extraction/trepan/__init__.py +19 -15
- psyke/genetic/__init__.py +0 -0
- psyke/genetic/fgin/__init__.py +74 -0
- psyke/genetic/gin/__init__.py +144 -0
- psyke/hypercubepredictor.py +4 -2
- psyke/tuning/pedro/__init__.py +4 -2
- psyke/utils/logic.py +4 -8
- {psyke-0.8.9.dev48.dist-info → psyke-1.0.4.dev10.dist-info}/METADATA +39 -19
- psyke-1.0.4.dev10.dist-info/RECORD +46 -0
- {psyke-0.8.9.dev48.dist-info → psyke-1.0.4.dev10.dist-info}/WHEEL +1 -1
- {psyke-0.8.9.dev48.dist-info → psyke-1.0.4.dev10.dist-info/licenses}/LICENSE +2 -1
- psyke-0.8.9.dev48.dist-info/RECORD +0 -40
- {psyke-0.8.9.dev48.dist-info → psyke-1.0.4.dev10.dist-info}/top_level.txt +0 -0
psyke/extraction/hypercubic/strategy.py
CHANGED

@@ -1,16 +1,20 @@
 from __future__ import annotations

 from functools import reduce
-from …
+from collections.abc import Iterable


 class Strategy:
-    def __init__(self):
-        self._partitions = …
+    def __init__(self, partitions = None):
+        self._partitions = partitions
+        self._no_features = []

     def get(self, feature: str) -> int:
         raise NotImplementedError

+    def make_fair(self, features: Iterable[str]):
+        self._no_features = features
+
     def partition_number(self, features: Iterable[str]) -> int:
         return reduce(lambda x, y: x * y, map(self.get, features), 1)

@@ -29,23 +33,23 @@ class Strategy:

 class FixedStrategy(Strategy):
     def __init__(self, partitions: int = 2):
-        super().__init__()
-        self._partitions = partitions
+        super().__init__(partitions)

     def get(self, feature: str) -> int:
-        return self._partitions
+        return 1 if feature in self._no_features else self._partitions

     def __str__(self):
         return "Fixed ({})".format(super().__str__())


 class AdaptiveStrategy(Strategy):
-    def __init__(self, features: Iterable[str], partitions: Iterable[tuple[float, float]] | None = None):
-        super().__init__()
+    def __init__(self, features: Iterable[(str, float)], partitions: Iterable[tuple[float, float]] | None = None):
+        super().__init__(partitions if partitions is not None else [(0.33, 2), (0.67, 3)])
         self.features = features
-        self._partitions = partitions if partitions is not None else [(0.33, 2), (0.67, 3)]

     def get(self, feature: str) -> int:
+        if feature in self._no_features:
+            return 1
         importance = next(filter(lambda t: t[0] == feature, self.features))[1]
         n = 1
         for (imp, part) in self._partitions:
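The `make_fair` hook added above lets callers exclude protected features from partitioning: a protected feature always gets exactly one partition, so grid-based extractors never split along it. A minimal usage sketch, assuming the classes above (the feature names are hypothetical):

```python
from psyke.extraction.hypercubic.strategy import FixedStrategy

strategy = FixedStrategy(partitions=3)
strategy.make_fair(['gender'])  # 'gender' is a hypothetical protected feature

assert strategy.get('gender') == 1                         # protected: never split
assert strategy.get('age') == 3                            # regular: 3 partitions
assert strategy.partition_number(['gender', 'age']) == 3   # 1 * 3
```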
psyke/extraction/real/__init__.py
CHANGED

@@ -15,13 +15,12 @@ class REAL(PedagogicalExtractor):
     """
     Explanator implementing Rule Extraction As Learning (REAL) algorithm, doi:10.1016/B978-1-55860-335-6.50013-1.
     The algorithm is sensible to features' order in the provided dataset during extraction.
-    To make it reproducible the features are internally sorted (alphabetically).
     """

     def __init__(self, predictor, discretization: Iterable[DiscreteFeature]):
         super().__init__(predictor, discretization)
+        self._ignore_feature = []
         self._ruleset: IndexedRuleSet = IndexedRuleSet()
-        self._output_mapping = {}

     @property
     def n_rules(self):

@@ -31,7 +30,7 @@ class REAL(PedagogicalExtractor):
         new_rule = self._rule_from_example(sample)
         return any([new_rule in rule for rule in rules])

-    def …
+    def _body(self, variables: dict[str, Var], rule: Rule) -> list[Struct]:
         result = []
         for predicates, truth_value in zip(rule.to_lists(), [True, False]):
             for predicate in predicates:

@@ -40,27 +39,24 @@ class REAL(PedagogicalExtractor):
         return result

     def _create_clause(self, dataset: pd.DataFrame, variables: dict[str, Var], key: int, rule: Rule) -> Clause:
-        …
-                           sorted(list(variables.values())),
-                           str(sorted(list(set(dataset.iloc[:, -1])))[key]))
-        return clause(head, self._create_body(variables, rule))
+        return clause(create_head(dataset.columns[-1], list(variables.values()), key), self._body(variables, rule))

     def _create_new_rule(self, sample: pd.Series) -> Rule:
         rule = self._rule_from_example(sample)
         return self._generalise(rule, sample)

     def _create_ruleset(self, dataset: pd.DataFrame) -> IndexedRuleSet:
-        ruleset = IndexedRuleSet.create_indexed_ruleset(dataset)
-        for …
+        ruleset = IndexedRuleSet.create_indexed_ruleset(sorted(set(dataset.iloc[:, -1])))
+        for _, sample in dataset.iloc[:, :-1].iterrows():
             prediction = list(self.predictor.predict(sample.to_frame().transpose()))[0]
-            rules = ruleset.get(…
+            rules = ruleset.get(prediction)
             if not self._covers(sample, rules):
                 rules.append(self._create_new_rule(sample))
         return ruleset.optimize()

-    def _create_theory(self, dataset: pd.DataFrame…
+    def _create_theory(self, dataset: pd.DataFrame) -> MutableTheory:
         theory = mutable_theory()
-        for key, rule in …
+        for key, rule in self._ruleset.flatten():
             variables = create_variable_list(self.discretization)
             theory.assertZ(self._create_clause(dataset, variables, key, rule))
         return theory

@@ -92,16 +88,22 @@ class REAL(PedagogicalExtractor):
         return self._create_ruleset(dataset)

     def _internal_predict(self, sample: pd.Series):
-        x = [index for index, rule in self._ruleset.flatten() if …
-        …
-        return reverse_mapping[x[0]] if len(x) > 0 else None
+        x = [index for index, rule in self._ruleset.flatten() if self._rule_from_example(sample) in rule]
+        return x[0] if x else None

-    …
-    …
+    def make_fair(self, features: Iterable[str]):
+        self._ignore_feature = [list(i.admissible_values.keys()) for i in self.discretization if i.name in features] \
+            if self.discretization else [features]
+        self._ignore_feature = [feature for features in self._ignore_feature for feature in features]
+        self._get_or_set.cache_clear()
+
+    def _rule_from_example(self, sample: pd.Series) -> Rule:
         true_predicates, false_predicates = [], []
         for feature, value in sample.items():
+            if feature in self._ignore_feature:
+                continue
             true_predicates.append(str(feature)) if value == 1 else false_predicates.append(str(feature))
-        return Rule(…
+        return Rule(true_predicates, false_predicates)

     def _subset(self, samples: pd.DataFrame, predicate: str) -> (pd.DataFrame, bool):
         samples_0 = samples.copy()

@@ -112,11 +114,8 @@ class REAL(PedagogicalExtractor):
         return samples_all, len(set(self.predictor.predict(samples_all))) == 1

     def _extract(self, dataframe: pd.DataFrame) -> Theory:
-        # Order the dataset by column to preserve reproducibility.
-        dataframe = dataframe.sort_values(by=list(dataframe.columns.values), ascending=False)
-        self._output_mapping = {value: index for index, value in enumerate(sorted(set(dataframe.iloc[:, -1])))}
         self._ruleset = self._get_or_set(HashableDataFrame(dataframe))
-        return self._create_theory(dataframe…
+        return self._create_theory(dataframe)

     def _predict(self, dataframe) -> Iterable:
         return np.array([self._internal_predict(data.transpose()) for _, data in dataframe.iterrows()])
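REAL's new `make_fair` mirrors the strategy hook: protected features (or, with discretisation, all of their one-hot columns) are skipped when rules are built. A hedged sketch of the intended call sequence, assuming a fitted `predictor`, a `discretization` list and a dataframe `df` from the usual PSyKE workflow ('sex' is a hypothetical feature name):

```python
from psyke.extraction.real import REAL

real = REAL(predictor, discretization)
real.make_fair(['sex'])    # ignore 'sex' and its one-hot columns in rule bodies
theory = real.extract(df)  # extracted rules never test the protected feature
```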
psyke/extraction/real/utils.py
CHANGED

@@ -49,5 +49,5 @@ class IndexedRuleSet(dict[int, list[Rule]]):
         ]

     @staticmethod
-    def create_indexed_ruleset(…
-        return IndexedRuleSet({…
+    def create_indexed_ruleset(indices: Iterable) -> IndexedRuleSet:
+        return IndexedRuleSet({i: [] for i in indices})
psyke/extraction/trepan/__init__.py
CHANGED

@@ -15,11 +15,17 @@ class Trepan(PedagogicalExtractor):
     def __init__(self, predictor, discretization: Iterable[DiscreteFeature], min_examples: int = 0, max_depth: int = 3,
                  split_logic: SplitLogic = SplitLogic.DEFAULT):
         super().__init__(predictor, discretization)
+        self._ignore_feature = []
         self.min_examples = min_examples
         self.max_depth = max_depth
         self.split_logic = split_logic
         self._root: Node

+    def make_fair(self, features: Iterable[str]):
+        self._ignore_feature = [list(i.admissible_values.keys()) for i in self.discretization if i.name in features] \
+            if self.discretization else [features]
+        self._ignore_feature = [feature for features in self._ignore_feature for feature in features]
+
     @property
     def n_rules(self):
         return sum(1 for _ in self._root)

@@ -29,7 +35,7 @@ class Trepan(PedagogicalExtractor):
             raise NotImplementedError()
         if node.n_classes == 1:
             return None
-        splits = …
+        splits = self._create_splits(node, names)
         return None if len(splits) == 0 or splits[0].children[0].depth > self.max_depth else splits[0].children

     def _compact(self):

@@ -55,28 +61,26 @@ class Trepan(PedagogicalExtractor):
     def _create_split(node: Node, column: str) -> Union[Split, None]:
         true_examples = Trepan._create_samples(node, column, 1.0)
         false_examples = Trepan._create_samples(node, column, 0.0)
-        …
-        …
-        true_node = Node(true_examples, node.n_examples, …
+        true_constraints = list(node.constraints) + [(column, 1.0)]
+        false_constraints = list(node.constraints) + [(column, 0.0)]
+        true_node = Node(true_examples, node.n_examples, true_constraints, depth=node.depth + 1) \
             if true_examples.shape[0] > 0 else None
-        false_node = Node(false_examples, node.n_examples, …
+        false_node = Node(false_examples, node.n_examples, false_constraints, depth=node.depth + 1) \
             if false_examples.shape[0] > 0 else None
         return None if true_node is None or false_node is None else Split(node, (true_node, false_node))

-    …
-    …
-    …
-    …
-        if …
-    …
-        if split is not None:
-            splits.add(split)
+    def _create_splits(self, node: Node, names: Iterable[str]) -> SortedList[Split]:
+        splits, constraints = Trepan._init_splits(node)
+        for column in [column for column in names if column not in list(constraints) + self._ignore_feature]:
+            split = Trepan._create_split(node, column)
+            if split is not None:
+                splits.add(split)
         return splits

-    def _create_theory(self, name: str…
+    def _create_theory(self, name: str) -> MutableTheory:
         theory = mutable_theory()
         for node in self._root:
-            variables = create_variable_list(self.discretization…
+            variables = create_variable_list(self.discretization)
             theory.assertZ(
                 clause(
                     create_head(name, list(variables.values()), str(node.dominant)),
psyke/genetic/__init__.py
File without changes
psyke/genetic/fgin/__init__.py
ADDED

@@ -0,0 +1,74 @@
+import numpy as np
+import pandas as pd
+
+from psyke import Target
+from psyke.genetic.gin import GIn
+
+import skfuzzy as skf
+
+
+class FGIn(GIn):
+
+    def __init__(self, train, valid, features, sigmas, slices, min_rules=1, poly=1, alpha=0.5, indpb=0.5, tournsize=3,
+                 metric='R2', output=Target.REGRESSION, warm=False):
+        super().__init__(train, valid, features, sigmas, slices, min_rules, poly, alpha, indpb, tournsize,
+                         metric, output, warm)
+        self.feature_to_idx = {f: i for i, f in enumerate(self.X.columns)}
+
+    def _evaluate(self, individual=None):
+        y_pred, valid_regions = self.__predict(individual or self.best, self.X if self.valid is None else self.valid[0])
+        if valid_regions < self.min_rules:
+            return -9999,
+        return self._score(self.y if self.valid is None else self.valid[1], y_pred),
+
+    @staticmethod
+    def __generate_membership(var, domain, thresholds, shape='tri'):
+        th = [var.min()] + [min(max(t, var.min()), var.max()) for t in thresholds] + [var.max()]
+
+        if shape == 'tri':
+            mid = [(x1 + x2) / 2 for x1, x2 in zip(th[:-1], th[1:])]
+            return [skf.trapmf(domain, [domain.min()] * 2 + mid[:2])] + \
+                   [skf.trimf(domain, [x1, x2, x3]) for x1, x2, x3 in zip(mid[:-2], mid[1:-1], mid[2:])] + \
+                   [skf.trapmf(domain, mid[-2:] + [domain.max()] * 2)]
+        if shape == 'trap':
+            beg = [None, domain.min()] + [(3 * x1 + x2) / 4 for x1, x2 in zip(th[1:-1], th[2:])] + [domain.max()]
+            end = [domain.min()] + [(x1 + 3 * x2) / 4 for x1, x2 in zip(th[:-2], th[1:-1])] + [domain.max()]
+            return [skf.trapmf(domain, [end[i - 1], beg[i], end[i], beg[i + 1]]) for i in range(1, len(th))]
+        raise ValueError('Supported shape values are only \'tri\' and \'trap\'')
+
+    @staticmethod
+    def __extend_domain(x, q_low=0.05, q_high=0.95, p=0.05, k_sigma=2.0, abs_min_margin=0.0):
+        ql, qh = np.quantile(x, [q_low, q_high])
+        margin = max(p * (qh - ql), k_sigma * np.std(x), abs_min_margin)
+        return np.array([ql - margin, qh + margin])
+
+    def __get_activations(self, x, functions_domains, valid_masks):
+        levels = [np.array([skf.interp_membership(domain, mf, x[index]) for mf in mfs])
+                  for mfs, domain, index in functions_domains.values()]
+        return np.prod(np.meshgrid(*levels, indexing='ij'), axis=0).ravel()[valid_masks]
+
+    def __fuzzify(self, cuts):
+        cuts = dict(zip(self.features, cuts))
+        doms = {c: FGIn.__extend_domain(self.X[c]) for c in self.features}
+        return {c: (FGIn.__generate_membership(self.X[c], doms[c], cuts[c], 'trap'), doms[c],
+                    self.feature_to_idx[c]) for c in self.features}
+
+    def __predict(self, individual=None, to_pred=None):
+        cuts = self._get_cuts(individual or self.best)
+        masks = np.array([self._region(to_pred, cuts) == r for r in range(np.prod([s + 1 for s in self.slices]))])
+        valid_masks = masks.sum(axis=1) >= 3
+
+        masks = [mask for mask in masks if mask.sum() >= 3]
+        functions_domains = self.__fuzzify(cuts)
+
+        pred = np.array([self._output_estimation(mask, to_pred) for mask in masks]).T
+        activations = np.array([self.__get_activations(x, functions_domains, valid_masks) for x in to_pred.values])
+
+        if self.output == Target.CLASSIFICATION:
+            classes, idx = np.unique(pred, return_inverse=True)
+            pred = classes[np.argmax(np.vstack([activations[:, idx == i].sum(axis=1) for i, c in enumerate(classes)]),
+                                     axis=0)]
+        else:
+            pred = (pred * activations).sum(axis=1)
+
+        return pd.DataFrame(pred, index=to_pred.index), len(masks)
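FGIn softens GIn's hard grid boundaries with fuzzy membership functions from `scikit-fuzzy`, so each sample contributes to every region in proportion to its activation. A minimal illustration of that mechanism, assuming `scikit-fuzzy` is installed (the numbers are illustrative only, not taken from the package):

```python
import numpy as np
import skfuzzy as skf

domain = np.linspace(0.0, 10.0, 101)
# Three overlapping memberships over one feature: left shoulder, core, right shoulder.
low = skf.trapmf(domain, [0.0, 0.0, 2.0, 4.0])
mid = skf.trimf(domain, [2.0, 5.0, 8.0])
high = skf.trapmf(domain, [6.0, 8.0, 10.0, 10.0])

x = 3.0
activations = [skf.interp_membership(domain, mf, x) for mf in (low, mid, high)]
# Region predictions are blended with these weights instead of a hard 0/1 assignment.
print(activations)  # approximately [0.5, 0.33, 0.0]
```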
psyke/genetic/gin/__init__.py
ADDED

@@ -0,0 +1,144 @@
+from statistics import mode
+
+import numpy as np
+from deap import base, creator, tools, algorithms
+import random
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, f1_score, accuracy_score
+from sklearn.preprocessing import PolynomialFeatures
+
+from psyke import Target
+
+
+class GIn:
+
+    def __init__(self, train, valid, features, sigmas, slices, min_rules=1, poly=1, alpha=0.5, indpb=0.5, tournsize=3,
+                 metric='R2', output=Target.REGRESSION, warm=False):
+        self.X, self.y = train
+        self.valid = valid
+        self.output = output
+
+        self.features = features
+        self.sigmas = sigmas
+        self.slices = slices
+        self.min_rules = min_rules
+        self.poly = PolynomialFeatures(degree=poly, include_bias=False)
+
+        self.alpha = alpha
+        self.indpb = indpb
+        self.tournsize = tournsize
+        self.metric = metric
+
+        self.toolbox = None
+        self.stats = None
+        self.hof = None
+        self.best = None
+
+        self.__setup(warm)
+
+    def _region(self, x, cuts):
+        indices = [np.searchsorted(np.array(cut), x[f].to_numpy(), side='right')
+                   for cut, f in zip(cuts, self.features)]
+
+        regions = np.zeros(len(x), dtype=int)
+        multiplier = 1
+        for idx, n in zip(reversed(indices), reversed([len(cut) + 1 for cut in cuts])):
+            regions += idx * multiplier
+            multiplier *= n
+
+        return regions
+
+    def _output_estimation(self, mask, to_pred):
+        if self.output == Target.REGRESSION:
+            return LinearRegression().fit(self.poly.fit_transform(self.X)[mask], self.y[mask]).predict(
+                self.poly.fit_transform(to_pred))
+        if self.output == Target.CONSTANT:
+            return np.mean(self.y[mask])
+        if self.output == Target.CLASSIFICATION:
+            return mode(self.y[mask])
+        raise ValueError('Supported outputs are Target.{REGRESSION, CONSTANT, CLASSIFICATION}')
+
+    def _score(self, true, pred):
+        if self.metric == 'R2':
+            return r2_score(true, pred)
+        if self.metric == 'MAE':
+            return -mean_absolute_error(true, pred)
+        if self.metric == 'MSE':
+            return -mean_squared_error(true, pred)
+        if self.metric == 'F1':
+            return f1_score(true, pred, average='weighted')
+        if self.metric == 'ACC':
+            return accuracy_score(true, pred)
+        raise ValueError('Supported metrics are R2, MAE, MSE, F1, ACC')
+
+    def predict(self, to_pred):
+        return self.__predict(to_pred=to_pred)[0]
+
+    def _get_cuts(self, individual):
+        boundaries = np.cumsum([0] + list(self.slices))
+        return [sorted(individual[boundaries[i]:boundaries[i + 1]]) for i in range(len(self.slices))]
+
+    def __predict(self, individual=None, to_pred=None):
+        cuts = self._get_cuts(individual or self.best)
+
+        regions = self._region(to_pred, cuts)
+        regionsT = self._region(self.X, cuts)
+
+        pred = np.empty(len(to_pred), dtype=f'U{self.y.str.len().max()}') if self.output == Target.CLASSIFICATION \
+            else np.zeros(len(to_pred))
+        valid_regions = 0
+
+        for r in range(np.prod([s + 1 for s in self.slices])):
+            mask = regions == r
+            maskT = regionsT == r
+            if min(mask.sum(), maskT.sum()) < 3:
+                if self.output != Target.CLASSIFICATION:
+                    pred[mask] = np.mean(self.y)
+                continue
+            pred[mask] = self._output_estimation(maskT, to_pred[mask])
+            valid_regions += 1
+
+        return pred, valid_regions
+
+    def _evaluate(self, individual=None):
+        y_pred, valid_regions = self.__predict(individual or self.best, self.X if self.valid is None else self.valid[0])
+        if valid_regions < self.min_rules:
+            return -9999,
+        return self._score(self.y if self.valid is None else self.valid[1], y_pred),
+
+    def __setup(self, warm=False):
+        if not warm:
+            creator.create("FitnessMax", base.Fitness, weights=(1.0,))
+            creator.create("Individual", list, fitness=creator.FitnessMax)
+
+        self.toolbox = base.Toolbox()
+        for f in self.features:
+            self.toolbox.register(f, random.uniform, self.X[f].min(), self.X[f].max())
+
+        self.toolbox.register("individual", tools.initCycle, creator.Individual,
+                              (sum([[getattr(self.toolbox, f) for i in range(s)]
+                                    for f, s in zip(self.features, self.slices)], [])), n=1)
+
+        self.toolbox.register("population", tools.initRepeat, list, self.toolbox.individual)
+
+        self.toolbox.register("mate", tools.cxBlend, alpha=self.alpha)
+        self.toolbox.register("mutate", tools.mutGaussian, indpb=self.indpb, mu=0,
+                              sigma=sum([[sig] * s for sig, s in zip(self.sigmas, self.slices)], []))
+        self.toolbox.register("select", tools.selTournament, tournsize=self.tournsize)
+        self.toolbox.register("evaluate", self._evaluate)
+
+        self.stats = tools.Statistics(lambda ind: ind.fitness.values[0])
+        self.stats.register("avg", np.mean)
+        # self.stats.register("min", np.min)
+        self.stats.register("max", np.max)
+        # self.stats.register("std", np.std)
+
+        self.hof = tools.HallOfFame(1)
+
+    def run(self, n_pop=30, cxpb=0.8, mutpb=0.5, n_gen=50, seed=123):
+        random.seed(seed)
+        pop = self.toolbox.population(n=n_pop)
+        result, log = algorithms.eaSimple(pop, self.toolbox, cxpb=cxpb, mutpb=mutpb, ngen=n_gen,
+                                          stats=self.stats, halloffame=self.hof, verbose=False)
+        self.best = tools.selBest(pop, 1)[0]
+        return self.best, self._evaluate()[0], result, log
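GIn evolves the cut points of an axis-aligned grid with DEAP's `eaSimple` and fits one local model per resulting region. A hedged usage sketch, assuming pandas train/validation splits from the usual workflow; the feature names and hyper-parameter values are illustrative:

```python
from psyke import Target
from psyke.genetic.gin import GIn

# X_train, y_train, X_valid, y_valid: assumed pandas objects from a prior split;
# 'alcohol' and 'pH' are hypothetical feature names.
gin = GIn(train=(X_train, y_train), valid=(X_valid, y_valid),
          features=['alcohol', 'pH'],
          sigmas=[0.5, 0.1],         # per-feature Gaussian mutation sigma
          slices=[2, 2],             # cut points per feature, i.e. up to 3x3 regions
          output=Target.REGRESSION)

best, score, population, log = gin.run(n_pop=30, cxpb=0.8, mutpb=0.5, n_gen=50, seed=123)
y_pred = gin.predict(X_valid)
```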
psyke/hypercubepredictor.py
CHANGED

@@ -45,9 +45,9 @@ class HyperCubePredictor(EvaluableModel):
         idx = tree.query([list(row.values())], k=1)[1][0][0]
         return HyperCubePredictor._get_cube_output(cubes[idx], row)

-    def _brute_predict_surface(self, row: …
+    def _brute_predict_surface(self, row: pd.Series) -> GenericCube:
         return min([(
-            cube.surface_distance(Point(list(row.keys()), list(row.values…
+            cube.surface_distance(Point(list(row.keys()), list(row.values))), cube.volume(), cube
         ) for cube in self._hypercubes])[-1]

     def _create_brute_tree(self, criterion: str = 'center', n: int = 2) -> (BallTree, list[GenericCube]):

@@ -76,6 +76,8 @@
         return round(HyperCubePredictor._get_cube_output(cube, data), get_int_precision())

     def _find_cube(self, data: dict[str, float]) -> GenericCube | None:
+        if not self._hypercubes:
+            return None
         data = data.copy()
         for dimension in self._dimensions_to_ignore:
             if dimension in data:
psyke/tuning/pedro/__init__.py
CHANGED

@@ -55,8 +55,10 @@ class PEDRO(SKEOptimizer, IterativeOptimizer):
         patience = self.patience
         while patience > 0:
             print("{}. {}. Threshold = {:.2f}. ".format(self.algorithm_name, grid, threshold), end="")
-            …
-            …
+            param_dict = dict(min_examples=25, threshold=threshold, normalization=self.normalization)
+            if self.algorithm != Extractor.gridrex:
+                param_dict['output'] = self.output
+            extractor = self.algorithm(self.predictor, grid, **param_dict)
             _ = extractor.extract(self.dataframe)
             error_function = (lambda *x: 1 - extractor.accuracy(*x)) if self.output == Target.CLASSIFICATION \
                 else extractor.mae
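The rationale for the branch above: `Extractor.gridrex` takes no `output` keyword (GridREx is the regression-oriented variant), so PEDRO now assembles the extractor's keyword arguments conditionally. A standalone restatement of that logic, assuming the same `Extractor` factories; the helper name is hypothetical:

```python
from psyke import Extractor

def build_extractor(algorithm, predictor, grid, output, threshold, normalization):
    params = dict(min_examples=25, threshold=threshold, normalization=normalization)
    if algorithm != Extractor.gridrex:  # GridREx does not accept `output`
        params['output'] = output
    return algorithm(predictor, grid, **params)
```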
psyke/utils/logic.py
CHANGED

@@ -123,14 +123,10 @@ def to_var(name: str) -> Var:
     return var(name[0].upper() + name[1:])


-def create_variable_list(features: list[DiscreteFeature], dataset: pd.DataFrame = None…
-    if …
-        features = sorted(features, key=lambda x: x.name)
-        dataset = sorted(dataset.columns[:-1]) if dataset is not None else None
-    else:
-        dataset = dataset.columns[:-1] if dataset is not None else None
+def create_variable_list(features: list[DiscreteFeature], dataset: pd.DataFrame = None) -> dict[str, Var]:
+    dataset = dataset.columns[:-1] if dataset is not None else None
     values = {feature.name: to_var(feature.name) for feature in features} \
-        if …
+        if features else {name: to_var(name) for name in dataset}
     return values



@@ -325,4 +321,4 @@ def get_not_in_rule(min_included: bool = False, max_included: bool = True) -> Cl
     parser = DEFAULT_CLAUSES_PARSER
     theory = parser.parse_clauses(not_in_textual_rule(LE if min_included else L, GE if max_included else G),
                                   operators=None)
-    return theory[0]
+    return theory[0]
{psyke-0.8.9.dev48.dist-info → psyke-1.0.4.dev10.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
-Metadata-Version: 2.…
+Metadata-Version: 2.4
 Name: psyke
-Version: 0.8.9.dev48
+Version: 1.0.4.dev10
 Summary: Python-based implementation of PSyKE, i.e. a Platform for Symbolic Knowledge Extraction
 Home-page: https://github.com/psykei/psyke-python
 Author: Matteo Magnini
@@ -16,33 +16,55 @@ Classifier: Topic :: Software Development :: Libraries
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.…
+Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3 :: Only
 Classifier: Programming Language :: Prolog
-Requires-Python: …
+Requires-Python: ==3.11
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: numpy~=…
-Requires-Dist: pandas~=2.…
-Requires-Dist: scikit-learn~=1.…
+Requires-Dist: numpy~=2.3.4
+Requires-Dist: pandas~=2.3.0
+Requires-Dist: scikit-learn~=1.8.0
 Requires-Dist: 2ppy~=0.4.0
 Requires-Dist: kneed~=0.8.1
 Requires-Dist: sympy~=1.11
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: license-file
+Dynamic: platform
+Dynamic: project-url
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary

 # PSyKE

 

-…
+Quick links:
 * [Home Page](https://apice.unibo.it/xwiki/bin/view/PSyKE/)
 * [GitHub Repository](https://github.com/psykei/psyke-python)
 * [PyPi Repository](https://pypi.org/project/psyke/)
 * [Issues](https://github.com/psykei/psyke-python/issues)

+## Latest Releases
+
+* PSyKE 1.0: Compatibility with Python 3.11.x
+* PSyKE 0.10: New genetic algorithms for knowledge extraction
+* PSyKE 0.9: Fairness mitigation support for knowledge extractors
+* PSyKE 0.8: New features: local explainability and counterfactual support
+* PSyKE 0.7: New SKE algorithms implemented
+
 ## Intro

 [PSyKE](https://apice.unibo.it/xwiki/bin/view/PSyKE/) (Platform for Symbolic Knowledge Extraction)
-is intended as a library for extracting symbolic knowledge (in the form of logic …
+is intended as a library for extracting symbolic knowledge (in the form of logic rule lists) out of sub-symbolic predictors.

 More precisely, PSyKE offers a general purpose API for knowledge extraction, and a number of different algorithms implementing it,
 supporting both classification and regression problems.
@@ -91,16 +113,14 @@ We are working on PSyKE to extend its features to encompass explainable clustering

 ### End users

-PSyKE is deployed as a library on Pypi…
+PSyKE is deployed as a library on Pypi. It can be installed as Python package by running:
 ```bash
 pip install psyke
 ```

 #### Requirements
-…
-…
-* `scikit-learn`
-* `2ppy`
+
+Please refer to the [requirements file](https://github.com/psykei/psyke-python/blob/master/requirements.txt)

 ##### Test requirements
 * `skl2onnx`
@@ -108,15 +128,15 @@ pip install psyke
 * `parameterized`

 Once installed, it is possible to create an extractor from a predictor
-(e.g. Neural Network, Support Vector Machine, K-Nearest …
-and from the …
+(e.g. Neural Network, Support Vector Machine, K-Nearest Neighbours, Random Forest, etc.)
+and from the data set used to train the predictor.

 > **Note:** the predictor must expose a method named `predict` to be properly used as an oracle.

 #### End users

 A brief example is presented in `demo.py` script in the `demo/` folder.
-Using `sklearn`'s Iris …
+Using `sklearn`'s Iris data set we train a K-Nearest Neighbours to predict the correct output class.
 Before training, we make the dataset discrete.
 After that we create two different extractors: REAL and Trepan.
 We output the extracted theory for both extractors.
@@ -142,8 +162,8 @@ iris(PetalLength8, PetalWidth8, SepalLength8, SepalWidth8, setosa) :- true.
 ## Developers

 Working with PSyKE codebase requires a number of tools to be installed:
-* Python 3.…
-  + Python version …
+* Python 3.11
+  + Python versions >= `3.12.x` are currently __not__ supported

 * JDK 11+ (please ensure the `JAVA_HOME` environment variable is properly configured)
 * Git 2.20+
|