psyke 0.4.9.dev6__py3-none-any.whl → 1.0.4.dev10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- psyke/__init__.py +231 -85
- psyke/clustering/__init__.py +9 -4
- psyke/clustering/cream/__init__.py +6 -10
- psyke/clustering/exact/__init__.py +17 -11
- psyke/clustering/utils.py +0 -1
- psyke/extraction/__init__.py +25 -0
- psyke/extraction/cart/CartPredictor.py +128 -0
- psyke/extraction/cart/FairTree.py +205 -0
- psyke/extraction/cart/FairTreePredictor.py +56 -0
- psyke/extraction/cart/__init__.py +48 -62
- psyke/extraction/hypercubic/__init__.py +187 -47
- psyke/extraction/hypercubic/cosmik/__init__.py +47 -0
- psyke/extraction/hypercubic/creepy/__init__.py +24 -29
- psyke/extraction/hypercubic/divine/__init__.py +86 -0
- psyke/extraction/hypercubic/ginger/__init__.py +100 -0
- psyke/extraction/hypercubic/gridex/__init__.py +45 -84
- psyke/extraction/hypercubic/gridrex/__init__.py +4 -4
- psyke/extraction/hypercubic/hex/__init__.py +104 -0
- psyke/extraction/hypercubic/hypercube.py +275 -72
- psyke/extraction/hypercubic/iter/__init__.py +45 -46
- psyke/extraction/hypercubic/strategy.py +13 -9
- psyke/extraction/real/__init__.py +24 -29
- psyke/extraction/real/utils.py +2 -2
- psyke/extraction/trepan/__init__.py +24 -19
- psyke/genetic/__init__.py +0 -0
- psyke/genetic/fgin/__init__.py +74 -0
- psyke/genetic/gin/__init__.py +144 -0
- psyke/hypercubepredictor.py +102 -0
- psyke/schema/__init__.py +230 -36
- psyke/tuning/__init__.py +40 -28
- psyke/tuning/crash/__init__.py +33 -64
- psyke/tuning/orchid/__init__.py +21 -23
- psyke/tuning/pedro/__init__.py +70 -56
- psyke/utils/logic.py +8 -8
- psyke/utils/plot.py +79 -3
- {psyke-0.4.9.dev6.dist-info → psyke-1.0.4.dev10.dist-info}/METADATA +42 -22
- psyke-1.0.4.dev10.dist-info/RECORD +46 -0
- {psyke-0.4.9.dev6.dist-info → psyke-1.0.4.dev10.dist-info}/WHEEL +1 -1
- {psyke-0.4.9.dev6.dist-info → psyke-1.0.4.dev10.dist-info/licenses}/LICENSE +2 -1
- psyke/extraction/cart/predictor.py +0 -73
- psyke-0.4.9.dev6.dist-info/RECORD +0 -36
- {psyke-0.4.9.dev6.dist-info → psyke-1.0.4.dev10.dist-info}/top_level.txt +0 -0
psyke/tuning/__init__.py
CHANGED
@@ -3,6 +3,7 @@ from enum import Enum
 import numpy as np
 import pandas as pd
 
+from psyke.extraction.hypercubic import Grid
 from psyke.utils import Target
 
 
@@ -12,14 +13,12 @@ class Objective(Enum):
 
 
 class Optimizer:
-    def __init__(self, dataframe: pd.DataFrame, …
-        …
-                 readability_tradeoff: float = 0.1, patience: int = 5,
+    def __init__(self, dataframe: pd.DataFrame, output: Target = Target.CONSTANT, max_error_increase: float = 1.2,
+                 min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, patience: int = 5,
                  normalization=None, discretization=None):
         self.dataframe = dataframe
-        self.algorithm = algorithm
         self.output = output
-        self.…
+        self.max_error_increase = max_error_increase
         self.min_rule_decrease = min_rule_decrease
         self.readability_tradeoff = readability_tradeoff
         self.patience = patience
@@ -30,23 +29,13 @@ class Optimizer:
     def search(self):
         raise NotImplementedError
 
-    def …
-        …
-            return (best[1] - other[1]) * 2
-        return 1 / (
-                (1 - other[0] / best[0]) ** self.readability_tradeoff *
-                np.ceil(other[1] / self.readability_tradeoff) / np.ceil(best[1] / self.readability_tradeoff)
-        )
-
-    @staticmethod
-    def _best(params):
-        param_dict = {Optimizer.__score(t): t for t in params}
+    def _best(self, params):
+        param_dict = {self._score(t): t for t in params}
         min_param = min(param_dict)
         return min_param, param_dict[min_param]
 
-    …
-        return param[0] * np.ceil(param[1] / 5)
+    def _score(self, param):
+        return param[0] * np.ceil(param[1] * self.readability_tradeoff)
 
     def _best_param(self, param):
         param_dict = {t[param]: t for t in self.params}
@@ -54,24 +43,47 @@
         return min_param, param_dict[min_param]
 
     def get_best(self):
-        names = […
-        params = […
+        names = ["Combined", "Predictive loss", "N rules"]
+        params = [self._best(self.params), self._best_param(0), self._best_param(1)]
         for n, p in zip(names, params):
             self._print_params(n, p[1])
             print()
-        return …
+        return self._best(self.params)[1], self._best_param(0)[1], self._best_param(1)[1]
 
     def _print_params(self, n, param):
         raise NotImplementedError
 
 
-class …
-    def __init__(self, predictor, …
-                 min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, …
-                 …
+class SKEOptimizer(Optimizer, ABC):
+    def __init__(self, predictor, dataframe: pd.DataFrame, max_error_increase: float = 1.2,
+                 min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, patience: int = 5,
+                 objective: Objective = Objective.MODEL, output: Target = Target.CONSTANT,
                  normalization=None, discretization=None):
-        super().__init__(dataframe, …
+        super().__init__(dataframe, output, max_error_increase, min_rule_decrease, readability_tradeoff,
                          patience, normalization, discretization)
         self.predictor = predictor
-        self.max_depth = max_depth
         self.objective = objective
+
+
+class IterativeOptimizer(Optimizer, ABC):
+    def __init__(self, dataframe: pd.DataFrame, max_error_increase: float = 1.2,
+                 min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, max_depth: int = 10,
+                 patience: int = 5, output: Target = Target.CONSTANT, normalization=None, discretization=None):
+        super().__init__(dataframe, output, max_error_increase, min_rule_decrease, readability_tradeoff,
+                         patience, normalization, discretization)
+        self.max_depth = max_depth
+
+    def _iteration_improvement(self, best, other):
+        if other[0] == best[0]:
+            return (best[1] - other[1]) * 2
+        return 1 / (
+                (1 - other[0] / best[0]) ** self.readability_tradeoff *
+                np.ceil(other[1] / self.readability_tradeoff) / np.ceil(best[1] / self.readability_tradeoff)
+        )
+
+    def _check_iteration_improvement(self, best, current):
+        improvement = \
+            self._iteration_improvement([best[0], best[1]], [current[0], current[1]]) if best is not None else np.inf
+        if isinstance(improvement, complex):
+            improvement = 1.0
+        return current, improvement < 1.2
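The refactor above centralizes the error/rule-count trade-off in `Optimizer._score` and `Optimizer._best`. A minimal worked sketch of that selection rule (standalone, not part of the package): each candidate is an `(error, n_rules)` pair and the combined score is `error * ceil(n_rules * readability_tradeoff)`; the candidate with the lowest score wins.

import numpy as np

readability_tradeoff = 0.1
candidates = [(0.30, 12), (0.35, 4), (0.50, 2)]   # (predictive error, number of rules)

def score(candidate):
    # same formula as Optimizer._score in the new version
    error, n_rules = candidate
    return error * np.ceil(n_rules * readability_tradeoff)

# scores: 0.60, 0.35, 0.50 -> the 4-rule model wins despite its slightly higher error
best = min(candidates, key=score)
print(best)   # (0.35, 4)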
psyke/tuning/crash/__init__.py
CHANGED
@@ -1,91 +1,60 @@
 from enum import Enum
 
-import numpy as np
 import pandas as pd
 
-from psyke import …
-from psyke.tuning import …
+from psyke.tuning import Objective, SKEOptimizer
+from psyke.tuning.orchid import OrCHiD
 from psyke.utils import Target
 
 
-class CRASH(…
+class CRASH(SKEOptimizer):
     class Algorithm(Enum):
         ExACT = 1,
         CREAM = 2
 
-    def __init__(self, predictor, dataframe: pd.DataFrame, …
+    def __init__(self, predictor, dataframe: pd.DataFrame, max_error_increase: float = 1.2,
                  min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, max_depth: int = 10,
-                 …
-                 objective: Objective = Objective.MODEL, normalization=None):
-        super().__init__(predictor, …
-        …
-        self.…
+                 max_gauss_components: int = 5, patience: int = 5, output: Target = Target.CONSTANT,
+                 objective: Objective = Objective.MODEL, normalization=None, discretization=None):
+        super().__init__(predictor, dataframe, max_error_increase, min_rule_decrease, readability_tradeoff,
+                         patience, objective, output, normalization, discretization)
+        self.max_depth = max_depth
+        self.max_gauss_components = max_gauss_components
 
     def search(self):
-        self.params = …
+        self.params = []
+        for algorithm in [OrCHiD.Algorithm.ExACT, OrCHiD.Algorithm.CREAM]:
+            self.params += self.__search_algorithm(algorithm)
 
-    def …
+    def __search_algorithm(self, algorithm):
         params = []
         best = None
 
-        for …
-            …
-            ) if best is not None else np.inf
-
-            best = b
-            params += p
-
-            if len(params) > 1 and improvement < 1.2:
-                break
-        return params
-
-    def __search_threshold(self, depth):
-        step = self.model_mae / 2.0
-        threshold = self.model_mae * 0.9
-        params = []
-        patience = self.patience
-        while patience > 0:
-            print(f"{self.algorithm}. Depth: {depth}. Threshold = {threshold:.2f}. ", end="")
-            extractor = Extractor.creepy(
-                self.predictor, depth=depth, error_threshold=threshold, output=self.output,
-                gauss_components=10, normalization=self.normalization,
-                clustering=Clustering.cream if self.algorithm == CRASH.Algorithm.CREAM else Clustering.exact
-            )
-            _ = extractor.extract(self.dataframe)
-            mae, n = (extractor.mae(self.dataframe, self.predictor) if self.objective == Objective.MODEL else
-                      extractor.mae(self.dataframe)), extractor.n_rules
-            print(f"MAE = {mae:.2f}, {n} rules")
-
-            if len(params) == 0:
-                params.append((mae, n, depth, threshold))
-                threshold += step
-                continue
-
-            if (n == 1) or (mae == 0.0):
-                params.append((mae, n, depth, threshold))
-                break
-
-            if mae > params[0][0] * self.max_mae_increase:
+        for gauss_components in range(2, self.max_gauss_components + 1):
+            data = self.dataframe.sample(n=gauss_components * 100) if gauss_components * 100 < len(self.dataframe) \
+                else self.dataframe
+            current_params = self.__search_components(data, algorithm, gauss_components)
+            current_best = self._best(current_params)[1]
+            if best is not None and self._score(best) <= self._score(current_best):
                 break
+            best = current_best
+            params += current_params
 
-            improvement = (params[-1][0] / mae) + (1 - n / params[-1][1])
-
-            if improvement <= 1 or n > np.ceil(params[-1][1] * self.min_rule_decrease):
-                patience -= 1
-                step = max(step, abs(mae - threshold) / max(patience, 1))
-            if mae != params[-1][0] or n != params[-1][1]:
-                params.append((mae, n, depth, threshold))
-            threshold += step
         return params
 
+    def __search_components(self, data, algorithm, gauss_components):
+        orchid = OrCHiD(data, algorithm, self.output, self.max_error_increase, self.min_rule_decrease,
+                        self.readability_tradeoff, self.patience, self.max_depth, gauss_components,
+                        self.normalization, self.discretization)
+        orchid.search()
+        return [(*p, gauss_components, algorithm) for p in orchid.params]
+
     def _print_params(self, name, params):
-        print("…
+        print("*****************************")
         print(f"Best {name}")
-        print("…
+        print("*****************************")
         print(f"MAE = {params[0]:.2f}, {params[1]} rules")
+        print(f"Algorithm = {params[5]}")
         print(f"Threshold = {params[3]:.2f}")
         print(f"Depth = {params[2]}")
+        print(f"Gaussian components = {params[4]}")
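A hypothetical usage sketch of the rewritten CRASH optimizer, based only on the constructor and methods visible in this diff; `model` (a fitted black-box predictor) and `df` (a DataFrame whose last column is the target) are assumptions.

from psyke.tuning import Objective
from psyke.tuning.crash import CRASH
from psyke.utils import Target

crash = CRASH(model, df, max_depth=5, max_gauss_components=3,
              output=Target.CONSTANT, objective=Objective.MODEL)
crash.search()          # tries ExACT and CREAM over an increasing number of Gaussian components
best_combined, lowest_error, fewest_rules = crash.get_best()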
psyke/tuning/orchid/__init__.py
CHANGED
@@ -4,60 +4,58 @@ import numpy as np
 import pandas as pd
 
 from psyke import Clustering, EvaluableModel
-from psyke.tuning import Optimizer
+from psyke.tuning import Optimizer, IterativeOptimizer
 from psyke.utils import Target
 
 
-class OrCHiD(…
+class OrCHiD(IterativeOptimizer):
     class Algorithm(Enum):
         ExACT = 1,
         CREAM = 2
 
     def __init__(self, dataframe: pd.DataFrame, algorithm, output: Target = Target.CONSTANT,
-                 …
-                 patience: int = 5, max_depth: int = 10, normalization=None, discretization=None):
-        super().__init__(dataframe, …
-        …
-        self.…
+                 max_error_increase: float = 1.2, min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1,
+                 patience: int = 5, max_depth: int = 10, gauss_components=10, normalization=None, discretization=None):
+        super().__init__(dataframe, max_error_increase, min_rule_decrease, readability_tradeoff, max_depth, patience,
+                         output, normalization, discretization)
+        self.algorithm = algorithm
+        self.gauss_components = gauss_components
 
     def search(self):
         self.params = self.__search_depth()
 
     def __search_depth(self):
-        params = []
-        best = None
+        params, best = [], None
 
         for depth in range(1, self.max_depth + 1):
-            …
+            current_params = self.__search_threshold(depth)
+            current_best = self._best(current_params)[1]
             print()
-            …
-            ) if best is not None else np.inf
-
-            best = b
-            params += p
+            best, to_break = self._check_iteration_improvement(best, current_best)
+            params += current_params
 
-            if len(params) > 1 and …
+            if len(params) > 1 and to_break:
                 break
         return params
 
     def __search_threshold(self, depth):
         step = 1.0
-        threshold = 1.0
+        threshold = 1.0
         params = []
         patience = self.patience
         while patience > 0:
-            print(f"{self.algorithm}. Depth: {depth}. Threshold = {threshold:.2f}. "…
+            print(f"{self.algorithm}. Depth: {depth}. Threshold = {threshold:.2f}. "
+                  f"Gaussian components = {self.gauss_components}. ", end="")
             clustering = (Clustering.cream if self.algorithm == OrCHiD.Algorithm.CREAM else Clustering.exact)(
-                depth=depth, error_threshold=threshold, gauss_components=…
+                depth=depth, error_threshold=threshold, gauss_components=self.gauss_components, output=self.output
             )
             clustering.fit(self.dataframe)
             task, metric = \
                 (EvaluableModel.Task.CLASSIFICATION, EvaluableModel.ClassificationScore.INVERSE_ACCURACY) \
                 if self.output == Target.CLASSIFICATION else \
                 (EvaluableModel.Task.REGRESSION, EvaluableModel.RegressionScore.MAE)
-            p, n = clustering.score(self.dataframe, None, False, False, task, …
+            p, n = clustering.score(self.dataframe, None, False, False, task=task,
+                                    scoring_function=[metric])[metric][0], clustering.n_rules
 
             print(f"Predictive loss = {p:.2f}, {n} rules")
 
@@ -71,7 +69,7 @@ class OrCHiD(Optimizer):
                 params.append((p, n, depth, threshold))
                 break
 
-            if p > params[0][0] * self.…
+            if p > params[0][0] * self.max_error_increase:
                 break
 
             improvement = (params[-1][0] / p) + (1 - n / params[-1][1])
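OrCHiD now inherits the depth loop from `IterativeOptimizer` and exposes `gauss_components` directly. A hypothetical sketch of driving it on its own, using only parameter names taken from the constructor above and assuming a labelled DataFrame `df`.

from psyke.tuning.orchid import OrCHiD
from psyke.utils import Target

orchid = OrCHiD(df, OrCHiD.Algorithm.CREAM, output=Target.CLASSIFICATION,
                max_depth=3, gauss_components=5)
orchid.search()         # fills orchid.params with (loss, n_rules, depth, threshold) tuples
best = orchid.get_best()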
psyke/tuning/pedro/__init__.py
CHANGED
@@ -1,44 +1,73 @@
 import numpy as np
 import pandas as pd
 from enum import Enum
-…
+
+from sklearn.metrics import accuracy_score
+
+from psyke import Extractor, Target
 from psyke.extraction.hypercubic import Grid, FeatureRanker
 from psyke.extraction.hypercubic.strategy import AdaptiveStrategy, FixedStrategy
-from psyke.tuning import Objective, …
+from psyke.tuning import Objective, IterativeOptimizer, SKEOptimizer
 
 
-class PEDRO(…
+class PEDRO(SKEOptimizer, IterativeOptimizer):
     class Algorithm(Enum):
         GRIDEX = 1,
-        GRIDREX = 2
+        GRIDREX = 2,
+        HEX = 3
 
-    def __init__(self, predictor, dataframe: pd.DataFrame, …
+    def __init__(self, predictor, dataframe: pd.DataFrame, max_error_increase: float = 1.2,
                  min_rule_decrease: float = 0.9, readability_tradeoff: float = 0.1, max_depth: int = 3,
                  patience: int = 3, algorithm: Algorithm = Algorithm.GRIDREX, objective: Objective = Objective.MODEL,
-                 normalization=None):
-        …
+                 output: Target = Target.CONSTANT, normalization=None, discretization=None):
+        SKEOptimizer.__init__(self, predictor, dataframe, max_error_increase, min_rule_decrease,
+                              readability_tradeoff, patience, objective, output, normalization, discretization)
+        IterativeOptimizer.__init__(self, dataframe, max_error_increase, min_rule_decrease, readability_tradeoff,
+                                    max_depth, patience, output, normalization, discretization)
+        self.algorithm = Extractor.gridrex if algorithm == PEDRO.Algorithm.GRIDREX else \
+            Extractor.gridex if algorithm == PEDRO.Algorithm.GRIDEX else Extractor.hex
+        self.algorithm_name = "GridREx" if algorithm == PEDRO.Algorithm.GRIDREX else \
+            "GridEx" if algorithm == PEDRO.Algorithm.GRIDEX else "HEx"
         self.ranked = FeatureRanker(dataframe.columns[:-1]).fit(predictor, dataframe.iloc[:, :-1]).rankings()
-        …
+        predictions = self.predictor.predict(dataframe.iloc[:, :-1]).flatten()
+        expected = self.dataframe.iloc[:, -1].values
+        self.error = 1 - accuracy_score(predictions, expected) if output == Target.CLASSIFICATION else \
+            abs(predictions - expected).mean()
+
+    def _search_depth(self, strategy, critical, max_partitions):
+        params, best = [], None
+
+        for iterations in range(self.max_depth):
+            current_params = self.__search_threshold(Grid(iterations + 1, strategy), critical, max_partitions)
+            current_best = self._best(current_params)[1]
+            print()
+            best, to_break = self._check_iteration_improvement(best, current_best)
+            params += current_params
+
+            if len(params) > 1 and to_break:
+                break
+        return params
 
     def __search_threshold(self, grid, critical, max_partitions):
-        step = self.…
-        threshold = self.…
+        step = self.error / 2.0
+        threshold = self.error * 0.5
         params = []
         patience = self.patience
         while patience > 0:
-            print("{}. {}. Threshold = {:.2f}. ".format(self.…
+            print("{}. {}. Threshold = {:.2f}. ".format(self.algorithm_name, grid, threshold), end="")
+            param_dict = dict(min_examples=25, threshold=threshold, normalization=self.normalization)
+            if self.algorithm != Extractor.gridrex:
+                param_dict['output'] = self.output
+            extractor = self.algorithm(self.predictor, grid, **param_dict)
             _ = extractor.extract(self.dataframe)
-            …
+            error_function = (lambda *x: 1 - extractor.accuracy(*x)) if self.output == Target.CLASSIFICATION \
+                else extractor.mae
+            error, n = (error_function(self.dataframe, self.predictor) if self.objective == Objective.MODEL else
+                        error_function(self.dataframe)), extractor.n_rules
+            print("MAE = {:.2f}, {} rules".format(error, n))
 
             if len(params) == 0:
-                params.append((…
+                params.append((error, n, threshold, grid))
                 threshold += step
                 continue
 
@@ -46,44 +75,24 @@ class PEDRO(Optimizer):
                 break
 
             if n == 1:
-                params.append((…
+                params.append((error, n, threshold, grid))
                 break
 
-            if …
+            if error > params[0][0] * self.max_error_increase:
                 break
 
-            improvement = (params[-1][0] / …
+            improvement = (params[-1][0] / error) + (1 - n / params[-1][1])
 
             if improvement <= 1 or n > np.ceil(params[-1][1] * self.min_rule_decrease):
                 patience -= 1
-                step = max(step, abs(…
+                step = max(step, abs(error - threshold) / max(patience, 1))
             elif not critical:
                 patience = self.patience
-            if …
-                params.append((…
+            if error != params[-1][0] or n != params[-1][1]:
+                params.append((error, n, threshold, grid))
             threshold += step
         return params
 
-    def __search_depth(self, strategy, critical, max_partitions):
-        params = []
-        best = None
-
-        for iterations in range(self.max_depth):
-            grid = Grid(iterations + 1, strategy)
-            p = self.__search_threshold(grid, critical, max_partitions)
-            b = Optimizer._best(p)[1]
-            print()
-            improvement = self._depth_improvement(
-                [best[0], best[1]], [b[0], b[1]]
-            ) if best is not None else np.inf
-
-            best = b
-            params += p
-
-            if len(params) > 1 and improvement < 1.2:
-                break
-        return params
-
     def __contains(self, strategies, strategy):
         for s in strategies:
             if strategy.equals(s, self.dataframe.columns[:-1]):
@@ -91,21 +100,26 @@ class PEDRO(Optimizer):
         return False
 
     def search(self):
-        …
+        max_partitions = 200
+        base_partitions = FixedStrategy(2).partition_number(self.dataframe.columns[:-1]) * 3
+        if base_partitions <= max_partitions:
+            strategies = [FixedStrategy(2)]
+            if FixedStrategy(3).partition_number(self.dataframe.columns[:-1]) <= max_partitions:
+                strategies.append(FixedStrategy(3))
+        else:
+            strategies = []
+            base_partitions = max_partitions
 
         for n in [2, 3, 5, 10]:
             for th in [0.99, 0.75, 0.67, 0.5, 0.3]:
                 strategy = AdaptiveStrategy(self.ranked, [(th, n)])
-                if strategy.partition_number(self.dataframe.columns[:-1]) < base_partitions…
+                if strategy.partition_number(self.dataframe.columns[:-1]) < base_partitions and \
                         not self.__contains(strategies, strategy):
                     strategies.append(strategy)
 
         for (a, b) in [(0.33, 0.67), (0.25, 0.75), (0.1, 0.9)]:
             strategy = AdaptiveStrategy(self.ranked, [(a, 2), (b, 3)])
-            if strategy.partition_number(self.dataframe.columns[:-1]) < base_partitions…
+            if strategy.partition_number(self.dataframe.columns[:-1]) < base_partitions and \
                     not self.__contains(strategies, strategy):
                 strategies.append(strategy)
 
@@ -116,16 +130,16 @@ class PEDRO(Optimizer):
 
         params = []
         for strategy in strategies:
-            params += self.…
+            params += self._search_depth(strategy,
+                                         strategy.partition_number(self.dataframe.columns[:-1]) > avg,
+                                         base_partitions)
         self.params = params
 
     def _print_params(self, name, params):
         print("**********************")
         print(f"Best {name}")
         print("**********************")
-        print(f"…
+        print(f"Error = {params[0]:.2f}, {params[1]} rules")
         print(f"Threshold = {params[2]:.2f}")
         print(f"Iterations = {params[3].iterations}")
         print(f"Strategy = {params[3].strategy}")
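PEDRO now mixes `SKEOptimizer` and `IterativeOptimizer` and can target HEx in addition to GridEx and GridREx. A hypothetical sketch using only the constructor arguments shown above; as before, `model` (a fitted predictor) and `df` (a labelled DataFrame) are assumptions.

from psyke.tuning import Objective
from psyke.tuning.pedro import PEDRO
from psyke.utils import Target

pedro = PEDRO(model, df, max_depth=2, patience=3,
              algorithm=PEDRO.Algorithm.GRIDEX,
              objective=Objective.MODEL, output=Target.CLASSIFICATION)
pedro.search()          # explores fixed and adaptive grid strategies, then error thresholds
best_combined, lowest_error, fewest_rules = pedro.get_best()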
psyke/utils/logic.py
CHANGED
@@ -123,17 +123,17 @@ def to_var(name: str) -> Var:
     return var(name[0].upper() + name[1:])
 
 
-def create_variable_list(features: list[DiscreteFeature], dataset: pd.DataFrame = None…
-    if …
-        features = sorted(features, key=lambda x: x.name)
-        dataset = sorted(dataset.columns[:-1]) if dataset is not None else None
-    else:
-        dataset = dataset.columns[:-1] if dataset is not None else None
+def create_variable_list(features: list[DiscreteFeature], dataset: pd.DataFrame = None) -> dict[str, Var]:
+    dataset = dataset.columns[:-1] if dataset is not None else None
     values = {feature.name: to_var(feature.name) for feature in features} \
-        if …
+        if features else {name: to_var(name) for name in dataset}
     return values
 
 
+def last_in_body(body: Struct) -> Struct:
+    return body.args[-1] if body.args[-1].functor == 'is' else last_in_body(body.args[-1])
+
+
 def create_head(functor: str, variables: Iterable[Var], output) -> Struct:
     if isinstance(output, Var):
         variables += [output]
@@ -321,4 +321,4 @@ def get_not_in_rule(min_included: bool = False, max_included: bool = True) -> Cl
     parser = DEFAULT_CLAUSES_PARSER
     theory = parser.parse_clauses(not_in_textual_rule(LE if min_included else L, GE if max_included else G),
                                   operators=None)
-    return theory[0]
+    return theory[0]
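`create_variable_list` no longer sorts its inputs, and the new `last_in_body` walks a clause body down to its final `is/2` term. A minimal sketch of the simplified variable helper, assuming a DataFrame `df` whose last column is the target.

from psyke.utils.logic import create_variable_list

# With no discrete features, one tuProlog Var is created per feature column
# (every column except the last), keyed by the column name.
variables = create_variable_list([], df)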
psyke/utils/plot.py
CHANGED
@@ -7,10 +7,85 @@ import matplotlib.pyplot as plt
 from matplotlib.lines import Line2D
 from tuprolog.solve.prolog import prolog_solver
 from tuprolog.theory import Theory, mutable_theory
-…
+
+from psyke.extraction.hypercubic import HyperCubeExtractor
+from psyke.utils.logic import data_to_struct, get_in_rule, get_not_in_rule
 
 import matplotlib
-matplotlib.use('TkAgg')
+#matplotlib.use('TkAgg')
+
+
+def plot_init(xlim, ylim, xlabel, ylabel, size=(4, 3), equal=False):
+    plt.figure(figsize=size)
+    if equal:
+        plt.gca().set_aspect(1)
+    plt.xlim(xlim)
+    plt.ylim(ylim)
+    plt.gca().set_xlabel(xlabel)
+    plt.gca().set_ylabel(ylabel)
+    plt.gca().set_rasterized(True)
+
+
+def plot_point(x, y, color, marker, ec=None):
+    plt.scatter(x, y, c=color, marker=marker, edgecolors=ec, linewidths=0.6)
+
+
+def plot_classification_samples(dataframe, classes, colors, markers, labels, loc, name, show=True, ec=None):
+    marks = [Line2D([0], [0], color=c, marker=m, lw="0") for c, m in zip(colors, markers)]
+
+    for cl, c, m in zip(classes, colors, markers):
+        df = dataframe[dataframe.target == cl]
+        plot_point(df["petal length"], df["petal width"], c, m, ec=ec)
+
+    plt.gca().legend(marks, labels, loc=loc)
+    plt.savefig("plot/{}.pdf".format(name), dpi=500, bbox_inches='tight')
+    if show:
+        plt.show()
+
+
+def plot_boundaries(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str],
+                    a: float = .5, h: str = '////////', ls='-', e=.05, fc='none', ec=None, reverse=False):
+    cubes = extractor._hypercubes.copy()
+    if reverse:
+        cubes.reverse()
+    for cube in cubes:
+        plt.gca().fill_between((cube[x][0] - e, cube[x][1] + e), cube[y][0] - e, cube[y][1] + e,
+                               fc=colors[cube.output] if fc is None else fc,
+                               ec=colors[cube.output] if ec is None else ec, alpha=a, hatch=h, linestyle=ls)
+
+
+def plot_surfaces(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str], ec='r', e=.05):
+    for cube in extractor._hypercubes:
+        plt.gca().fill_between((cube[x][0] - e, cube[x][1] + e), cube[y][0] - e, cube[y][1] + e,
+                               fc='none', ec=ec)
+
+
+def plot_perimeters(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str], n: int = 5,
+                    ec: str = 'r', m: str = '*', s: int = 60, z: float = 1e10, lw: float = 0.8):
+    for cube in extractor._hypercubes:
+        for corner in cube.perimeter_samples(n):
+            plt.scatter(corner[x], corner[y], c=colors[cube.output], marker=m, edgecolor=ec, s=s, zorder=z, linewidth=lw)
+
+
+def plot_centers(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str],
+                 ec: str = 'r', m: str = '*', s: int = 60, z: float = 1e10, lw: float = 0.8):
+    for cube in extractor._hypercubes:
+        center = cube.center
+        plt.scatter(center[x], center[y], c=colors[cube.output], marker=m, edgecolor=ec, s=s, zorder=z, linewidth=lw)
+
+
+def plot_corners(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str],
+                 ec: str = 'r', m: str = '*', s: int = 60, z: float = 1e10, lw: float = 0.8):
+    for cube in extractor._hypercubes:
+        for corner in cube.corners():
+            plt.scatter(corner[x], corner[y], c=colors[cube.output], marker=m, edgecolor=ec, s=s, zorder=z, linewidth=lw)
+
+
+def plot_barycenters(extractor: HyperCubeExtractor, x: str, y: str, colors: dict[str, str],
+                     ec: str = 'r', m: str = '*', s: int = 60, z: float = 1e10, lw: float = 0.8):
+    for cube in extractor._hypercubes:
+        center = cube.barycenter
+        plt.scatter(center[x], center[y], c=colors[cube.output], marker=m, edgecolor=ec, s=s, zorder=z, linewidth=lw)
 
 
 def predict_from_theory(theory: Theory, data: pd.DataFrame) -> list[float or str]:
@@ -95,6 +170,7 @@ def plot_theory(theory: Theory, data: pd.DataFrame = None, output: str = 'plot.p
         pass
     # ax.text2D(0., 0.88, pretty_theory(theory, new_line=False), transform=ax.transAxes, fontsize=8)
     if isinstance(ys[0], str):
-        custom_lines = [Line2D([0], [0], marker='o', markerfacecolor=get_color(c), …
+        custom_lines = [Line2D([0], [0], marker='o', markerfacecolor=get_color(c),
+                        markersize=20, color='w') for c in classes]
         ax.legend(custom_lines, classes, loc='upper left', numpoints=1, ncol=3, fontsize=18, bbox_to_anchor=(0, 0))
         plt.savefig(output, format='pdf')
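The new plotting helpers are thin wrappers over Matplotlib. A usage sketch under assumptions not stated in the diff (a trained hypercubic `extractor` whose cube outputs are class names, plus Iris-style feature names); it only chains the functions defined above.

import matplotlib.pyplot as plt
from psyke.utils.plot import plot_init, plot_point, plot_boundaries

# Set up axes, scatter two hand-made points, then overlay the extractor's cube boundaries.
plot_init(xlim=(0, 7), ylim=(0, 3), xlabel="petal length", ylabel="petal width")
plot_point([1.4, 5.1], [0.2, 1.8], color=["tab:blue", "tab:red"], marker="o")
plot_boundaries(extractor, "petal length", "petal width",
                colors={"setosa": "tab:blue", "virginica": "tab:red"})
plt.show()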