optimal-omt 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- optimal_omt-0.1.0/PKG-INFO +13 -0
- optimal_omt-0.1.0/README.md +0 -0
- optimal_omt-0.1.0/omt/DatabaseParser.py +128 -0
- optimal_omt-0.1.0/omt/Estimator.py +138 -0
- optimal_omt-0.1.0/omt/ORToolsClassifier.py +577 -0
- optimal_omt-0.1.0/omt/TreeStructure.py +226 -0
- optimal_omt-0.1.0/omt/__init__.py +3 -0
- optimal_omt-0.1.0/optimal_omt.egg-info/PKG-INFO +13 -0
- optimal_omt-0.1.0/optimal_omt.egg-info/SOURCES.txt +12 -0
- optimal_omt-0.1.0/optimal_omt.egg-info/dependency_links.txt +1 -0
- optimal_omt-0.1.0/optimal_omt.egg-info/requires.txt +8 -0
- optimal_omt-0.1.0/optimal_omt.egg-info/top_level.txt +1 -0
- optimal_omt-0.1.0/pyproject.toml +24 -0
- optimal_omt-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: optimal-omt
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Optimal Model Trees using OR-Tools
|
|
5
|
+
Requires-Python: >=3.9
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: numpy
|
|
8
|
+
Requires-Dist: pandas
|
|
9
|
+
Requires-Dist: scikit-learn
|
|
10
|
+
Requires-Dist: binarytree
|
|
11
|
+
Requires-Dist: ortools
|
|
12
|
+
Provides-Extra: dev
|
|
13
|
+
Requires-Dist: pytest; extra == "dev"
|
|
File without changes
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import arff as rf
|
|
2
|
+
from sklearn.preprocessing import StandardScaler
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from TreeStructure import Multiplier
|
|
5
|
+
from Binarizer import binarize_reduced, save_binarized_df
|
|
6
|
+
|
|
7
|
+
def DataParser(name, ProbType, one_hot=True, toInt=False, StdScale=False):
    """Load an ARFF dataset and return it as a cleaned, numeric DataFrame.

    The file is read from ``f'{ProbType}Problems/{name}'``.  The last ARFF
    attribute is taken to be the label column.

    Processing steps, in order:

    1. Columns with at most one distinct value are dropped.
    2. Feature attributes whose ARFF type descriptor is a string are treated
       as numeric: missing values are replaced with the column mean (rounded
       to 3 decimals) and the column is cast to float.  With ``toInt`` the
       column is additionally multiplied by ``Multiplier(column)``.
    3. All other feature attributes are treated as nominal: they are either
       replaced by the position of each value in the attribute's value list
       (``one_hot`` false) or one-hot encoded (``one_hot`` true; two-valued
       attributes keep a single dummy column).
    4. For ``ProbType == 'Classification'`` a two-valued label is mapped to
       ``-1`` (first declared value) and ``+1`` (second declared value).
       Labels with more values are left unchanged.
    5. Columns that ended up with exactly one distinct value are dropped.
    6. With ``StdScale`` the columns are standardised with ``StandardScaler``
       (for classification the label column is excluded from scaling; for any
       other ``ProbType`` every column, including the last one, is scaled).
    7. A column literally named ``'ID'`` is dropped if present.

    Parameters
    ----------
    name : str
        File name of the ARFF file inside the ``{ProbType}Problems`` folder.
    ProbType : str
        Problem family; used for the folder name and compared against
        ``'Classification'``.
    one_hot : bool
        One-hot encode nominal features instead of integer-coding them.
    toInt : bool
        Multiply numeric features by ``Multiplier(column)``.
    StdScale : bool
        Standardise the feature columns.

    Returns
    -------
    pandas.DataFrame
    """
    try:
        pd.set_option('future.no_silent_downcasting', True)
    except (KeyError, AttributeError):
        # pandas raises OptionError (a KeyError/AttributeError subclass) when
        # the installed version does not know this option; it is optional.
        pass

    # Context manager so the file handle is closed even if parsing fails.
    with open(f'{ProbType}Problems/{name}', 'rt') as arff_file:
        data = rf.load(arff_file)

    attributes = data['attributes']
    label_name = attributes[-1][0]

    df = pd.DataFrame(data['data'])
    df.columns = [attribute[0] for attribute in attributes]

    # ELIMINATING A COLUMN IF ALL THE VALUES IN IT ARE THE SAME (OR MISSING)
    eliminated_cols = [col for col in df.columns if df[col].nunique() <= 1]
    df.drop(columns=eliminated_cols, inplace=True)

    for attribute in attributes:
        attr_name, attr_type = attribute[0], attribute[1]
        if attr_name in eliminated_cols:
            continue

        if attr_name != label_name:
            # ---- features ----
            if isinstance(attr_type, str):
                # replace NaN with mean for numeric value features
                mean_value = round(df[attr_name].mean(), 3)
                df[attr_name] = df[attr_name].fillna(value=mean_value).astype(float)
                if toInt:
                    df.loc[:, attr_name] *= Multiplier(df[attr_name])
            elif not one_hot:
                # integer-code each nominal value by its declared position
                value_codes = {value: code for code, value in enumerate(attr_type)}
                df[attr_name] = df[attr_name].replace(value_codes).astype(float)
            elif len(attr_type) == 2:
                # a binary nominal feature needs only one dummy column
                df = pd.get_dummies(df, columns=[attr_name], drop_first=True, dtype=int)
            else:
                df = pd.get_dummies(df, columns=[attr_name], dtype=int)
        elif ProbType == 'Classification':
            # ---- label (classification only) ----
            if len(attr_type) == 2:
                label_codes = {attr_type[0]: -1, attr_type[1]: 1}
                df[attr_name] = df[attr_name].replace(label_codes).astype(float)

    # ELIMINATING A COLUMN IF ALL THE VALUES IN IT ARE THE SAME
    constant_cols = [col for col in df.columns if df[col].nunique() == 1]
    df = df.drop(columns=constant_cols)

    if StdScale:
        ##### STANDARD SCALING #######
        std_scaler = StandardScaler()
        is_classification = ProbType == 'Classification'
        if is_classification:
            features = list(df.columns.drop([label_name]))
            to_scale = df.drop(columns=label_name)
        else:
            features = list(df.columns)
            to_scale = df
        scaled = pd.DataFrame(
            std_scaler.fit_transform(to_scale.to_numpy()),
            columns=features,
        )
        if is_classification:
            # re-attach the untouched label as the last column
            scaled.insert(len(features), label_name, df[label_name], True)
        df = scaled

    id_name = 'ID'
    if id_name in df.columns:
        df.drop(columns=[id_name], inplace=True)

    return df
|
|
82
|
+
|
|
83
|
+
if __name__ == "__main__":

    #################### REGRESSION #####################
    # collection = os.listdir('RegressionProblems')
    #
    # for i in collection:
    #     df = regression_data_caller(i)
    #     print(df)
    #########################################

    #################### CLASSIFICATION #####################
    # Binary classification benchmark files that get parsed, binarized and
    # exported one after another.
    collection = [
        ############ BINARY ############
        'blogger.arff', 'boxing.arff', 'mux6.arff', 'corral.arff',
        'biomed.arff', 'ionosphere.arff', 'jEdit.arff', 'schizo.arff',
        'colic.arff', 'threeOf9.arff', 'R_data_frame.arff', 'australian.arff',
        'doa_bwin_balanced.arff', 'blood-transf.arff', 'autoUniv.arff',
        'parity.arff', 'banknote.arff', 'gametes_Epistasis.arff',
        'kr-vs-kp.arff', 'banana.arff',
    ]
    for i in collection:
        # Integer-coded nominal features, no integer scaling, no standardising.
        parser_options = dict(one_hot=False, toInt=False, StdScale=False)
        df = DataParser(i, 'Classification', **parser_options)
        print(i)
        print(df)

        # Binarize the parsed frame and show the result.
        df = binarize_reduced(df, max_thresholds=20)
        print(df)

        # Output file is named after the dataset (text before the first dot).
        stem = i.split(".")[0]
        save_binarized_df(df, f'DL85_Problems/{stem}.txt')
|
|
128
|
+
#########################################
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
from sklearn.base import BaseEstimator, ClassifierMixin
|
|
2
|
+
from .ORToolsClassifier import optimal_OMT
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
class OptimalModelTreeClassifier(BaseEstimator, ClassifierMixin):
    """scikit-learn style wrapper around ``optimal_OMT``.

    Parameters
    ----------
    splits : int
        Maximum number of branching nodes, forwarded as ``Splits``.
    C : float
        Weight of the classification-error term, forwarded as ``C``.
    timeout : float
        Solver time limit, forwarded as ``config["Timeout"]``.
    split_type : str
        Forwarded as ``config["SplitType"]``.
    model_tree, random_seed, super_sparse_integers, meta, ww, sum_to_zero,
    console_log
        Forwarded unchanged inside the ``config`` dictionary.

    Attributes (set by ``fit``)
    ---------------------------
    classes_ : numpy.ndarray
        Sorted unique labels seen in ``y``.
    tree_ / model_ : object
        The tree object returned by ``optimal_OMT``.
    runtime_ : float
        Runtime reported by ``optimal_OMT``.
    built_tree_ : object
        Result of ``tree_.build_tree(tree_.root.value)``; used for prediction.
    """

    def __init__(
            self,
            splits=1,
            C=1.0,
            timeout=60,
            split_type="Parallel",
            model_tree=True,
            random_seed=7,
            super_sparse_integers=True,
            meta=False,
            ww=False,
            sum_to_zero=False,
            console_log=True,
    ):
        self.splits = splits
        self.C = C
        self.timeout = timeout
        self.split_type = split_type
        self.model_tree = model_tree
        self.random_seed = random_seed
        self.super_sparse_integers = super_sparse_integers
        self.meta = meta
        self.ww = ww
        self.sum_to_zero = sum_to_zero
        self.console_log = console_log

    def fit(self, X, y):
        """Train the tree on ``X`` / ``y`` and return ``self``.

        Raises
        ------
        RuntimeError
            If ``optimal_OMT`` returns no tree.
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        # FORCE y INTO 1D
        y = np.asarray(y).ravel()
        self.classes_ = np.unique(y)

        # With exactly two classes the tree predicts +1 / -1, so arbitrary
        # labels (0/1, strings, ...) are encoded as -1 / +1 for training and
        # decoded again in predict().  Labels already in {-1, +1} map to
        # themselves.
        if len(self.classes_) == 2:
            y_solver = np.where(y == self.classes_[1], 1.0, -1.0)
        else:
            y_solver = y

        # Pick a label column name that cannot clobber a feature column.
        target_name = "target"
        while target_name in X.columns:
            target_name = "_" + target_name

        df = X.copy()
        df[target_name] = y_solver

        features = list(X.columns)
        labels = (target_name, np.unique(y_solver))

        config = {
            "RandomSeed": self.random_seed,
            "ProbType": "Classification",
            "ModelTree": self.model_tree,
            "SplitType": self.split_type,
            "Timeout": self.timeout,
            "Meta": self.meta,
            "WW": self.ww,
            "SuperSparseIntegers": self.super_sparse_integers,
            "SumToZero": self.sum_to_zero,
            "ConsoleLog": self.console_log,
            "df_name": "sklearn",
        }

        self.tree_, self.runtime_ = optimal_OMT(
            df=df,
            features=features,
            labels=labels,
            Splits=self.splits,
            C=self.C,
            config=config,
        )

        if self.tree_ is None:
            # Fail with a clear message instead of an AttributeError below.
            raise RuntimeError(
                "optimal_OMT did not return a tree (no solution was found)"
            )

        self.model_ = self.tree_
        self.built_tree_ = self.tree_.build_tree(self.tree_.root.value)

        return self

    def predict(self, X):
        """Return the predicted label for every row of ``X`` as an array."""
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        # One {column: value} mapping per row, keyed by the row index.
        X_dict = X.to_dict("index")

        preds = self.model_.predict_class(
            X_dict,
            self.built_tree_,
            None,
        )
        preds = np.asarray(preds)

        if len(self.classes_) == 2:
            # Undo the -1 / +1 encoding applied in fit().
            return np.where(preds > 0, self.classes_[1], self.classes_[0])
        return preds
|
|
90
|
+
|
|
91
|
+
# from sklearn.model_selection import train_test_split
|
|
92
|
+
# from sklearn.model_selection import GridSearchCV, cross_val_score
|
|
93
|
+
# from sklearn.metrics import accuracy_score
|
|
94
|
+
# from sklearn.utils import shuffle
|
|
95
|
+
# from DatabaseParser import DataParser
|
|
96
|
+
|
|
97
|
+
# if __name__ == "__main__":
|
|
98
|
+
# clf = OptimalModelTreeClassifier(
|
|
99
|
+
# splits=1,
|
|
100
|
+
# C=1
|
|
101
|
+
# )
|
|
102
|
+
#
|
|
103
|
+
# ProbType = 'Classification'
|
|
104
|
+
# file = 'blogger'
|
|
105
|
+
# df = DataParser(f'{file}.arff', ProbType, one_hot=True)
|
|
106
|
+
# df = shuffle(df,random_state=7)
|
|
107
|
+
#
|
|
108
|
+
# X = df.loc[ : , df.columns != 'class']
|
|
109
|
+
# y = df['class']
|
|
110
|
+
|
|
111
|
+
# data = load_df(as_frame=True)
|
|
112
|
+
# X = data.data
|
|
113
|
+
# y = data.target
|
|
114
|
+
|
|
115
|
+
# X_train, X_test, y_train, y_test = train_test_split(
|
|
116
|
+
# X,
|
|
117
|
+
# y,
|
|
118
|
+
# test_size=0.2,
|
|
119
|
+
# stratify=y,
|
|
120
|
+
# random_state=7
|
|
121
|
+
# )
|
|
122
|
+
#
|
|
123
|
+
# clf.fit(X_train,y_train)
|
|
124
|
+
# y_pred = clf.predict(X_test)
|
|
125
|
+
# acc = accuracy_score(y_test, y_pred)
|
|
126
|
+
# print("Test accuracy:", acc)
|
|
127
|
+
|
|
128
|
+
# cross_val_score(clf, X, y, cv=3)
|
|
129
|
+
# grid = GridSearchCV(
|
|
130
|
+
# OptimalModelTreeClassifier(),
|
|
131
|
+
# {"C": [0.1, 1, 10], "splits": [1, 2]},
|
|
132
|
+
# cv=3
|
|
133
|
+
# )
|
|
134
|
+
# print('Grid Search')
|
|
135
|
+
# grid.fit(X, y)
|
|
136
|
+
# y_pred = clf.predict(X_test)
|
|
137
|
+
# acc = accuracy_score(y_test, y_pred)
|
|
138
|
+
# print("Test accuracy:", acc)
|
|
@@ -0,0 +1,577 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from time import process_time as tm
|
|
3
|
+
from ortools.linear_solver import pywraplp
|
|
4
|
+
from binarytree import build
|
|
5
|
+
from .TreeStructure import OptimalTree,Parent
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def optimal_OMT(df, features, labels, Splits, C, config):
    """Build and solve the MIP for an optimal model tree with SVM leaves.

    The tree is a complete binary tree of depth ``ceil(log2(Splits + 1))``.
    Branch nodes choose one feature and a threshold; every non-empty leaf
    holds a linear classifier (one per class in the multi-class case).  The
    objective minimises the L1 norm of the leaf coefficients plus ``C`` times
    the total hinge slack.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data containing the feature columns and the label column.
        Feature columns must be castable to float.
    features : list
        Names of the feature columns.
    labels : tuple
        ``labels[0]`` is the name of the label column (other entries unused).
    Splits : int
        Maximum number of active branch nodes.
    C : float
        Weight of the slack term in the objective.
    config : dict
        Only ``config["Timeout"]`` (minutes) and ``config["SplitType"]``
        (``"Parallel"`` or ``"Oblique"``) are read here.

    Returns
    -------
    (OptimalTree or None, float)
        The fitted tree (``None`` when the solver produced no solution) and
        the CPU time spent in ``Solve()``.

    Raises
    ------
    RuntimeError
        If the SCIP backend cannot be created.

    Notes
    -----
    With exactly two classes the label values are used directly as the SVM
    sign, so they are assumed to be encoded as ``-1`` / ``+1``.
    """
    gamma = 1  # this is the margin of the SVMs

    df = df.reset_index(drop=True)
    I = df.index.values
    target = labels[0]

    classes = df[target].unique()
    binary_case = len(classes) == 2

    # Plain Python floats per column: avoids repeated df.loc scalar look-ups
    # and keeps numpy scalar types out of the solver expressions.
    X = {f: df[f].astype(float).tolist() for f in features}
    y_raw = df[target].tolist()

    # One-vs-rest sign of every sample for every class (multi-class case).
    LabelsPerClass = {
        c: {i: 1 if y_raw[i] == c else -1 for i in I}
        for c in classes
    }

    # mu[f]: smallest positive gap between two distinct values of feature f.
    # It must be taken over the SORTED distinct values; gaps between
    # neighbouring rows can be larger and would forbid valid thresholds.
    mu = {}
    for f in features:
        gaps = np.diff(np.unique(np.asarray(X[f], dtype=float)))
        gaps = gaps[gaps > 0]  # also discards NaN gaps
        mu[f] = float(gaps.min()) if gaps.size else None
    known_gaps = [gap for gap in mu.values() if gap is not None]
    mu_min = min(known_gaps) if known_gaps else 1.0
    # A constant feature has no gap; mu_min makes its margin term neutral.
    mu = {f: (mu_min if gap is None else gap) for f, gap in mu.items()}

    # depth of the tree DOES NOT include root level
    depth = int(np.ceil(np.log2(Splits + 1)))
    nodes = list(range(2 ** (depth + 1) - 1))
    binary_tree = build(nodes)
    root = binary_tree.levels[0][0]
    tree_nodes = list(root)
    all_values = binary_tree.values

    T_L = [leaf.value for leaf in binary_tree.leaves]  # leaf nodes
    T_B = [v for v in all_values if v not in T_L]      # branch nodes
    branch_set = set(T_B)

    # Values found in the left / right subtree of every node that has one.
    left_values = {
        j.value: set(j.left.values) for j in tree_nodes if j.left is not None
    }
    right_values = {
        j.value: set(j.right.values) for j in tree_nodes if j.right is not None
    }

    # A_l[i] / A_r[i]: ancestors whose left / right branch leads to node i.
    A_l = {
        i: [j.value for j in tree_nodes
            if j.value in left_values and i in left_values[j.value]]
        for i in all_values
    }
    A_r = {
        i: [j.value for j in tree_nodes
            if j.value in right_values and i in right_values[j.value]]
        for i in all_values
    }

    # D_l[t] / D_r[t]: leaves below the left / right child of branch node t.
    D_l = {
        j.value: [k.value for k in j.left.leaves]
        for j in tree_nodes if j.value in branch_set
    }
    D_r = {
        j.value: [k.value for k in j.right.leaves]
        for j in tree_nodes if j.value in branch_set
    }

    P = {i: Parent(root, i) for i in all_values}

    m = pywraplp.Solver.CreateSolver("SCIP")
    if m is None:
        raise RuntimeError("SCIP not available")

    INF = m.infinity()

    # config["Timeout"] is in minutes, SetTimeLimit expects milliseconds.
    m.SetTimeLimit(int(config["Timeout"] * 60 * 1000))

    # Big-M: sum of the largest absolute value of every feature plus the
    # largest absolute value of the LAST feature.
    # NOTE(review): the original code obtained that extra term through a
    # leftover loop variable; kept as-is, confirm the intended bound.
    max_abs = [max(abs(min(X[f])), abs(max(X[f]))) for f in features]
    M = float(sum(max_abs) + max_abs[-1])

    # ==================================================
    # VARIABLES
    # ==================================================

    # d[t] = 1 if branch node t applies a split
    d = {t: m.BoolVar(f"d[{t}]") for t in T_B}

    # Parallel split variables: a[f, t] = 1 if node t splits on feature f
    a = {
        (f, t): m.BoolVar(f"a[{f},{t}]")
        for f in features
        for t in T_B
    }

    # b[t]: threshold of branch node t
    b = {t: m.NumVar(-INF, INF, f"b[{t}]") for t in T_B}

    # z[i, l] = 1 if sample i ends in leaf l
    z = {
        (i, l): m.BoolVar(f"z[{i},{l}]")
        for i in I
        for l in T_L
    }

    # lvar[t] = 1 if leaf t contains at least one sample
    lvar = {t: m.BoolVar(f"l[{t}]") for t in T_L}

    # ==================================================
    # LEAF SVM
    # ==================================================

    if binary_case:
        Beta = {
            (f, t): m.NumVar(-INF, INF, f"Beta[{f},{t}]")
            for f in features
            for t in T_L
        }
        Bet_abs = {
            (f, t): m.NumVar(0, INF, f"Bet_abs[{f},{t}]")
            for f in features
            for t in T_L
        }
        Delta = {t: m.NumVar(-INF, INF, f"Delta[{t}]") for t in T_L}
        e = {
            (i, t): m.NumVar(0, INF, f"e[{i},{t}]")
            for i in I
            for t in T_L
        }
    else:
        Beta = {
            (c, f, t): m.NumVar(-INF, INF, f"Beta[{c},{f},{t}]")
            for c in classes
            for f in features
            for t in T_L
        }
        Bet_abs = {
            (c, f, t): m.NumVar(0, INF, f"Bet_abs[{c},{f},{t}]")
            for c in classes
            for f in features
            for t in T_L
        }
        Delta = {
            (c, t): m.NumVar(-INF, INF, f"Delta[{c},{t}]")
            for c in classes
            for t in T_L
        }
        e = {
            (c, i, t): m.NumVar(0, INF, f"e[{c},{i},{t}]")
            for c in classes
            for i in I
            for t in T_L
        }

    # ==================================================
    # SPLIT STRUCTURE
    # ==================================================

    # An active node uses exactly one feature, an inactive one none.
    for t in T_B:
        m.Add(sum(a[f, t] for f in features) == d[t])

    # A node may only split if its parent splits.
    for t in T_B:
        if t != root.value:
            m.Add(d[t] <= d[P[t]])

    # ==================================================
    # LEAF OCCUPANCY
    # ==================================================

    for t in T_L:
        for i in I:
            m.Add(z[i, t] <= lvar[t])

    for t in T_L:
        m.Add(sum(z[i, t] for i in I) >= lvar[t])

    # every sample lands in exactly one leaf
    for i in I:
        m.Add(sum(z[i, t] for t in T_L) == 1)

    # ==================================================
    # ROUTING CONSTRAINTS
    # Indicator -> Big-M
    # ==================================================

    for i in I:
        for leaf in T_L:
            # LEFT: strictly below the threshold (by at least mu)
            for t in A_l[leaf]:
                m.Add(
                    sum(
                        a[f, t] * (X[f][i] + mu[f] - mu_min)
                        for f in features
                    )
                    + mu_min
                    <= b[t] + M * (1 - z[i, leaf])
                )

            # RIGHT: at or above the threshold
            for t in A_r[leaf]:
                m.Add(
                    sum(a[f, t] * X[f][i] for f in features)
                    >= b[t] - M * (1 - z[i, leaf])
                )

    # ==================================================
    # ACTIVE SPLIT => NONEMPTY DESCENDANTS
    # ==================================================

    for t in T_B:
        m.Add(d[t] <= sum(lvar[k] for k in D_l[t]))
        m.Add(d[t] <= sum(lvar[k] for k in D_r[t]))

    # ==================================================
    # LEAF CLASSIFIERS
    # ==================================================

    if binary_case:
        # assumes labels encoded {-1,+1}
        for i in I:
            yi = float(y_raw[i])
            for t in T_L:
                svm_expr = (
                    sum(Beta[f, t] * X[f][i] for f in features)
                    + Delta[t]
                )
                m.Add(
                    gamma - e[i, t]
                    <= svm_expr * yi + M * (1 - z[i, t])
                )

        # abs(Beta)
        for f in features:
            for t in T_L:
                m.Add(Bet_abs[f, t] >= Beta[f, t])
                m.Add(Bet_abs[f, t] >= -Beta[f, t])
    else:
        for c in classes:
            for i in I:
                for t in T_L:
                    svm_expr = (
                        sum(Beta[c, f, t] * X[f][i] for f in features)
                        + Delta[c, t]
                    )
                    m.Add(
                        gamma - e[c, i, t]
                        <= svm_expr * LabelsPerClass[c][i]
                        + M * (1 - z[i, t])
                    )

        # abs(Beta)
        for c in classes:
            for f in features:
                for t in T_L:
                    m.Add(Bet_abs[c, f, t] >= Beta[c, f, t])
                    m.Add(Bet_abs[c, f, t] >= -Beta[c, f, t])

    # ==================================================
    # SPLIT BUDGET
    # ==================================================

    m.Add(sum(d[t] for t in T_B) <= Splits)

    # ==================================================
    # OBJECTIVE
    # ==================================================

    if binary_case:
        objective = (
            sum(Bet_abs[f, t] for f in features for t in T_L)
            + C * sum(e[i, t] for i in I for t in T_L)
        )
    else:
        objective = (
            sum(Bet_abs[c, f, t] for c in classes for f in features for t in T_L)
            + C * sum(e[c, i, t] for c in classes for i in I for t in T_L)
        )

    m.Minimize(objective)

    start = tm()
    status = m.Solve()
    runtime = tm() - start

    # Solution values are only defined when the solver reports a solution;
    # after a timeout without incumbent (NOT_SOLVED) or an error (ABNORMAL)
    # they must not be read.
    has_solution = status in (
        pywraplp.Solver.OPTIMAL,
        pywraplp.Solver.FEASIBLE,
    )

    if not has_solution:
        if status == pywraplp.Solver.INFEASIBLE:
            print('MODEL IS INFEASIBLE')
        else:
            print(f'NO SOLUTION AVAILABLE (solver status {status})')
        return None, runtime

    def is_on(var):
        # 0/1 variables are returned as floats; 0.5 tolerates rounding noise.
        return var.solution_value() > 0.5

    active_branches = [t for t in T_B if is_on(d[t])]
    used_leaves = [t for t in T_L if is_on(lvar[t])]

    splitting_nodes = {}
    if config["SplitType"] == "Parallel":
        splitting_nodes = {
            t: {
                'a': [f for f in features if is_on(a[f, t])][0],
                'b': round(b[t].solution_value(), 6),
            }
            for t in active_branches
        }
    elif config["SplitType"] == "Oblique":
        splitting_nodes = {
            t: {
                'a': {f: round(a[f, t].solution_value(), 6) for f in features},
                'b': round(b[t].solution_value(), 6),
            }
            for t in active_branches
        }

    if binary_case:
        non_empty_nodes = {
            t: {
                'Beta': {
                    f: round(Beta[f, t].solution_value(), 6)
                    for f in features
                },
                'Delta': round(Delta[t].solution_value(), 6),
            }
            for t in used_leaves
        }
    else:
        non_empty_nodes = {
            t: {
                c: {
                    'Beta': {
                        f: round(Beta[c, f, t].solution_value(), 6)
                        for f in features
                    },
                    'Delta': round(Delta[c, t].solution_value(), 6),
                }
                for c in classes
            }
            for t in used_leaves
        }

    ODT = OptimalTree(
        non_empty_nodes,
        splitting_nodes,
        depth,
        config["SplitType"],
        True,
        classes,
    )

    return ODT, runtime
|
|
507
|
+
|
|
508
|
+
# from sklearn.metrics import accuracy_score
|
|
509
|
+
# from sklearn.utils import shuffle
|
|
510
|
+
# from DatabaseParser import DataParser
|
|
511
|
+
# if __name__ == "__main__":
|
|
512
|
+
#
|
|
513
|
+
# ProbType = 'Classification'
|
|
514
|
+
# TestSize = 0.2
|
|
515
|
+
# file = 'blogger'
|
|
516
|
+
# Splits = 1
|
|
517
|
+
#
|
|
518
|
+
# config ={
|
|
519
|
+
# 'RandomSeed':7,
|
|
520
|
+
# 'SplitType': 'Parallel',
|
|
521
|
+
# 'label_name': 'class',
|
|
522
|
+
# 'Timeout': 60, # for the single iteration (IN MINUTES)
|
|
523
|
+
# 'ConsoleLog':False
|
|
524
|
+
# }
|
|
525
|
+
#
|
|
526
|
+
# df = DataParser(f'{file}.arff',ProbType, one_hot=True)
|
|
527
|
+
#
|
|
528
|
+
# df = shuffle(df,random_state=config['RandomSeed'])
|
|
529
|
+
#
|
|
530
|
+
# Test_df = df.iloc[:round(len(df) * TestSize)]
|
|
531
|
+
# Train_df = df.iloc[len(Test_df):]
|
|
532
|
+
#
|
|
533
|
+
# # ELIMIATING A COLUMN FROM ALL DATASETS IF ALL THE VALUES IN IT ARE THE SAME IN THE TRAIN SET
|
|
534
|
+
# for i in Train_df.columns:
|
|
535
|
+
# if Train_df[i].nunique() == 1:
|
|
536
|
+
# Train_df = Train_df.drop(columns=[i])
|
|
537
|
+
# Test_df = Test_df.drop(columns=[i])
|
|
538
|
+
#
|
|
539
|
+
# features = list(Train_df.columns.drop(['class']))
|
|
540
|
+
# labels = df['class'].unique()
|
|
541
|
+
# labels = ('class', labels)
|
|
542
|
+
#
|
|
543
|
+
# for C in [1]:#[0.1, 1, 10, 100]:
|
|
544
|
+
# ODT,runtime = optimal_OMT(
|
|
545
|
+
# df= Train_df,
|
|
546
|
+
# features= features,
|
|
547
|
+
# labels= labels,
|
|
548
|
+
# Splits= Splits,
|
|
549
|
+
# C= C,
|
|
550
|
+
# config=config
|
|
551
|
+
# )
|
|
552
|
+
#
|
|
553
|
+
# print('Runtime:',round(runtime,3),end=" ")
|
|
554
|
+
# print('C:',C,end=' ')
|
|
555
|
+
# the_tree = ODT.build_tree(ODT.root.value)
|
|
556
|
+
# # ODT.print_tree(the_tree)
|
|
557
|
+
#
|
|
558
|
+
# # split train into features and labels
|
|
559
|
+
# X_train = Train_df.drop(columns='class')
|
|
560
|
+
# X_train = X_train.to_dict('index')
|
|
561
|
+
# Y_train = Train_df['class']
|
|
562
|
+
#
|
|
563
|
+
# # split test set into features and labels
|
|
564
|
+
# X_test = Test_df.drop(columns='class')
|
|
565
|
+
# X_test = X_test.to_dict('index')
|
|
566
|
+
# Y_test = Test_df['class']
|
|
567
|
+
#
|
|
568
|
+
# # Predict the train set
|
|
569
|
+
# train_pred = ODT.predict_class(X_train, the_tree, None)
|
|
570
|
+
# print('Train:', round(accuracy_score(Y_train, train_pred) * 100, 2), '%',end=' ')
|
|
571
|
+
#
|
|
572
|
+
# # Predict the test set
|
|
573
|
+
# test_pred = ODT.predict_class(X_test, the_tree,None)
|
|
574
|
+
# print('Test:', round(accuracy_score(Y_test, test_pred)*100,2),'%')
|
|
575
|
+
|
|
576
|
+
|
|
577
|
+
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from binarytree import build
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Node:
    """One node of a fitted tree.

    A decision node carries ``feature``, ``threshold`` and the two children
    ``left`` / ``right``; a leaf node carries its payload in ``value``.
    ``name`` identifies the node in either case.
    """

    def __init__(self, name=None, feature=None, threshold=None, left=None, right=None, value=None):
        # identifier and decision-node fields
        self.name, self.feature, self.threshold = name, feature, threshold
        self.left, self.right = left, right
        # leaf-node payload
        self.value = value

    def __str__(self):
        """Return a short description: name, feature and threshold."""
        return 'Node {}, feature {}, threshold {}'.format(
            self.name, self.feature, self.threshold
        )
|
|
19
|
+
|
|
20
|
+
class OptimalTree():
    """Decision / model tree assembled from a solved optimisation model.

    Parameters
    ----------
    non_empty_nodes : dict
        Leaf id -> leaf payload.  For model trees the payload is
        ``{'Beta': {feature: coef}, 'Delta': intercept}``, or one such
        mapping per class when there are more than two classes.
    splitting_nodes : dict
        Branch id -> ``{'a': ..., 'b': threshold}``.  ``'a'`` is a feature
        name for ``'Parallel'`` splits and a ``{feature: weight}`` mapping
        for ``'Oblique'`` splits.
    depth : int
        Depth of the complete tree (root level not counted).
    SplitType : str
        ``'Parallel'`` or ``'Oblique'``.
    ModelTree : bool
        Whether leaves hold linear models (True) or plain values (False).
    classes : sequence, optional
        Class labels; defaults to an empty list.
    """

    def __init__(self, non_empty_nodes, splitting_nodes, depth, SplitType='Parallel', ModelTree=True, classes=None):
        # NOTE: the attribute keeps its historical spelling ("emtpy") so that
        # existing code reading it keeps working.
        self.non_emtpy_nodes = non_empty_nodes
        self.splitting_nodes = splitting_nodes
        self.depth = depth
        self.SplitType = SplitType
        self.ModelTree = ModelTree
        self.nodes = [i for i in range(2 ** (depth + 1) - 1)]
        # A fresh list per instance instead of a shared mutable default.
        self.classes = [] if classes is None else classes

        self.complete_tree = build(self.nodes)
        self.T_L = [i.value for i in self.complete_tree.leaves]  # leaf nodes
        self.T_B = [i for i in self.complete_tree.values if i not in self.T_L]
        self.root = self.complete_tree.levels[0][0]

    def build_tree(self, current_node):
        """Recursively build the ``Node`` structure rooted at ``current_node``.

        Raises
        ------
        ValueError
            If ``current_node`` neither splits, nor is a non-empty leaf, nor
            has a non-empty leaf below it.
        """
        if current_node in self.splitting_nodes:
            children = Children(self.root, current_node)
            left_subtree = self.build_tree(children[0].value)
            right_subtree = self.build_tree(children[1].value)
            return Node(
                current_node,
                self.splitting_nodes[current_node]['a'],
                self.splitting_nodes[current_node]['b'],
                left_subtree,
                right_subtree
            )

        if current_node in self.non_emtpy_nodes:
            return Node(
                current_node,
                value=self.non_emtpy_nodes[current_node]
            )

        # A non-splitting inner node: it acts as a leaf and takes over the
        # payload of the first non-empty leaf found below it.
        descendants = [
            i for i in self.non_emtpy_nodes
            if current_node in Ancestors(self.root, i)
        ]
        if len(descendants) > 0:
            return Node(
                current_node,
                value=self.non_emtpy_nodes[descendants[0]]
            )
        raise ValueError('THE TREE HAS TO MANY SPLITS')

    def print_tree(self, tree, indent=" "):
        """Print the tree rooted at ``tree``; ``None`` prints 'No Node'."""
        if tree is not None:
            print('Node', tree.name)
            if tree.value is not None:
                print(tree.value)
            else:
                print(f'{str(tree.feature)} < {tree.threshold} ')
                print(f'{indent}left:', end="")
                self.print_tree(tree.left, indent + indent)
                print('%sright:' % (indent), end="")
                self.print_tree(tree.right, indent + indent)
        else:
            print('No Node')

    def _find_leaf(self, x, tree):
        """Follow the splits from ``tree`` down to the leaf that ``x`` reaches.

        A node counts as a leaf as soon as its ``value`` is not ``None``.
        Samples strictly below the threshold go left, all others go right.
        """
        while tree.value is None:
            if self.SplitType == 'Parallel':
                score = x[tree.feature]
            elif self.SplitType == 'Oblique':
                score = sum([x[key] * value for key, value in tree.feature.items()])
            else:
                raise ValueError(f'Unknown SplitType: {self.SplitType!r}')
            tree = tree.left if score < tree.threshold else tree.right
        return tree

    @staticmethod
    def _linear_score(model, x, features):
        """Evaluate ``sum(Beta[f] * x[f]) + Delta`` over ``features``."""
        return sum([model['Beta'][f] * x[f] for f in features]) + model['Delta']

    def predict_regr(self, X, tree, f2=None):
        '''Predict a regression value for every sample of the mapping ``X``.'''
        predictions = [self.make_regression(x, tree, f2) for x in X.values()]
        return predictions

    def make_regression(self, x, tree, f2=None):
        '''Predict the regression value of a single sample ``x``.

        ``f2`` optionally restricts the features used in the leaf model; by
        default every key of ``x`` is used.
        '''
        features = f2 if f2 is not None else x
        leaf = self._find_leaf(x, tree)
        if self.ModelTree:
            return self._linear_score(leaf.value, x, features)
        return leaf.value

    def predict_class(self, X, tree, f2=None):
        '''Predict a class for every sample of the mapping ``X``.'''
        predictions = [self.make_classification(x, tree, f2) for x in X.values()]
        return predictions

    def make_classification(self, x, tree, f2=None):
        '''Predict the class of a single sample ``x``.

        With more than two classes the class with the highest leaf score is
        returned; otherwise the result is ``1`` for a positive score and
        ``-1`` for anything else.  ``f2`` optionally restricts the features
        used in the leaf model; by default every key of ``x`` is used.
        '''
        features = f2 if f2 is not None else x
        leaf = self._find_leaf(x, tree)

        if not self.ModelTree:
            return leaf.value

        if len(self.classes) > 2:
            scores = {
                c: self._linear_score(leaf.value[c], x, features)
                for c in self.classes
            }
            return max(scores, key=scores.get)

        return 1 if self._linear_score(leaf.value, x, features) > 0 else -1
|
|
137
|
+
|
|
138
|
+
# Some additional functions needed to deal with binary trees
|
|
139
|
+
|
|
140
|
+
def Ancestors(root, target):
    """Return the values of all ancestors of the node holding ``target``.

    The tree is searched depth-first, left subtree before right subtree.
    Ancestor values are collected while the recursion unwinds, so the list
    starts with the direct parent and ends with the root.

    Args:
        root: Root node of a binary tree (``value``, ``left``, ``right``),
            or ``None`` for an empty tree.
        target: Value to look for (compared with ``==``).

    Returns:
        list: Ancestor values ordered from nearest to farthest. Empty when
        ``target`` is the root's value or is not present in the tree.
    """
    ancestors = []

    def _on_path(node):
        """Report whether ``target`` lives in the subtree rooted at ``node``."""
        # Identity check: an empty child is literally None.
        if node is None:
            return False
        if node.value == target:
            return True
        # `or` short-circuits, so the right subtree is only explored when
        # the left one does not contain the target.
        if _on_path(node.left) or _on_path(node.right):
            ancestors.append(node.value)
            return True
        return False

    _on_path(root)
    return ancestors
|
|
164
|
+
|
|
165
|
+
def Parent(node, val):
    """Return the value of the parent of the first node holding ``val``.

    Nodes are visited in pre-order (node, left subtree, right subtree) and
    the search stops at the first match.

    Args:
        node: Root of the binary tree to search (``value``, ``left``,
            ``right``).
        val: Value to look for (compared with ``==``).

    Returns:
        The ``value`` of the matching node's parent, or ``None`` when the
        match is ``node`` itself (the root has no parent).

    Raises:
        IndexError: If no node in the tree holds ``val``.
    """
    # Explicit stack of (node, parent value); the left child is pushed last
    # so it is popped first, which yields a pre-order walk.
    pending = [(node, None)]
    while pending:
        current, parent_value = pending.pop()
        if current is None:
            continue
        if current.value == val:
            return parent_value
        pending.append((current.right, current.value))
        pending.append((current.left, current.value))

    # Same exception type the unchecked `the_parent[0]` lookup used to raise,
    # now with a message that explains what went wrong.
    raise IndexError(f"value {val!r} not found in tree")
|
|
184
|
+
|
|
185
|
+
def Children(node, val):
    """Return the ``(left, right)`` children of the node holding ``val``.

    The tree is walked in pre-order without descending below a matching
    node. If several nodes hold ``val``, the children of the match visited
    last are returned (later matches overwrite earlier ones, as before).

    Args:
        node: Root of the binary tree to search (``value``, ``left``,
            ``right``).
        val: Value to look for (compared with ``==``).

    Returns:
        tuple: ``(left, right)`` child nodes of the match; each entry is
        ``None`` when that child is absent.

    Raises:
        KeyError: If no node in the tree holds ``val``.
    """
    match = None
    # Left child pushed last so it is popped first (pre-order).
    pending = [node]
    while pending:
        current = pending.pop()
        if current is None:
            continue
        if current.value == val:
            # Remember the match and do not look underneath it.
            match = current
            continue
        pending.append(current.right)
        pending.append(current.left)

    if match is None:
        # Same exception type as the former `children['left']` lookup on an
        # empty dict, with an explanatory message.
        raise KeyError(f"value {val!r} not found in tree")
    return match.left, match.right
|
|
203
|
+
|
|
204
|
+
def RAE(Y_labels, Y_predicted):
    """Relative absolute error of ``Y_predicted`` against ``Y_labels``.

    Total absolute prediction error divided by the total absolute deviation
    of the labels from their mean, rounded to two decimals.

    Args:
        Y_labels: Observed target values.
        Y_predicted: Predicted values, paired positionally with the labels.

    Returns:
        The ratio rounded to 2 decimal places.
    """
    baseline = np.average(Y_labels)
    abs_error = sum(
        abs(predicted - actual)
        for predicted, actual in zip(Y_predicted, Y_labels)
    )
    abs_spread = sum(abs(baseline - actual) for actual in Y_labels)
    ratio = abs_error / abs_spread
    return round(ratio, 2)
|
|
209
|
+
|
|
210
|
+
def RRSE(Y_labels, Y_predicted):
    """Root relative squared error of ``Y_predicted`` against ``Y_labels``.

    Square root of the summed squared prediction errors divided by the
    summed squared deviations of the labels from their mean, rounded to two
    decimals.

    Args:
        Y_labels: Observed target values.
        Y_predicted: Predicted values, paired positionally with the labels.

    Returns:
        The score rounded to 2 decimal places.
    """
    baseline = np.average(Y_labels)
    squared_error = sum(
        (predicted - actual) ** 2
        for predicted, actual in zip(Y_predicted, Y_labels)
    )
    squared_spread = sum((baseline - actual) ** 2 for actual in Y_labels)
    score = np.sqrt(squared_error / squared_spread)
    return round(score, 2)
|
|
215
|
+
|
|
216
|
+
def Multiplier(vector):
    """Return the power of ten needed to scale every entry to an integer.

    For each entry the number of digits after the decimal point of its
    ``str()`` form is determined; the result is ``10 ** d`` where ``d`` is
    the largest such count.

    Compared with plain text splitting on ``'.'`` this also handles
    scientific notation (``str(1e-05) == '1e-05'`` needs 5 decimals,
    ``'1.5e-07'`` needs 8) and an empty ``vector``.

    Args:
        vector: Iterable of numbers (anything with a numeric ``str()``).

    Returns:
        int: ``10 ** d``; ``1`` when no entry has a fractional part or the
        input is empty.
    """
    # Function-scope import keeps the block self-contained.
    from decimal import Decimal, InvalidOperation

    def _decimal_places(entry):
        """Count fractional digits in the textual form of ``entry``."""
        text = str(entry)
        try:
            exponent = Decimal(text).as_tuple().exponent
        except InvalidOperation:
            # Not parseable as a number: fall back to the textual rule.
            return len(text.split('.')[1]) if '.' in text else 0
        # NaN / Infinity report a letter code instead of an integer exponent.
        if not isinstance(exponent, int):
            return 0
        # A negative exponent is the number of digits after the point.
        return max(0, -exponent)

    return 10 ** max((_decimal_places(entry) for entry in vector), default=0)
|
|
224
|
+
|
|
225
|
+
if __name__ == "__main__":
    # Manual smoke check: the longest fractional part below has four digits.
    sample_values = [2.01, 3.0101]
    print(Multiplier(sample_values))
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: optimal-omt
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Optimal Model Trees using OR-Tools
|
|
5
|
+
Requires-Python: >=3.9
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: numpy
|
|
8
|
+
Requires-Dist: pandas
|
|
9
|
+
Requires-Dist: scikit-learn
|
|
10
|
+
Requires-Dist: binarytree
|
|
11
|
+
Requires-Dist: ortools
|
|
12
|
+
Provides-Extra: dev
|
|
13
|
+
Requires-Dist: pytest; extra == "dev"
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
omt/DatabaseParser.py
|
|
4
|
+
omt/Estimator.py
|
|
5
|
+
omt/ORToolsClassifier.py
|
|
6
|
+
omt/TreeStructure.py
|
|
7
|
+
omt/__init__.py
|
|
8
|
+
optimal_omt.egg-info/PKG-INFO
|
|
9
|
+
optimal_omt.egg-info/SOURCES.txt
|
|
10
|
+
optimal_omt.egg-info/dependency_links.txt
|
|
11
|
+
optimal_omt.egg-info/requires.txt
|
|
12
|
+
optimal_omt.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
omt
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "optimal-omt"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Optimal Model Trees using OR-Tools"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
|
|
12
|
+
dependencies = [
|
|
13
|
+
"numpy",
|
|
14
|
+
"pandas",
|
|
15
|
+
"scikit-learn",
|
|
16
|
+
"binarytree",
|
|
17
|
+
"ortools"
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
[project.optional-dependencies]
|
|
21
|
+
dev = ["pytest"]
|
|
22
|
+
|
|
23
|
+
[tool.setuptools]
|
|
24
|
+
packages = ["omt"]
|