optimal-omt 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ Metadata-Version: 2.4
2
+ Name: optimal-omt
3
+ Version: 0.1.0
4
+ Summary: Optimal Model Trees using OR-Tools
5
+ Requires-Python: >=3.9
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: numpy
8
+ Requires-Dist: pandas
9
+ Requires-Dist: scikit-learn
10
+ Requires-Dist: binarytree
11
+ Requires-Dist: ortools
12
+ Provides-Extra: dev
13
+ Requires-Dist: pytest; extra == "dev"
File without changes
@@ -0,0 +1,128 @@
1
+ import arff as rf
2
+ from sklearn.preprocessing import StandardScaler
3
+ import pandas as pd
4
+ from TreeStructure import Multiplier
5
+ from Binarizer import binarize_reduced, save_binarized_df
6
+
7
def DataParser(name, ProbType, one_hot = True,toInt = False,StdScale=False):
    """Load ``{ProbType}Problems/{name}`` (ARFF) and return it as a DataFrame.

    Processing steps, in order:

    1. Columns with at most one distinct value are dropped.
    2. Attributes whose ARFF type is a string (treated here as numeric) get
       their missing values replaced by the column mean (rounded to 3
       decimals) and are cast to float; with ``toInt`` they are additionally
       multiplied by ``Multiplier(column)``.
    3. Every other feature attribute (a list of nominal values) is either
       replaced by the position of each value in that list
       (``one_hot=False``) or expanded with ``pd.get_dummies``; two-valued
       attributes keep a single dummy column.
    4. For ``ProbType == 'Classification'`` a two-valued label is mapped to
       -1 (first declared value) and +1 (second declared value). Labels with
       any other number of values are left untouched.
    5. Columns that ended up with exactly one distinct value are dropped.
    6. With ``StdScale`` the columns are standardised with ``StandardScaler``;
       the label column is excluded only for classification problems, so for
       any other ``ProbType`` every column (including the target) is scaled.
    7. A column named ``'ID'`` is dropped if present.

    Args:
        name: File name of the dataset inside the ``{ProbType}Problems`` folder.
        ProbType: Problem family; also the prefix of the data folder.
        one_hot: One-hot encode nominal features instead of index-encoding them.
        toInt: Scale numeric features by ``Multiplier`` (see step 2).
        StdScale: Standardise the columns (see step 6).

    Returns:
        The processed ``pandas.DataFrame``.
    """
    try:
        pd.set_option('future.no_silent_downcasting', True)
    except (KeyError, AttributeError):
        # The option is unknown to this pandas version; nothing to configure.
        pass

    # Context manager so the file handle is released even if parsing fails.
    with open(f'{ProbType}Problems/{name}', 'rt') as arff_file:
        data = rf.load(arff_file)

    # The last declared attribute is taken as the label.
    label_name = data['attributes'][-1][0]

    df = pd.DataFrame(data['data'])
    df.columns = [attribute[0] for attribute in data['attributes']]

    # Eliminate columns whose values are all the same (or all missing).
    eliminated_cols = [col for col in df.columns if df[col].nunique() <= 1]
    df.drop(columns=eliminated_cols, inplace=True)

    for attribute in data['attributes']:
        attr_name, attr_type = attribute[0], attribute[1]
        if attr_name in eliminated_cols:
            continue

        if attr_name != label_name:
            if isinstance(attr_type, str):
                # Numeric feature: replace NaN with the column mean.
                mean_value = round(df[attr_name].mean(), 3)
                df[attr_name] = df[attr_name].fillna(value=mean_value).astype(float)
                if toInt:
                    df.loc[:, attr_name] *= Multiplier(df[attr_name])
            elif not one_hot:
                # Nominal feature -> position of the value in the declared list.
                value_codes = {level: code for code, level in enumerate(attr_type)}
                df[attr_name] = df[attr_name].replace(value_codes).astype(float)
            elif len(attr_type) == 2:
                # A two-valued feature only needs one indicator column.
                df = pd.get_dummies(df, columns=[attr_name], drop_first=True, dtype=int)
            else:
                df = pd.get_dummies(df, columns=[attr_name], dtype=int)
        elif ProbType == 'Classification':  # TODO: for classification scale target
            if len(attr_type) == 2:
                label_codes = {attr_type[0]: -1,
                               attr_type[1]: 1}
                df[attr_name] = df[attr_name].replace(label_codes).astype(float)
            # Labels with another number of values keep their original values.

    # Eliminate columns that became constant after the encoding above.
    constant_cols = [col for col in df.columns if df[col].nunique() == 1]
    df = df.drop(columns=constant_cols)

    if StdScale:
        std_scaler = StandardScaler()
        if ProbType == 'Classification':
            features = list(df.columns.drop([label_name]))
            to_scale = df.drop(columns=label_name)
        else:
            features = list(df.columns)
            to_scale = df
        scaled = pd.DataFrame(std_scaler.fit_transform(to_scale.to_numpy()),
                              columns=features)
        if ProbType == 'Classification':
            # Put the (unscaled) label back as the last column.
            scaled.insert(len(features), label_name, df[label_name], True)
        df = scaled

    id_name = 'ID'
    if id_name in df.columns:
        df.drop(columns=[id_name], inplace=True)

    return df
82
+
83
if __name__ == "__main__":

    #################### REGRESSION #####################
    # collection = os.listdir('RegressionProblems')
    #
    # for i in collection:
    #     df = regression_data_caller(i)
    #     print(df)
    #########################################

    #################### CLASSIFICATION #####################
    # Binary classification datasets that get parsed, binarized and exported.
    collection = [
        'blogger.arff', 'boxing.arff', 'mux6.arff', 'corral.arff',
        'biomed.arff', 'ionosphere.arff', 'jEdit.arff', 'schizo.arff',
        'colic.arff', 'threeOf9.arff', 'R_data_frame.arff', 'australian.arff',
        'doa_bwin_balanced.arff', 'blood-transf.arff', 'autoUniv.arff',
        'parity.arff', 'banknote.arff', 'gametes_Epistasis.arff',
        'kr-vs-kp.arff', 'banana.arff',
    ]
    parser_options = dict(one_hot=False, toInt=False, StdScale=False)

    for dataset in collection:
        parsed = DataParser(dataset, 'Classification', **parser_options)
        print(dataset)
        print(parsed)

        binarized = binarize_reduced(parsed, max_thresholds=20)
        print(binarized)

        # Output file is named after the part of the file name before the first dot.
        stem = dataset.split(".")[0]
        save_binarized_df(binarized, f'DL85_Problems/{stem}.txt')
    #########################################
@@ -0,0 +1,138 @@
1
+ from sklearn.base import BaseEstimator, ClassifierMixin
2
+ from .ORToolsClassifier import optimal_OMT
3
+ import pandas as pd
4
+ import numpy as np
5
+
6
class OptimalModelTreeClassifier(BaseEstimator, ClassifierMixin):
    """scikit-learn style wrapper around ``optimal_OMT``.

    Parameters are stored unchanged on the instance and forwarded to the
    solver on ``fit``: ``splits`` and ``C`` are passed directly, the others go
    into the ``config`` dictionary (``timeout`` under ``"Timeout"``,
    ``split_type`` under ``"SplitType"``, and so on).

    Attributes set by ``fit``:
        classes_: Sorted unique values of ``y``.
        feature_names_: Column names of the training data.
        tree_ / model_: Tree object returned by ``optimal_OMT``.
        runtime_: Solve time returned by ``optimal_OMT``.
        built_tree_: Root of the tree built from ``tree_``.
    """

    def __init__(
        self,
        splits=1,
        C=1.0,
        timeout=60,
        split_type="Parallel",
        model_tree=True,
        random_seed=7,
        super_sparse_integers=True,
        meta=False,
        ww=False,
        sum_to_zero=False,
        console_log=True,
    ):
        self.splits = splits
        self.C = C
        self.timeout = timeout
        self.split_type = split_type
        self.model_tree = model_tree
        self.random_seed = random_seed
        self.super_sparse_integers = super_sparse_integers
        self.meta = meta
        self.ww = ww
        self.sum_to_zero = sum_to_zero
        self.console_log = console_log

    def fit(self, X, y):
        """Fit the tree on ``X`` / ``y`` and return ``self``.

        Raises:
            RuntimeError: If the solver returned no tree.
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        # Force y into 1D.
        y = np.asarray(y).ravel()

        self.classes_ = np.unique(y)
        self.feature_names_ = list(X.columns)

        # The binary formulation multiplies the raw label into the margin
        # constraint and the tree predicts -1/+1, so two-class targets are
        # encoded as -1 (classes_[0]) / +1 (classes_[1]) here and decoded
        # again in ``predict``. Targets already in {-1, +1} are unchanged.
        if len(self.classes_) == 2:
            target = np.where(y == self.classes_[1], 1, -1)
        else:
            target = y

        df = X.copy()
        df["target"] = target

        labels = ("target", np.unique(target))

        config = {
            "RandomSeed": self.random_seed,
            "ProbType": "Classification",
            "ModelTree": self.model_tree,
            "SplitType": self.split_type,
            "Timeout": self.timeout,
            "Meta": self.meta,
            "WW": self.ww,
            "SuperSparseIntegers": self.super_sparse_integers,
            "SumToZero": self.sum_to_zero,
            "ConsoleLog": self.console_log,
            "df_name": "sklearn",
        }

        self.tree_, self.runtime_ = optimal_OMT(
            df=df,
            features=self.feature_names_,
            labels=labels,
            Splits=self.splits,
            C=self.C,
            config=config,
        )

        if self.tree_ is None:
            # optimal_OMT returns None as tree when it has no solution.
            raise RuntimeError(
                "optimal_OMT did not return a tree; the model cannot be fitted"
            )

        self.model_ = self.tree_
        self.built_tree_ = self.tree_.build_tree(self.tree_.root.value)

        return self

    def predict(self, X):
        """Return the predicted class for every row of ``X`` as an array."""
        if not isinstance(X, pd.DataFrame):
            # Re-use the training column names so the rows can be addressed
            # by the same keys the fitted leaf models use.
            X = pd.DataFrame(X, columns=self.feature_names_)

        X_dict = X.to_dict("index")

        preds = np.asarray(
            self.model_.predict_class(
                X_dict,
                self.built_tree_,
                # Restrict the leaf models to the fitted features so extra
                # columns in X do not break the prediction.
                self.feature_names_,
            )
        )

        if len(self.classes_) == 2:
            # Decode the internal -1/+1 labels back to the original classes.
            preds = np.where(preds == 1, self.classes_[1], self.classes_[0])

        return preds
90
+
91
+ # from sklearn.model_selection import train_test_split
92
+ # from sklearn.model_selection import GridSearchCV, cross_val_score
93
+ # from sklearn.metrics import accuracy_score
94
+ # from sklearn.utils import shuffle
95
+ # from DatabaseParser import DataParser
96
+
97
+ # if __name__ == "__main__":
98
+ # clf = OptimalModelTreeClassifier(
99
+ # splits=1,
100
+ # C=1
101
+ # )
102
+ #
103
+ # ProbType = 'Classification'
104
+ # file = 'blogger'
105
+ # df = DataParser(f'{file}.arff', ProbType, one_hot=True)
106
+ # df = shuffle(df,random_state=7)
107
+ #
108
+ # X = df.loc[ : , df.columns != 'class']
109
+ # y = df['class']
110
+
111
+ # data = load_df(as_frame=True)
112
+ # X = data.data
113
+ # y = data.target
114
+
115
+ # X_train, X_test, y_train, y_test = train_test_split(
116
+ # X,
117
+ # y,
118
+ # test_size=0.2,
119
+ # stratify=y,
120
+ # random_state=7
121
+ # )
122
+ #
123
+ # clf.fit(X_train,y_train)
124
+ # y_pred = clf.predict(X_test)
125
+ # acc = accuracy_score(y_test, y_pred)
126
+ # print("Test accuracy:", acc)
127
+
128
+ # cross_val_score(clf, X, y, cv=3)
129
+ # grid = GridSearchCV(
130
+ # OptimalModelTreeClassifier(),
131
+ # {"C": [0.1, 1, 10], "splits": [1, 2]},
132
+ # cv=3
133
+ # )
134
+ # print('Grid Search')
135
+ # grid.fit(X, y)
136
+ # y_pred = clf.predict(X_test)
137
+ # acc = accuracy_score(y_test, y_pred)
138
+ # print("Test accuracy:", acc)
@@ -0,0 +1,577 @@
1
+ import numpy as np
2
+ from time import process_time as tm
3
+ from ortools.linear_solver import pywraplp
4
+ from binarytree import build
5
+ from .TreeStructure import OptimalTree,Parent
6
+
7
+
8
def optimal_OMT(df, features, labels, Splits, C, config):
    """Build and solve the mixed-integer model of an optimal model tree (SCIP).

    Every active branch node selects exactly one feature and a threshold;
    every sample is assigned to exactly one leaf; every leaf carries a linear
    soft-margin classifier whose absolute coefficients plus ``C`` times the
    margin slacks are minimised.

    Args:
        df: Training data containing the feature columns and the target column.
        features: Names of the numeric feature columns of ``df``.
        labels: Sequence whose first item is the name of the target column.
        Splits: Maximum number of active branch nodes; also fixes the depth
            ``ceil(log2(Splits + 1))`` of the underlying complete tree.
        C: Weight of the slack term in the objective.
        config: Mapping providing ``"Timeout"`` (in minutes) and
            ``"SplitType"`` (``"Parallel"`` or ``"Oblique"``; it only selects
            how the split of each active node is reported).

    Returns:
        ``(tree, runtime)`` where ``tree`` is an ``OptimalTree`` (or ``None``
        when the solver ends without a usable solution) and ``runtime`` is the
        process time spent in ``Solve``.

    Raises:
        ValueError: If ``features`` is empty or a feature has fewer than two
            distinct values.
        RuntimeError: If the SCIP backend is not available.
    """
    if len(features) == 0:
        raise ValueError("features must contain at least one column name")

    gamma = 1  # this is the margin of the SVMs

    df = df.reset_index(drop=True)
    I = df.index.values
    target = labels[0]
    classes = df[target].unique()
    binary_case = len(classes) == 2

    # Feature values as plain floats, looked up once instead of through
    # ``df.loc`` inside every constraint.
    x = {(i, f): float(df.at[i, f]) for i in I for f in features}

    # mu[f]: smallest gap between two distinct values of feature f. It is
    # taken over the SORTED distinct values; consecutive rows of unsorted data
    # can overestimate it and thereby forbid valid thresholds.
    mu = {}
    for f in features:
        distinct = np.unique(df[f].to_numpy(dtype=float))
        if distinct.size < 2:
            raise ValueError(
                f"feature {f!r} needs at least two distinct values"
            )
        mu[f] = float(np.diff(distinct).min())
    mu_min = min(mu.values())

    # ------------------------------------------------------------------
    # Tree topology (depth does NOT include the root level)
    # ------------------------------------------------------------------
    depth = int(np.ceil(np.log2(Splits + 1)))
    binary_tree = build(list(range(2 ** (depth + 1) - 1)))
    root = binary_tree.levels[0][0]
    tree_nodes = list(root)
    node_of = {node.value: node for node in tree_nodes}

    T_L = [leaf.value for leaf in binary_tree.leaves]              # leaf nodes
    T_B = [v for v in binary_tree.values if v not in T_L]          # branch nodes

    # Ancestors whose left / right subtree contains node i.
    A_l = {
        i: [j.value for j in tree_nodes if j.left is not None and i in j.left.values]
        for i in binary_tree.values
    }
    A_r = {
        i: [j.value for j in tree_nodes if j.right is not None and i in j.right.values]
        for i in binary_tree.values
    }
    # Leaves below the left / right child of every branch node.
    D_l = {t: [k.value for k in node_of[t].left.leaves] for t in T_B}
    D_r = {t: [k.value for k in node_of[t].right.leaves] for t in T_B}

    P = {i: Parent(root, i) for i in binary_tree.values}

    # ------------------------------------------------------------------
    # Solver
    # ------------------------------------------------------------------
    m = pywraplp.Solver.CreateSolver("SCIP")
    if m is None:
        raise RuntimeError("SCIP not available")

    INF = m.infinity()
    m.SetTimeLimit(int(config["Timeout"] * 60 * 1000))  # minutes -> ms

    # Big-M: sum of the largest absolute value of every feature plus the
    # largest absolute value of the LAST feature.
    # NOTE(review): the extra term reproduces the original computation, which
    # picked it up from a leftover loop variable; confirm the intended bound.
    # NOTE(review): the same M relaxes the leaf margin constraints although
    # the coefficients there are unbounded - confirm it is large enough.
    abs_bound = {f: max(abs(df[f].min()), abs(df[f].max())) for f in features}
    M = float(sum(abs_bound.values()) + abs_bound[features[-1]])

    # ------------------------------------------------------------------
    # Variables
    # ------------------------------------------------------------------
    d = {t: m.BoolVar(f"d[{t}]") for t in T_B}                      # node splits
    a = {(f, t): m.BoolVar(f"a[{f},{t}]") for f in features for t in T_B}
    b = {t: m.NumVar(-INF, INF, f"b[{t}]") for t in T_B}            # thresholds
    z = {(i, l): m.BoolVar(f"z[{i},{l}]") for i in I for l in T_L}  # sample -> leaf
    lvar = {t: m.BoolVar(f"l[{t}]") for t in T_L}                   # leaf in use

    if binary_case:
        Beta = {(f, t): m.NumVar(-INF, INF, f"Beta[{f},{t}]")
                for f in features for t in T_L}
        Bet_abs = {(f, t): m.NumVar(0, INF, f"Bet_abs[{f},{t}]")
                   for f in features for t in T_L}
        Delta = {t: m.NumVar(-INF, INF, f"Delta[{t}]") for t in T_L}
        e = {(i, t): m.NumVar(0, INF, f"e[{i},{t}]") for i in I for t in T_L}
    else:
        Beta = {(c, f, t): m.NumVar(-INF, INF, f"Beta[{c},{f},{t}]")
                for c in classes for f in features for t in T_L}
        Bet_abs = {(c, f, t): m.NumVar(0, INF, f"Bet_abs[{c},{f},{t}]")
                   for c in classes for f in features for t in T_L}
        Delta = {(c, t): m.NumVar(-INF, INF, f"Delta[{c},{t}]")
                 for c in classes for t in T_L}
        e = {(c, i, t): m.NumVar(0, INF, f"e[{c},{i},{t}]")
             for c in classes for i in I for t in T_L}

    # ------------------------------------------------------------------
    # Split structure: an active node picks exactly one feature, and a node
    # may only split if its parent splits.
    # ------------------------------------------------------------------
    for t in T_B:
        m.Add(sum(a[f, t] for f in features) == d[t])
    for t in T_B:
        if t != root.value:
            m.Add(d[t] <= d[P[t]])

    # ------------------------------------------------------------------
    # Leaf occupancy: a leaf is "on" iff it holds at least one sample, and
    # every sample sits in exactly one leaf.
    # ------------------------------------------------------------------
    for t in T_L:
        for i in I:
            m.Add(z[i, t] <= lvar[t])
        m.Add(sum(z[i, t] for i in I) >= lvar[t])
    for i in I:
        m.Add(sum(z[i, t] for t in T_L) == 1)

    # ------------------------------------------------------------------
    # Routing constraints (indicator -> Big-M)
    # ------------------------------------------------------------------
    for i in I:
        for leaf in T_L:
            slack = M * (1 - z[i, leaf])
            # LEFT: strictly below the threshold, using the feature-wise gap.
            for t in A_l[leaf]:
                m.Add(
                    sum(a[f, t] * (x[i, f] + mu[f] - mu_min) for f in features)
                    + mu_min
                    <= b[t] + slack
                )
            # RIGHT: at or above the threshold.
            for t in A_r[leaf]:
                m.Add(
                    sum(a[f, t] * x[i, f] for f in features)
                    >= b[t] - slack
                )

    # ------------------------------------------------------------------
    # Active split => non-empty descendants on both sides
    # ------------------------------------------------------------------
    for t in T_B:
        m.Add(d[t] <= sum(lvar[k] for k in D_l[t]))
        m.Add(d[t] <= sum(lvar[k] for k in D_r[t]))

    # ------------------------------------------------------------------
    # Leaf classifiers
    # ------------------------------------------------------------------
    if binary_case:
        # assumes labels encoded {-1,+1}
        for i in I:
            yi = float(df.at[i, target])
            for t in T_L:
                svm_expr = sum(Beta[f, t] * x[i, f] for f in features) + Delta[t]
                m.Add(gamma - e[i, t] <= svm_expr * yi + M * (1 - z[i, t]))

        # Bet_abs >= |Beta|
        for f in features:
            for t in T_L:
                m.Add(Bet_abs[f, t] >= Beta[f, t])
                m.Add(Bet_abs[f, t] >= -Beta[f, t])
    else:
        # One-vs-rest sign of every sample for every class.
        LabelsPerClass = {
            c: {i: 1 if df.at[i, target] == c else -1 for i in I}
            for c in classes
        }
        for c in classes:
            for i in I:
                for t in T_L:
                    svm_expr = (
                        sum(Beta[c, f, t] * x[i, f] for f in features)
                        + Delta[c, t]
                    )
                    m.Add(
                        gamma - e[c, i, t]
                        <= svm_expr * LabelsPerClass[c][i] + M * (1 - z[i, t])
                    )

        for c in classes:
            for f in features:
                for t in T_L:
                    m.Add(Bet_abs[c, f, t] >= Beta[c, f, t])
                    m.Add(Bet_abs[c, f, t] >= -Beta[c, f, t])

    # ------------------------------------------------------------------
    # Split budget
    # ------------------------------------------------------------------
    m.Add(sum(d[t] for t in T_B) <= Splits)

    # ------------------------------------------------------------------
    # Objective
    # ------------------------------------------------------------------
    m.Minimize(sum(Bet_abs.values()) + C * sum(e.values()))

    start = tm()
    status = m.Solve()
    runtime = tm() - start

    # Variable values are only meaningful when the solver reports a solution.
    if status not in (pywraplp.Solver.OPTIMAL, pywraplp.Solver.FEASIBLE):
        if status == pywraplp.Solver.INFEASIBLE:
            print('MODEL IS INFEASIBLE')
        else:
            print('NO FEASIBLE SOLUTION FOUND')
        return None, runtime

    # ------------------------------------------------------------------
    # Read the solution. Binary variables are compared with 0.5 so that
    # numerical noise around 0 is not mistaken for "on".
    # ------------------------------------------------------------------
    active_splits = [t for t in T_B if d[t].solution_value() > 0.5]
    used_leaves = [t for t in T_L if lvar[t].solution_value() > 0.5]

    splitting_nodes = {}
    if config["SplitType"] == "Parallel":
        splitting_nodes = {
            t: {
                # the single selected feature of the node
                'a': max(features, key=lambda f, t=t: a[f, t].solution_value()),
                'b': round(b[t].solution_value(), 6),
            }
            for t in active_splits
        }
    elif config["SplitType"] == "Oblique":
        splitting_nodes = {
            t: {
                'a': {f: round(a[f, t].solution_value(), 6) for f in features},
                'b': round(b[t].solution_value(), 6),
            }
            for t in active_splits
        }

    if binary_case:
        non_empty_nodes = {
            t: {
                'Beta': {f: round(Beta[f, t].solution_value(), 6) for f in features},
                'Delta': round(Delta[t].solution_value(), 6),
            }
            for t in used_leaves
        }
    else:
        non_empty_nodes = {
            t: {
                c: {
                    'Beta': {
                        f: round(Beta[c, f, t].solution_value(), 6)
                        for f in features
                    },
                    'Delta': round(Delta[c, t].solution_value(), 6),
                }
                for c in classes
            }
            for t in used_leaves
        }

    ODT = OptimalTree(
        non_empty_nodes,
        splitting_nodes,
        depth,
        config["SplitType"],
        True,
        classes
    )

    return ODT, runtime
507
+
508
+ # from sklearn.metrics import accuracy_score
509
+ # from sklearn.utils import shuffle
510
+ # from DatabaseParser import DataParser
511
+ # if __name__ == "__main__":
512
+ #
513
+ # ProbType = 'Classification'
514
+ # TestSize = 0.2
515
+ # file = 'blogger'
516
+ # Splits = 1
517
+ #
518
+ # config ={
519
+ # 'RandomSeed':7,
520
+ # 'SplitType': 'Parallel',
521
+ # 'label_name': 'class',
522
+ # 'Timeout': 60, # for the single iteration (IN MINUTES)
523
+ # 'ConsoleLog':False
524
+ # }
525
+ #
526
+ # df = DataParser(f'{file}.arff',ProbType, one_hot=True)
527
+ #
528
+ # df = shuffle(df,random_state=config['RandomSeed'])
529
+ #
530
+ # Test_df = df.iloc[:round(len(df) * TestSize)]
531
+ # Train_df = df.iloc[len(Test_df):]
532
+ #
533
+ # # ELIMINATING A COLUMN FROM ALL DATASETS IF ALL THE VALUES IN IT ARE THE SAME IN THE TRAIN SET
534
+ # for i in Train_df.columns:
535
+ # if Train_df[i].nunique() == 1:
536
+ # Train_df = Train_df.drop(columns=[i])
537
+ # Test_df = Test_df.drop(columns=[i])
538
+ #
539
+ # features = list(Train_df.columns.drop(['class']))
540
+ # labels = df['class'].unique()
541
+ # labels = ('class', labels)
542
+ #
543
+ # for C in [1]:#[0.1, 1, 10, 100]:
544
+ # ODT,runtime = optimal_OMT(
545
+ # df= Train_df,
546
+ # features= features,
547
+ # labels= labels,
548
+ # Splits= Splits,
549
+ # C= C,
550
+ # config=config
551
+ # )
552
+ #
553
+ # print('Runtime:',round(runtime,3),end=" ")
554
+ # print('C:',C,end=' ')
555
+ # the_tree = ODT.build_tree(ODT.root.value)
556
+ # # ODT.print_tree(the_tree)
557
+ #
558
+ # # split train into features and labels
559
+ # X_train = Train_df.drop(columns='class')
560
+ # X_train = X_train.to_dict('index')
561
+ # Y_train = Train_df['class']
562
+ #
563
+ # # split test set into features and labels
564
+ # X_test = Test_df.drop(columns='class')
565
+ # X_test = X_test.to_dict('index')
566
+ # Y_test = Test_df['class']
567
+ #
568
+ # # Predict the train set
569
+ # train_pred = ODT.predict_class(X_train, the_tree, None)
570
+ # print('Train:', round(accuracy_score(Y_train, train_pred) * 100, 2), '%',end=' ')
571
+ #
572
+ # # Predict the test set
573
+ # test_pred = ODT.predict_class(X_test, the_tree,None)
574
+ # print('Test:', round(accuracy_score(Y_test, test_pred)*100,2),'%')
575
+
576
+
577
+
@@ -0,0 +1,226 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from binarytree import build
4
+
5
+
6
class Node():
    """Node of a built tree.

    Decision nodes carry ``feature`` / ``threshold`` and their two children;
    leaf nodes carry ``value`` instead.
    """

    def __init__(self, name=None, feature=None, threshold=None, left=None, right=None, value=None):
        self.name = name

        # leaf payload (None on decision nodes)
        self.value = value

        # decision payload
        self.feature, self.threshold = feature, threshold
        self.left, self.right = left, right

    def __str__(self):
        description = f'Node {self.name}, feature {self.feature}, threshold {self.threshold}'
        return description
19
+
20
+ class OptimalTree():
21
+ def __init__(self, non_empty_nodes,splitting_nodes,depth,SplitType='Parallel',ModelTree=True,classes=[]):
22
+ self.non_emtpy_nodes = non_empty_nodes
23
+ self.splitting_nodes = splitting_nodes
24
+ self.depth = depth
25
+ self.SplitType = SplitType
26
+ self.ModelTree = ModelTree
27
+ self.nodes = [i for i in range(2 ** (depth + 1) - 1)]
28
+ self.classes = classes
29
+
30
+ self.complete_tree = build(self.nodes)
31
+ self.T_L = [i.value for i in self.complete_tree.leaves] # leave nodes
32
+ self.T_B = [i for i in self.complete_tree.values if i not in self.T_L]
33
+ self.root = self.complete_tree.levels[0][0]
34
+
35
+ def build_tree(self, current_node):
36
+
37
+ if current_node in self.splitting_nodes:
38
+ left_subtree = self.build_tree( Children(self.root,current_node)[0].value)
39
+ right_subtree = self.build_tree(Children(self.root, current_node)[1].value)
40
+ return Node(
41
+ current_node,
42
+ self.splitting_nodes[current_node]['a'],
43
+ self.splitting_nodes[current_node]['b'],
44
+ left_subtree,
45
+ right_subtree
46
+ )
47
+ elif current_node in self.non_emtpy_nodes:
48
+ return Node(
49
+ current_node,
50
+ value = self.non_emtpy_nodes[current_node]
51
+ )
52
+ else:
53
+ descendants = [i for i in self.non_emtpy_nodes if current_node in Ancestors(self.root,i)]
54
+ if len(descendants) > 0:
55
+ return Node(
56
+ current_node,
57
+ value=self.non_emtpy_nodes[descendants[0]]
58
+ )
59
+ else:
60
+ raise ValueError('THE TREE HAS TO MANY SPLITS')
61
+
62
+ def print_tree(self,tree, indent=" "):
63
+
64
+ if tree is not None:
65
+ print('Node',tree.name)
66
+ if tree.value is not None:
67
+ print(tree.value)
68
+ else:
69
+ print(f'{str(tree.feature)} < {tree.threshold} ')
70
+ print(f'{indent}left:', end="")
71
+ self.print_tree(tree.left, indent + indent)
72
+ print('%sright:' % (indent), end="")
73
+ self.print_tree(tree.right, indent + indent)
74
+ else:
75
+ print('No Node')
76
+
77
+ def predict_regr(self, X, tree,f2=None):
78
+ '''function to predict_regr a new dataset'''
79
+ predictions = [self.make_regression(x, tree,f2) for x in X.values()]
80
+ return predictions
81
+
82
+ def make_regression(self, x, tree,f2=None):
83
+ '''function to make a single prediction'''
84
+ features = f2 if f2 != None else x
85
+ if self.ModelTree:
86
+ if tree.value != None:
87
+ return sum([ tree.value['Beta'][f] * x[f] for f in features]) + tree.value['Delta']
88
+ else:
89
+ if tree.value != None:
90
+ return tree.value
91
+
92
+ if self.SplitType == 'Parallel':
93
+ if x[tree.feature] < tree.threshold:
94
+ return self.make_regression(x, tree.left,f2)
95
+ else:
96
+ return self.make_regression(x, tree.right,f2)
97
+ elif self.SplitType == 'Oblique':
98
+ if sum([x[key] * value for key, value in tree.feature.items()]) < tree.threshold:
99
+ return self.make_regression(x, tree.left,f2)
100
+ else:
101
+ return self.make_regression(x, tree.right,f2)
102
+
103
+
104
+ def predict_class(self, X, tree,f2=None):
105
+ '''function to predict_regr a new dataset'''
106
+ predictions = [self.make_classification(x, tree,f2) for x in X.values()]
107
+ return predictions
108
+
109
+ def make_classification(self, x, tree,f2=None):
110
+ '''function to make a single prediction'''
111
+ features = f2 if f2 != None else x
112
+
113
+ if self.ModelTree:
114
+ if tree.value != None:
115
+ if len(self.classes)>2:
116
+ scores = {
117
+ c:sum([tree.value[c]['Beta'][f] * x[f] for f in features]) + tree.value[c]['Delta']
118
+ for c in self.classes
119
+ }
120
+ return max(scores, key=scores.get)
121
+
122
+ else:
123
+ return 1 if sum([ tree.value['Beta'][f] * x[f] for f in features]) + tree.value['Delta'] > 0 else -1
124
+ else:
125
+ if tree.value != None:
126
+ return tree.value
127
+ if self.SplitType == 'Parallel':
128
+ if x[tree.feature] < tree.threshold:
129
+ return self.make_classification(x, tree.left,f2)
130
+ else:
131
+ return self.make_classification(x, tree.right,f2)
132
+ elif self.SplitType == 'Oblique':
133
+ if sum([x[key] * value for key, value in tree.feature.items()]) < tree.threshold:
134
+ return self.make_classification(x, tree.left,f2)
135
+ else:
136
+ return self.make_classification(x, tree.right,f2)
137
+
138
+ # Some additional functions needed to deal with binary trees
139
+
140
def Ancestors(root, target):
    """Return the values of all ancestors of ``target``, nearest ancestor first.

    The tree below ``root`` is searched depth-first (left before right) for a
    node whose ``value`` equals ``target``. The result is empty when the
    target is the root itself or is not in the tree.
    """
    ancestors = []

    def findAncestors(node):
        # Base case: ran past a leaf. Identity check, so a node type with a
        # custom ``__eq__`` cannot be mistaken for a missing child.
        if node is None:
            return False

        if node.value == target:
            return True

        # The node is an ancestor when the target is in one of its subtrees;
        # ``or`` skips the right subtree once the left one has matched.
        if findAncestors(node.left) or findAncestors(node.right):
            ancestors.append(node.value)
            return True

        return False

    findAncestors(root)
    return ancestors
164
+
165
def Parent(node, val):
    """Return the value of the parent of the node holding ``val``.

    The search starts at ``node`` and visits nodes in pre-order (left before
    right); the subtree below a matching node is not searched. The root's
    parent is ``None``. An ``IndexError`` is raised when ``val`` is not found.
    """
    found_parents = []

    # Explicit stack of (node, value of its parent); the right child is pushed
    # first so that the left child is visited first.
    pending = [(node, None)]
    while pending:
        current, parent_value = pending.pop()
        if current is None:
            continue
        if current.value == val:
            found_parents.append(parent_value)
            continue
        pending.append((current.right, current.value))
        pending.append((current.left, current.value))

    return found_parents[0]
184
+
185
def Children(node, val):
    """Return ``(left, right)`` child nodes of the node holding ``val``.

    The search starts at ``node`` and visits nodes in pre-order (left before
    right); the subtree below a matching node is not searched. Missing
    children are ``None``. A ``KeyError`` is raised when ``val`` is not found.
    """
    located = {}

    # Explicit stack; the right child is pushed first so that the left child
    # is visited first.
    pending = [node]
    while pending:
        current = pending.pop()
        if current is None:
            continue
        if current.value == val:
            located.update({'left': current.left, 'right': current.right})
            continue
        pending.append(current.right)
        pending.append(current.left)

    return located['left'], located['right']
203
+
204
def RAE(Y_labels,Y_predicted,decimals=2):
    """Return the relative absolute error, rounded to ``decimals`` places.

    This is the sum of absolute prediction errors divided by the sum of
    absolute deviations of the labels from their mean.

    Args:
        Y_labels: True target values.
        Y_predicted: Predicted values, one per label.
        decimals: Number of decimal places of the result (default 2).

    Raises:
        ValueError: If the two inputs differ in shape; pairing them up
            silently would score only part of the data.
    """
    actual = np.asarray(Y_labels, dtype=float)
    predicted = np.asarray(Y_predicted, dtype=float)
    if actual.shape != predicted.shape:
        raise ValueError(
            f"Y_labels and Y_predicted differ in shape: {actual.shape} vs {predicted.shape}"
        )

    numerator = np.abs(predicted - actual).sum()
    denominator = np.abs(actual.mean() - actual).sum()
    return round(numerator / denominator, decimals)
209
+
210
def RRSE(Y_labels,Y_predicted,decimals=2):
    """Return the root relative squared error, rounded to ``decimals`` places.

    This is the square root of the sum of squared prediction errors divided
    by the sum of squared deviations of the labels from their mean.

    Args:
        Y_labels: True target values.
        Y_predicted: Predicted values, one per label.
        decimals: Number of decimal places of the result (default 2).

    Raises:
        ValueError: If the two inputs differ in shape; pairing them up
            silently would score only part of the data.
    """
    actual = np.asarray(Y_labels, dtype=float)
    predicted = np.asarray(Y_predicted, dtype=float)
    if actual.shape != predicted.shape:
        raise ValueError(
            f"Y_labels and Y_predicted differ in shape: {actual.shape} vs {predicted.shape}"
        )

    numerator = ((predicted - actual) ** 2).sum()
    denominator = ((actual.mean() - actual) ** 2).sum()
    return round(np.sqrt(numerator / denominator), decimals)
215
+
216
def Multiplier(vector):
    """Return the power of ten that turns every entry of ``vector`` into an integer.

    The result is ``10 ** k`` where ``k`` is the largest number of decimal
    places found in the string form of the entries; trailing zeros count
    (``3.0`` has one decimal place). Entries written in scientific notation
    (``1e-05``) are handled through their decimal exponent. Non-finite
    entries and entries without a fractional part contribute zero places,
    and an empty ``vector`` yields ``1``.
    """
    from decimal import Decimal, InvalidOperation

    most_places = 0
    for entry in vector:
        text = str(entry)
        try:
            exponent = Decimal(text).as_tuple().exponent
        except InvalidOperation:
            # Not parseable as a number: fall back to counting the characters
            # after the first dot of the textual form.
            exponent = -len(text.split('.')[1]) if '.' in text else 0

        # NaN / infinity report a non-integer exponent marker; skip those.
        if isinstance(exponent, int):
            most_places = max(most_places, -exponent)

    return 10 ** most_places

if __name__ == "__main__":
    print(Multiplier([2.01,3.0101]))
@@ -0,0 +1,3 @@
1
+ from.Estimator import OptimalModelTreeClassifier
2
+
3
# Public API of the package. The name must be spelt ``__all__`` (two trailing
# underscores); under any other name ``from omt import *`` ignores the list.
__all__ = ["OptimalModelTreeClassifier"]
@@ -0,0 +1,13 @@
1
+ Metadata-Version: 2.4
2
+ Name: optimal-omt
3
+ Version: 0.1.0
4
+ Summary: Optimal Model Trees using OR-Tools
5
+ Requires-Python: >=3.9
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: numpy
8
+ Requires-Dist: pandas
9
+ Requires-Dist: scikit-learn
10
+ Requires-Dist: binarytree
11
+ Requires-Dist: ortools
12
+ Provides-Extra: dev
13
+ Requires-Dist: pytest; extra == "dev"
@@ -0,0 +1,12 @@
1
+ README.md
2
+ pyproject.toml
3
+ omt/DatabaseParser.py
4
+ omt/Estimator.py
5
+ omt/ORToolsClassifier.py
6
+ omt/TreeStructure.py
7
+ omt/__init__.py
8
+ optimal_omt.egg-info/PKG-INFO
9
+ optimal_omt.egg-info/SOURCES.txt
10
+ optimal_omt.egg-info/dependency_links.txt
11
+ optimal_omt.egg-info/requires.txt
12
+ optimal_omt.egg-info/top_level.txt
@@ -0,0 +1,8 @@
1
+ numpy
2
+ pandas
3
+ scikit-learn
4
+ binarytree
5
+ ortools
6
+
7
+ [dev]
8
+ pytest
@@ -0,0 +1,24 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "optimal-omt"
7
+ version = "0.1.0"
8
+ description = "Optimal Model Trees using OR-Tools"
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+
12
+ dependencies = [
13
+ "numpy",
14
+ "pandas",
15
+ "scikit-learn",
16
+ "binarytree",
17
+ "ortools"
18
+ ]
19
+
20
+ [project.optional-dependencies]
21
+ dev = ["pytest"]
22
+
23
+ [tool.setuptools]
24
+ packages = ["omt"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+