icol 0.7.4__py3-none-any.whl → 0.8.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- icol/icol.py +332 -29
- icol/logistic_icol.py +1176 -0
- {icol-0.7.4.dist-info → icol-0.8.5.dist-info}/METADATA +1 -1
- icol-0.8.5.dist-info/RECORD +8 -0
- {icol-0.7.4.dist-info → icol-0.8.5.dist-info}/WHEEL +1 -1
- icol-0.7.4.dist-info/RECORD +0 -7
- {icol-0.7.4.dist-info → icol-0.8.5.dist-info}/LICENSE +0 -0
- {icol-0.7.4.dist-info → icol-0.8.5.dist-info}/top_level.txt +0 -0
icol/icol.py
CHANGED
@@ -14,6 +14,7 @@ from sklearn.base import clone
 from sklearn.model_selection import train_test_split

 from sklearn.metrics import mean_squared_error
+from sklearn.linear_model import LogisticRegressionCV

 def rmse(y_true, y_pred):
     return mean_squared_error(y_true, y_pred, squared=False)
@@ -625,7 +626,8 @@ class ICL:
             'fit_intercept': self.fit_intercept,
             'normalize': self.normalize,
             'pool_reset': self.pool_reset,
-            '
+            'optimize_k': self.optimize_k,
+            'track_intermediates': self.track_intermediates
         }

     def __str__(self):
@@ -693,7 +695,11 @@ class ICL:
         i = 0
         IC = np.infty
         while i < stopping:
-            self.
+            if self.fit_intercept:
+                self.intercept_ = np.mean(res).squeeze()
+            else:
+                self.intercept_ = 0
+
             if verbose: print('.', end='')

             p, sis_i = self.sis(X=X, res=res, pool=list(pool_), verbose=verbose)
@@ -713,17 +719,20 @@ class ICL:
             idx = np.nonzero(beta)[0]
             if self.normalize:
                 coef = (beta[idx].reshape(1, -1)*self.b_y/self.b_x[idx].reshape(1, -1))
-
+                if self.fit_intercept:
+                    intercept_ = self.a_y - coef.dot(self.a_x[idx])
             else:
                 coef = beta[idx]
-
-
+                if self.fit_intercept:
+                    intercept_ = self.intercept_
+            if len(coef.shape) > 1:
+                coef = coef[0]
             expr = ''.join([('+' if float(c) >= 0 else '') + str(np.round(float(c), 3)) + str(self.feature_names_[idx][q]) for q, c in enumerate(coef)])
             if verbose: print('Model after {0} iterations: {1}'.format(i, expr))

             self.intermediates[i, 0] = deepcopy(idx)
             self.intermediates[i, 1] = coef # deepcopy(beta[idx])
-            self.intermediates[i, 2] = intercept_
+            self.intermediates[i, 2] = intercept_ if self.fit_intercept else 0
             self.intermediates[i, 3] = self.feature_names_[idx]
             self.intermediates[i, 4] = expr

@@ -742,7 +751,7 @@ class ICL:
         if verbose: print()

         self.beta_ = beta
-        self.intercept_ = np.mean(res).squeeze()
+        self.intercept_ = np.mean(res).squeeze() if self.fit_intercept else 0

         self.beta_idx_ = list(np.nonzero(self.beta_)[0])
         self.beta_sparse_ = self.beta_[self.beta_idx_]
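Note: the intercept fixes in the hunks above follow one pattern: the intercept is taken from the running residual mean only when `fit_intercept` is enabled, otherwise it is pinned to 0. A minimal sketch of that branch in isolation (`set_intercept` is a hypothetical helper used purely for illustration, not part of the package):

import numpy as np

def set_intercept(res, fit_intercept):
    # Mirrors the branch added in the diff: residual mean if an intercept is
    # requested, a hard 0 otherwise.
    return np.mean(res).squeeze() if fit_intercept else 0

res = np.array([0.2, -0.1, 0.4])
print(set_intercept(res, fit_intercept=True))   # ~0.1667
print(set_intercept(res, fit_intercept=False))  # 0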
@@ -957,6 +966,7 @@ class FeatureExpansion:
         if verbose: print('Estimating the creation of around {0} features'.format(self.estimate_workload(p=p, max_rung=self.rung, verbose=verbose>2)))

         names, symbols, X = self.expand_aux(X=X, names=names, symbols=symbols, crung=0, prev_p=0, verbose=verbose)
+
         if not(f is None):
             import pandas as pd
             df = pd.DataFrame(data=X, columns=names)
@@ -1060,7 +1070,7 @@ class FeatureExpansion:
                 new_op_symbols = sym_vect(idx2)
                 new_op_names = str_vectorize(new_op_symbols)
                 X_i = X[:, idx1]
-                new_op_X = X_i[:, np.newaxis]*X[:, idx2]
+                new_op_X = op_np(X_i[:, np.newaxis], X[:, idx2]) #X_i[:, np.newaxis]*X[:, idx2]
                 new_names, new_symbols, new_X = self.add_new(new_names=new_names, new_symbols=new_symbols, new_X=new_X,
                                                              new_name=new_op_names, new_symbol=new_op_symbols, new_X_i=new_op_X, verbose=verbose)
                 if not(new_names is None):
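Note: the single change in this hunk stops hard-coding a product when two features are combined and instead applies the numpy function associated with the current operator (`op_np`), so binary operators such as `add`, `div` or `abs_diff` expand correctly. A small self-contained sketch of the broadcasting involved, using `np.add` where the package would look up its own `op_np` (that lookup is not reproduced here):

import numpy as np

X = np.arange(12, dtype=float).reshape(4, 3)        # 4 samples, 3 base features
idx1, idx2 = 0, np.array([1, 2])                    # pair feature 0 with features 1 and 2
X_i = X[:, idx1]

old_cols = X_i[:, np.newaxis] * X[:, idx2]          # old behaviour: always a product
new_cols = np.add(X_i[:, np.newaxis], X[:, idx2])   # new behaviour: the operator's own ufunc

print(old_cols.shape, new_cols.shape)               # (4, 2) (4, 2): one column per pairing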
@@ -1077,27 +1087,320 @@ class FeatureExpansion:
             if verbose: print('{0} features'.format(X.shape[1]))

         return self.expand_aux(X=X, names=names, symbols=symbols, crung=crung+1, prev_p=prev_p, verbose=verbose)
+
+class LOGISTIC_ICL:
+    def __init__(self, s, so, k, pool_reset=False, track_intermediates=False, max_iter=100, tol=1e-6, eps=1e-12, damping=0.5, prec=3):
+        self.s = s
+        self.so = so
+        self.k = k
+        self.pool_reset=pool_reset
+        self.track_intermediates = track_intermediates
+        self.max_iter = int(max_iter)
+        self.tol = float(tol)
+        self.eps = float(eps)
+        self.damping = float(damping)
+        self.prec = prec
+
+        self.icl = ICL(s=s, so=so, k=k, normalize=False, fit_intercept=False, optimize_k=True, track_intermediates=self.track_intermediates)
+        self.coef_ = None # (p,)
+        self.intercept_ = 0.0 # scalar
+
+    def get_params(self, deep=False):
+        params = {
+            "s": self.s,
+            "so": self.so,
+            "k": self.k,
+            "pool_reset": self.pool_reset,
+            "track_intermediates": self.track_intermediates,
+            "max_iter": self.max_iter,
+            "tol": self.tol,
+            "eps": self.eps,
+            "damping": self.damping,
+            "prec": self.prec
+        }
+        if deep:
+            # expose inner ICL params using sklearn convention
+            for key, value in self.icl.get_params(deep=True).items():
+                params[f"icl__{key}"] = value
+
+        return params
+
+    def decision_function(self, X):
+        X = np.asarray(X, dtype=float)
+        if self.coef_ is None:
+            raise ValueError("Model is not fitted yet.")
+        return X @ self.coef_ + self.intercept_
+
+    def predict_proba(self, X):
+        eta = self.decision_function(X)
+        p1 = self._sigmoid(eta)
+        p1 = np.clip(p1, self.eps, 1.0 - self.eps)
+        return np.column_stack([1.0 - p1, p1])
+
+    def predict(self, X, threshold=0.5):
+        return (self.predict_proba(X)[:, 1] >= threshold).astype(int)
+
+    def __repr__(self):
+        return '\n'.join([('+' if beta>=0 else '') +
+                          sci(beta, self.prec) +
+                          '('+
+                          str(self.feature_names[i]) +
+                          ')'
+                          for i, beta in enumerate(self.coef_) if abs(beta)>0]) + '\n' + ('+' if self.intercept_ >= 0 else '') + sci(self.intercept_, self.prec)
+
+    def __str__(self):
+        return 'LOGISTIC_ICL({0})'.format(str(self.get_params()))
+
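Note: with `deep=True`, `get_params` also flattens the inner `ICL` selector's parameters under an `icl__` prefix, mirroring scikit-learn's double-underscore convention for nested estimators. A standalone illustration of that flattening with toy classes (hypothetical names, not the package's API):

class Inner:
    def get_params(self, deep=True):
        return {"s": 10, "k": 5}

class Outer:
    def __init__(self):
        self.inner = Inner()

    def get_params(self, deep=False):
        params = {"damping": 0.5}
        if deep:
            # prefix nested parameters the way sklearn's Pipeline/GridSearchCV expect
            for key, value in self.inner.get_params(deep=True).items():
                params[f"inner__{key}"] = value
        return params

print(Outer().get_params(deep=True))
# {'damping': 0.5, 'inner__s': 10, 'inner__k': 5}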
+    def fit(self, X, y, feature_names=None, verbose=False):
+        n, p = X.shape
+
+        beta = np.zeros(p, dtype=float)
+        b = 0.0
+
+        for it in range(self.max_iter):
+            if verbose: print('iteration {0}'.format(it))
+            # predicting with current model
+            eta = np.dot(X, beta) + b
+
+            # converting to probabilties
+            p_hat = self._sigmoid(eta)
+            p_hat = np.clip(p_hat, self.eps, 1.0 - self.eps)
+
+            # row weights
+            w = p_hat * (1.0 - p_hat)
+            w = np.clip(w, self.eps, 1.0)
+
+            # reweighted x and y
+            z = eta + (y - p_hat) / w
+
+            w_sum = w.sum()
+            xbar = (w[:, None] * X).sum(axis=0) / w_sum
+            zbar = (w * z).sum() / w_sum
+
+            Xc = X - xbar
+            zc = z - zbar
+
+            s = np.sqrt(w)
+            X_star = Xc * s[:, None]
+            z_star = zc * s
+
+            # fitting icl model to reweighted data
+            icl_iter = clone(self.icl)
+            icl_iter.fit(X_star, z_star, feature_names=feature_names, verbose=verbose>1)
+
+            beta_new = np.zeros(p, dtype=float)
+            beta_new[icl_iter.beta_idx_] = icl_iter.beta_
+
+            b_new = zbar - xbar @ beta_new
+
+            # updating previous solution as linear combination of past and current
+            beta_next = beta + self.damping * (beta_new - beta)
+            b_next = b + self.damping * (b_new - b)
+
+            # stopping criteria
+            delta = np.linalg.norm(beta_next - beta, 2) + abs(b_next - b)
+            if verbose:
+                print(f"iter {it+1}: delta={delta:.3e}")
+
+            beta, b = beta_next, b_next
+
+            if delta <= self.tol:
+                if verbose: print('converged')
+                break

-if
-
+        if (verbose) and (it == self.max_iter): print('did not converge')
+
+        tol = 1e-10
+
+        # number of nonzeros in the fitted model
+        nz_idx = np.flatnonzero(np.abs(beta) > tol)
+        nz = int(np.sum(np.abs(beta) > tol))
+
+        std = X.std(axis=0, ddof=0)
+        scores = np.abs(beta) * std
+
+        kk = min(self.k, nz)
+
+        # choose among nonzero coefficients only (avoids picking zero-weight features)
+        if kk == 0:
+            # degenerate: no features selected
+            self.coef_ = np.zeros_like(beta)
+            self.idx = np.array([], dtype=int)
+            self.intercept_ = float(b)
+        else:
+            order = nz_idx[np.argsort(scores[nz_idx])[-kk:][::-1]]
+            XS = X[:, order]
+
+            clf = LogisticRegressionCV(
+                Cs=20, # or a list/array like np.logspace(-4, 4, 30)
+                cv=5,
+                penalty="l2",
+                solver="lbfgs",
+                scoring="neg_log_loss",
+                fit_intercept=True,
+                max_iter=5000,
+                n_jobs=-1,
+                refit=True,
+            )
+            clf.fit(XS, y)
+
+            self.coef_ = np.zeros_like(beta)
+            self.coef_[order] = clf.coef_.ravel()
+            self.idx = order
+            self.intercept_ = float(clf.intercept_[0])
+
+        self.feature_names_sparse_ = feature_names[self.idx]
+        self.feature_names = feature_names
+        return self
+
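Note: the `fit` above is an IRLS-style loop: the current logits give probabilities p_hat and weights w = p_hat*(1 - p_hat), a working response z = eta + (y - p_hat)/w is formed, design and response are weighted-centred and scaled by sqrt(w), a fresh clone of the inner ICL selector is fit to that surrogate regression, and the coefficients are updated with damping. After the loop, the top-k surviving features (ranked by |beta|*std) are refit with LogisticRegressionCV. A compressed, self-contained sketch of one damped IRLS step, with ordinary least squares standing in for the ICL inner fit (that substitution is an assumption made only to keep the example short):

import numpy as np

def irls_step(X, y, beta, b, damping=0.5, eps=1e-12):
    eta = X @ beta + b
    p_hat = np.clip(1.0 / (1.0 + np.exp(-eta)), eps, 1.0 - eps)
    w = np.clip(p_hat * (1.0 - p_hat), eps, 1.0)
    z = eta + (y - p_hat) / w                        # working response

    w_sum = w.sum()
    xbar = (w[:, None] * X).sum(axis=0) / w_sum      # weighted means
    zbar = (w * z).sum() / w_sum
    s = np.sqrt(w)
    X_star = (X - xbar) * s[:, None]                 # sqrt-weighted, centred design
    z_star = (z - zbar) * s

    beta_new, *_ = np.linalg.lstsq(X_star, z_star, rcond=None)  # stand-in for the ICL fit
    b_new = zbar - xbar @ beta_new
    return beta + damping * (beta_new - beta), b + damping * (b_new - b)

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 3))
y = (X[:, 0] - X[:, 1] + rng.standard_normal(200) > 0).astype(float)

beta, b = np.zeros(3), 0.0
for _ in range(30):
    beta, b = irls_step(X, y, beta, b)
print(np.round(beta, 2), round(b, 2))   # roughly recovers a logistic fit of y on X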
+    @staticmethod
+    def _sigmoid(eta):
+        # stable sigmoid
+        eta = np.asarray(eta, dtype=float)
+        out = np.empty_like(eta, dtype=float)
+        pos = eta >= 0
+        neg = ~pos
+        out[pos] = 1.0 / (1.0 + np.exp(-eta[pos]))
+        exp_eta = np.exp(eta[neg])
+        out[neg] = exp_eta / (1.0 + exp_eta)
+        return out
+
+def zero_one_loss(X, y, model):
+    y = np.asarray(y)
+    y_hat = model.predict(X)
+    return np.mean(y_hat != y)
+
+def hinge_loss(X, y, model):
+    y = np.asarray(y)
+    y_pm = 2*y - 1 # {0,1} -> {-1,+1}
+    eta = model.decision_function(X)
+    return np.mean(np.maximum(0.0, 1.0 - y_pm * eta))
+
+def log_loss(X, y, model):
+    y = np.asarray(y)
+    eta = model.decision_function(X)
+    return np.mean(np.logaddexp(0.0, eta) - y*eta)
+
+
+sci = lambda x, sig=3: f"{float(x):.{sig}e}"
+
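Note: the new `log_loss` helper evaluates the negative log-likelihood directly from the logits as log(1 + e^eta) - y*eta, using `np.logaddexp` so large |eta| cannot overflow. A quick numerical check against the explicit probability form (a verification sketch only, not part of the package):

import numpy as np

rng = np.random.default_rng(0)
eta = rng.normal(scale=5.0, size=6)            # logits
y = rng.integers(0, 2, size=6)

p = 1.0 / (1.0 + np.exp(-eta))
explicit = -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))
via_logits = np.mean(np.logaddexp(0.0, eta) - y * eta)

print(np.isclose(explicit, via_logits))        # True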
+if __name__ == "__main__":
+    test = "bandgap"
     random_state = 0
     np.random.seed(random_state)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    from sklearn.model_selection import train_test_split
+    from sklearn.metrics import r2_score as r2
+
+    import pandas as pd
+    import os
+
+    if test == "DIABETES":
+        df = pd.read_csv(os.path.join(os.getcwd(), "Input", "pima.csv"))
+        df["DIABETES"] = df["DIABETES"].map({"Y":1, "N": 0})
+        y = df['DIABETES'].values
+        X = df.drop(columns=['DIABETES'])
+        feature_names = X.columns
+        X = X.values
+
+        rung = 2
+        small = ['sin', 'cos', 'log', 'abs', 'sqrt', 'cbrt', 'sq', 'cb', 'inv']
+        big = ['six_pow', 'exp', 'add', 'mul', 'div', 'abs_diff']
+        small = [(op, range(rung)) for op in small]
+        big = [(op, range(1)) for op in big]
+        ops = small+big
+
+        FE = FeatureExpansion(rung=rung, ops=ops)
+        Phi_names, Phi_symbols, Phi_ = FE.expand(X=X, names=feature_names, check_pos=True, verbose=True)
+        X_train, X_test, y_train, y_test = train_test_split(Phi_, y, test_size=0.2, random_state=random_state)
+
+        logistic_icl_params = {
+            "s": 10,
+            "so": AdaptiveLASSO(gamma=1, fit_intercept=False),
+            "k": 5,
+            "pool_reset": False,
+            "track_intermediates": False,
+            "max_iter": 1000,
+            "tol": 1e-1,
+            "eps": 1e-3,
+            "damping": 0.5,
+            "prec": 3
+        }
+
+        icl_log = LOGISTIC_ICL(**logistic_icl_params)
+        icl_log.fit(X=X_train, y=y_train, feature_names=Phi_names, verbose=1)
+
+        print(icl_log.__repr__())
+        print('zero_one: {0}'.format(zero_one_loss(X_test, y_test, icl_log)))
+        print('hinge: {0}'.format(hinge_loss(X_test, y_test, icl_log)))
+        print('logloss: {0}'.format(log_loss(X_test, y_test, icl_log)))
+    elif test=="Synthetic":
+        k,n,p=3,10000,1000
+        rng = np.random.default_rng(random_state)
+        X = rng.standard_normal((n,p))
+        feature_names = np.array(['X_{0}'.format(i) for i in range(p)])
+        support = range(k)
+        beta = np.zeros(p, dtype=float)
+        signs = rng.choice([-1.0, 1.0], size=k)
+        mags = rng.uniform(0.5, 1.5, size=k)
+        beta[support] = signs * mags
+        eta_no_b = X @ beta
+        b = float(-np.mean(eta_no_b))
+        eta = eta_no_b + b
+        p1 = 1.0 / (1.0 + np.exp(-np.clip(eta, -50, 50)))
+        y = rng.binomial(1, p1, size=n).astype(int)
+
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
+
+        logistic_icl_params = {
+            "s": 10,
+            "so": AdaptiveLASSO(gamma=1, fit_intercept=False),
+            "k": k,
+            "pool_reset": False,
+            "track_intermediates": False,
+            "max_iter": 1000,
+            "tol": 1e-3,
+            "eps": 1e-6,
+            "damping": 0.8,
+            "prec": 3
+        }
+
+        icl_log = LOGISTIC_ICL(**logistic_icl_params)
+        icl_log.fit(X=X_train, y=y_train, feature_names=feature_names, verbose=1)
+
+        print(icl_log.__repr__())
+        print('zero_one: {0}'.format(zero_one_loss(X_test, y_test, icl_log)))
+        print('hinge: {0}'.format(hinge_loss(X_test, y_test, icl_log)))
+        print('logloss: {0}'.format(log_loss(X_test, y_test, icl_log)))
+
+        print('True Coef: {0}'.format(beta[:k]))
+        print('True intercept: {0}'.format(b))
+        eta_test = icl_log.decision_function(X_test) # log-odds
+        p_test = 1.0 / (1.0 + np.exp(-eta_test))
+        print('Bayes error: {0}'.format(np.mean(np.minimum(p_test, 1-p_test))))
+    elif test=='bandgap':
+        path = os.path.join('/'.join(os.getcwd().split('/')[:-1]), 'icol_exp', 'Input', 'data_HTE.csv')
+        df = pd.read_csv(path)
+        y = df['Y_oxygenate'].values
+        X = df.drop(columns=['material_and_condition', 'Y_oxygenate'])
+        feature_names = X.columns
+        X = X.values
+
+        rung = 2
+        small = ['sin', 'cos', 'log', 'abs', 'sqrt', 'cbrt', 'sq', 'cb', 'inv']
+        big = ['six_pow', 'exp', 'add', 'mul', 'div', 'abs_diff']
+        small = [(op, range(rung)) for op in small]
+        big = [(op, range(1)) for op in big]
+        ops = small+big
+
+        FE = FeatureExpansion(rung=rung, ops=ops)
+        Phi_names, Phi_symbols, Phi_ = FE.expand(X=X, names=feature_names, check_pos=True, verbose=True)
+
+        X_train, X_test, y_train, y_test = train_test_split(Phi_, y, test_size=0.2, random_state=random_state)
+        for i, s in enumerate([1,2,3,4,5,6,7,8,9,10,20,30,40,50,60,70,80,90,100,200,300,400]):
+            icl = ICL(s=s, so=AdaptiveLASSO(gamma=1, fit_intercept=False), k=5, fit_intercept=True,
+                      normalize=True, optimize_k=True, track_intermediates=False)
+
+            icl.fit(X_train, y_train, feature_names=Phi_names, verbose=0)
+            y_test_hat = icl.predict(X_test)
+            score = r2(y_test, y_test_hat)
+            print('model={0}, s={2}, r2={1}'.format(icl.__repr__(), score, s))