icol 0.7.4__py3-none-any.whl → 0.8.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
icol/icol.py CHANGED
@@ -14,6 +14,7 @@ from sklearn.base import clone
 from sklearn.model_selection import train_test_split
 
 from sklearn.metrics import mean_squared_error
+from sklearn.linear_model import LogisticRegressionCV
 
 def rmse(y_true, y_pred):
     return mean_squared_error(y_true, y_pred, squared=False)
@@ -625,7 +626,8 @@ class ICL:
             'fit_intercept': self.fit_intercept,
             'normalize': self.normalize,
             'pool_reset': self.pool_reset,
-            'self.optimize_k': self.optimize_k
+            'optimize_k': self.optimize_k,
+            'track_intermediates': self.track_intermediates
         }
 
     def __str__(self):
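
Note: if this dict backs ICL.get_params, as the new LOGISTIC_ICL code below suggests (it clones its inner ICL on every iteration), the renamed key is more than cosmetic: sklearn.base.clone rebuilds an estimator by passing the get_params keys back to __init__, so a key like 'self.optimize_k' that matches no constructor argument raises a TypeError. A minimal illustration with a hypothetical Toy class, not icol code:

# Hypothetical Toy estimator (not from icol): get_params keys must mirror
# __init__ argument names once sklearn.base.clone is involved.
from sklearn.base import clone

class Toy:
    def __init__(self, optimize_k=False):
        self.optimize_k = optimize_k

    def get_params(self, deep=False):
        # a key of 'self.optimize_k' here would make clone(...) raise
        # TypeError: unexpected keyword argument 'self.optimize_k'
        return {'optimize_k': self.optimize_k}

print(clone(Toy(optimize_k=True)).optimize_k)  # True
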
@@ -693,7 +695,11 @@ class ICL:
         i = 0
         IC = np.infty
         while i < stopping:
-            self.intercept_ = np.mean(res).squeeze()
+            if self.fit_intercept:
+                self.intercept_ = np.mean(res).squeeze()
+            else:
+                self.intercept_ = 0
+
             if verbose: print('.', end='')
 
             p, sis_i = self.sis(X=X, res=res, pool=list(pool_), verbose=verbose)
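
Note: the guard changes behaviour, not the formula. When an intercept is wanted, the least-squares intercept for fixed slopes is simply the mean of the current residuals, which is what the loop keeps assigning. A toy check of that identity (not icol code):

# Toy check (not icol code): with the slopes held fixed, the intercept
# minimizing sum((res - b)**2) is the residual mean.
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 2))
beta = np.array([2.0, -1.0])
y = X @ beta + 3.0 + 0.1 * rng.standard_normal(200)
res = y - X @ beta       # residuals before adding any intercept
print(np.mean(res))      # ~3.0, the true intercept
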
@@ -713,17 +719,20 @@ class ICL:
             idx = np.nonzero(beta)[0]
             if self.normalize:
                 coef = (beta[idx].reshape(1, -1)*self.b_y/self.b_x[idx].reshape(1, -1))
-                intercept_ = self.a_y - coef.dot(self.a_x[idx])
+                if self.fit_intercept:
+                    intercept_ = self.a_y - coef.dot(self.a_x[idx])
             else:
                 coef = beta[idx]
-                intercept_ = self.intercept_
-            coef = coef[0]
+                if self.fit_intercept:
+                    intercept_ = self.intercept_
+            if len(coef.shape) > 1:
+                coef = coef[0]
             expr = ''.join([('+' if float(c) >= 0 else '') + str(np.round(float(c), 3)) + str(self.feature_names_[idx][q]) for q, c in enumerate(coef)])
             if verbose: print('Model after {0} iterations: {1}'.format(i, expr))
 
             self.intermediates[i, 0] = deepcopy(idx)
             self.intermediates[i, 1] = coef # deepcopy(beta[idx])
-            self.intermediates[i, 2] = intercept_
+            self.intermediates[i, 2] = intercept_ if self.fit_intercept else 0
             self.intermediates[i, 3] = self.feature_names_[idx]
             self.intermediates[i, 4] = expr
 
@@ -742,7 +751,7 @@ class ICL:
             if verbose: print()
 
         self.beta_ = beta
-        self.intercept_ = np.mean(res).squeeze()
+        self.intercept_ = np.mean(res).squeeze() if self.fit_intercept else 0
 
         self.beta_idx_ = list(np.nonzero(self.beta_)[0])
         self.beta_sparse_ = self.beta_[self.beta_idx_]
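
Note on the normalize branch two hunks up: it maps coefficients fitted on standardized data back to raw units, with slope beta*b_y/b_x and intercept a_y - coef·a_x, so skipping the intercept line when fit_intercept is False is the consistent choice. A toy check of that back-transformation (not icol code):

# Toy check (not icol code) of the de-normalization used above.
import numpy as np

rng = np.random.default_rng(1)
X = 5.0 + 2.0 * rng.standard_normal((300, 2))
y = X @ np.array([1.5, -0.5]) + 4.0
a_x, b_x = X.mean(axis=0), X.std(axis=0)
a_y, b_y = y.mean(), y.std()
Xs, ys = (X - a_x) / b_x, (y - a_y) / b_y       # standardized copies
beta, *_ = np.linalg.lstsq(Xs, ys, rcond=None)  # slope fit, no intercept
coef = beta * b_y / b_x                         # raw-scale slopes
intercept = a_y - coef @ a_x                    # raw-scale intercept
print(np.round(coef, 3), np.round(intercept, 3))  # [ 1.5 -0.5] 4.0
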
@@ -957,6 +966,7 @@ class FeatureExpansion:
         if verbose: print('Estimating the creation of around {0} features'.format(self.estimate_workload(p=p, max_rung=self.rung, verbose=verbose>2)))
 
         names, symbols, X = self.expand_aux(X=X, names=names, symbols=symbols, crung=0, prev_p=0, verbose=verbose)
+
         if not(f is None):
             import pandas as pd
             df = pd.DataFrame(data=X, columns=names)
@@ -1060,7 +1070,7 @@ class FeatureExpansion:
                 new_op_symbols = sym_vect(idx2)
                 new_op_names = str_vectorize(new_op_symbols)
                 X_i = X[:, idx1]
-                new_op_X = X_i[:, np.newaxis]*X[:, idx2]
+                new_op_X = op_np(X_i[:, np.newaxis], X[:, idx2]) #X_i[:, np.newaxis]*X[:, idx2]
                 new_names, new_symbols, new_X = self.add_new(new_names=new_names, new_symbols=new_symbols, new_X=new_X,
                                                              new_name=new_op_names, new_symbol=new_op_symbols, new_X_i=new_op_X, verbose=verbose)
                 if not(new_names is None):
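
Note: the replaced line hard-coded multiplication; dispatching through the binary operator op_np lets every pairwise operator (mul, div, abs_diff, ...) reuse the same broadcasting pattern. A toy sketch of that pattern (op_np itself is icol-internal and not reproduced here):

# Toy broadcasting sketch (not icol code): one feature column paired
# against a block of columns in a single vectorized call.
import numpy as np

X = np.arange(1.0, 13.0).reshape(4, 3)         # 4 samples, 3 features
X_i = X[:, 0]                                  # shape (4,)
mul = X_i[:, np.newaxis] * X[:, 1:]            # old behaviour: products
div = np.divide(X_i[:, np.newaxis], X[:, 1:])  # same pattern, any ufunc
print(mul.shape, div.shape)                    # (4, 2) (4, 2)
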
@@ -1077,27 +1087,320 @@ class FeatureExpansion:
         if verbose: print('{0} features'.format(X.shape[1]))
 
         return self.expand_aux(X=X, names=names, symbols=symbols, crung=crung+1, prev_p=prev_p, verbose=verbose)
+
+class LOGISTIC_ICL:
+    def __init__(self, s, so, k, pool_reset=False, track_intermediates=False, max_iter=100, tol=1e-6, eps=1e-12, damping=0.5, prec=3):
+        self.s = s
+        self.so = so
+        self.k = k
+        self.pool_reset = pool_reset
+        self.track_intermediates = track_intermediates
+        self.max_iter = int(max_iter)
+        self.tol = float(tol)
+        self.eps = float(eps)
+        self.damping = float(damping)
+        self.prec = prec
+
+        self.icl = ICL(s=s, so=so, k=k, normalize=False, fit_intercept=False, optimize_k=True, track_intermediates=self.track_intermediates)
+        self.coef_ = None      # (p,)
+        self.intercept_ = 0.0  # scalar
+
+    def get_params(self, deep=False):
+        params = {
+            "s": self.s,
+            "so": self.so,
+            "k": self.k,
+            "pool_reset": self.pool_reset,
+            "track_intermediates": self.track_intermediates,
+            "max_iter": self.max_iter,
+            "tol": self.tol,
+            "eps": self.eps,
+            "damping": self.damping,
+            "prec": self.prec
+        }
+        if deep:
+            # expose inner ICL params using sklearn convention
+            for key, value in self.icl.get_params(deep=True).items():
+                params[f"icl__{key}"] = value
+
+        return params
+
+    def decision_function(self, X):
+        X = np.asarray(X, dtype=float)
+        if self.coef_ is None:
+            raise ValueError("Model is not fitted yet.")
+        return X @ self.coef_ + self.intercept_
+
+    def predict_proba(self, X):
+        eta = self.decision_function(X)
+        p1 = self._sigmoid(eta)
+        p1 = np.clip(p1, self.eps, 1.0 - self.eps)
+        return np.column_stack([1.0 - p1, p1])
+
+    def predict(self, X, threshold=0.5):
+        return (self.predict_proba(X)[:, 1] >= threshold).astype(int)
+
+    def __repr__(self):
+        return '\n'.join([('+' if beta >= 0 else '') +
+                          sci(beta, self.prec) +
+                          '(' +
+                          str(self.feature_names[i]) +
+                          ')'
+                          for i, beta in enumerate(self.coef_) if abs(beta) > 0]) + '\n' + ('+' if self.intercept_ >= 0 else '') + sci(self.intercept_, self.prec)
+
+    def __str__(self):
+        return 'LOGISTIC_ICL({0})'.format(str(self.get_params()))
+
+    def fit(self, X, y, feature_names=None, verbose=False):
+        n, p = X.shape
+
+        beta = np.zeros(p, dtype=float)
+        b = 0.0
+
+        for it in range(self.max_iter):
+            if verbose: print('iteration {0}'.format(it))
+            # predicting with current model
+            eta = np.dot(X, beta) + b
+
+            # converting to probabilities
+            p_hat = self._sigmoid(eta)
+            p_hat = np.clip(p_hat, self.eps, 1.0 - self.eps)
+
+            # row weights
+            w = p_hat * (1.0 - p_hat)
+            w = np.clip(w, self.eps, 1.0)
+
+            # reweighted target (IRLS working response)
+            z = eta + (y - p_hat) / w
+
+            w_sum = w.sum()
+            xbar = (w[:, None] * X).sum(axis=0) / w_sum
+            zbar = (w * z).sum() / w_sum
+
+            Xc = X - xbar
+            zc = z - zbar
+
+            s = np.sqrt(w)
+            X_star = Xc * s[:, None]
+            z_star = zc * s
+
+            # fitting icl model to reweighted data
+            icl_iter = clone(self.icl)
+            icl_iter.fit(X_star, z_star, feature_names=feature_names, verbose=verbose>1)
+
+            beta_new = np.zeros(p, dtype=float)
+            beta_new[icl_iter.beta_idx_] = icl_iter.beta_
+
+            b_new = zbar - xbar @ beta_new
+
+            # updating previous solution as linear combination of past and current
+            beta_next = beta + self.damping * (beta_new - beta)
+            b_next = b + self.damping * (b_new - b)
+
+            # stopping criteria
+            delta = np.linalg.norm(beta_next - beta, 2) + abs(b_next - b)
+            if verbose:
+                print(f"iter {it+1}: delta={delta:.3e}")
+
+            beta, b = beta_next, b_next
+
+            if delta <= self.tol:
+                if verbose: print('converged')
+                break
 
-if __name__ == "__main__":
-    from sklearn.model_selection import train_test_split
+        if verbose and (it == self.max_iter - 1): print('did not converge')
+
+        tol = 1e-10
+
+        # number of nonzeros in the fitted model
+        nz_idx = np.flatnonzero(np.abs(beta) > tol)
+        nz = int(np.sum(np.abs(beta) > tol))
+
+        std = X.std(axis=0, ddof=0)
+        scores = np.abs(beta) * std
+
+        kk = min(self.k, nz)
+
+        # choose among nonzero coefficients only (avoids picking zero-weight features)
+        if kk == 0:
+            # degenerate: no features selected
+            self.coef_ = np.zeros_like(beta)
+            self.idx = np.array([], dtype=int)
+            self.intercept_ = float(b)
+        else:
+            order = nz_idx[np.argsort(scores[nz_idx])[-kk:][::-1]]
+            XS = X[:, order]
+
+            clf = LogisticRegressionCV(
+                Cs=20,  # or a list/array like np.logspace(-4, 4, 30)
+                cv=5,
+                penalty="l2",
+                solver="lbfgs",
+                scoring="neg_log_loss",
+                fit_intercept=True,
+                max_iter=5000,
+                n_jobs=-1,
+                refit=True,
+            )
+            clf.fit(XS, y)
+
+            self.coef_ = np.zeros_like(beta)
+            self.coef_[order] = clf.coef_.ravel()
+            self.idx = order
+            self.intercept_ = float(clf.intercept_[0])
+
+        self.feature_names_sparse_ = feature_names[self.idx]
+        self.feature_names = feature_names
+        return self
+
+ @staticmethod
1257
+ def _sigmoid(eta):
1258
+ # stable sigmoid
1259
+ eta = np.asarray(eta, dtype=float)
1260
+ out = np.empty_like(eta, dtype=float)
1261
+ pos = eta >= 0
1262
+ neg = ~pos
1263
+ out[pos] = 1.0 / (1.0 + np.exp(-eta[pos]))
1264
+ exp_eta = np.exp(eta[neg])
1265
+ out[neg] = exp_eta / (1.0 + exp_eta)
1266
+ return out
1267
+
1268
+ def zero_one_loss(X, y, model):
1269
+ y = np.asarray(y)
1270
+ y_hat = model.predict(X)
1271
+ return np.mean(y_hat != y)
1272
+
1273
+ def hinge_loss(X, y, model):
1274
+ y = np.asarray(y)
1275
+ y_pm = 2*y - 1 # {0,1} -> {-1,+1}
1276
+ eta = model.decision_function(X)
1277
+ return np.mean(np.maximum(0.0, 1.0 - y_pm * eta))
1278
+
1279
+ def log_loss(X, y, model):
1280
+ y = np.asarray(y)
1281
+ eta = model.decision_function(X)
1282
+ return np.mean(np.logaddexp(0.0, eta) - y*eta)
1283
+
1284
+
1285
+ sci = lambda x, sig=3: f"{float(x):.{sig}e}"
1286
+
+if __name__ == "__main__":
+    test = "bandgap"
     random_state = 0
     np.random.seed(random_state)
-    n, p = 10000, 10
-    X = np.random.random(size=(n,p))
-    y = np.sqrt(X[:, 0]) - np.cbrt(X[:, 0]) + X[:, 0]**3 - np.log(X[:, 0]) + np.sin(X[:, 0]) + 1
-    names = ['X_{0}'.format(i) for i in range(p)]
-
-    rung = 1
-    small = ['sin', 'cos', 'log', 'abs', 'sqrt', 'cbrt', 'sq', 'cb', 'inv']
-    big = ['six_pow', 'exp', 'add', 'mul', 'div', 'abs_diff']
-    small = [(op, range(rung)) for op in small]
-    big = [(op, range(1)) for op in big]
-    ops = small+big
-
-    FE = FeatureExpansion(rung=rung, ops=ops)
-    Phi_names, Phi_symbols, Phi_ = FE.expand(X=X, names=names, check_pos=True, verbose=True)
-    X_train, X_test, y_train, y_test = train_test_split(Phi_, y, test_size=0.2, random_state=random_state)
-    for i, s in enumerate([5]):
-        icl = ICL(s=s, so=AdaptiveLASSO(gamma=1), k=5, fit_intercept=True, normalize=True, optimize_k=False, track_intermediates=True)
-        icl.fit(X=X_train, y=y_train, feature_names = Phi_names, verbose=False)
-        print(icl.repr_ensemble())
+    from sklearn.model_selection import train_test_split
+    from sklearn.metrics import r2_score as r2
+
+    import pandas as pd
+    import os
+
+    if test == "DIABETES":
+        df = pd.read_csv(os.path.join(os.getcwd(), "Input", "pima.csv"))
+        df["DIABETES"] = df["DIABETES"].map({"Y": 1, "N": 0})
+        y = df['DIABETES'].values
+        X = df.drop(columns=['DIABETES'])
+        feature_names = X.columns
+        X = X.values
+
+        rung = 2
+        small = ['sin', 'cos', 'log', 'abs', 'sqrt', 'cbrt', 'sq', 'cb', 'inv']
+        big = ['six_pow', 'exp', 'add', 'mul', 'div', 'abs_diff']
+        small = [(op, range(rung)) for op in small]
+        big = [(op, range(1)) for op in big]
+        ops = small+big
+
+        FE = FeatureExpansion(rung=rung, ops=ops)
+        Phi_names, Phi_symbols, Phi_ = FE.expand(X=X, names=feature_names, check_pos=True, verbose=True)
+        X_train, X_test, y_train, y_test = train_test_split(Phi_, y, test_size=0.2, random_state=random_state)
+
+        logistic_icl_params = {
+            "s": 10,
+            "so": AdaptiveLASSO(gamma=1, fit_intercept=False),
+            "k": 5,
+            "pool_reset": False,
+            "track_intermediates": False,
+            "max_iter": 1000,
+            "tol": 1e-1,
+            "eps": 1e-3,
+            "damping": 0.5,
+            "prec": 3
+        }
+
+        icl_log = LOGISTIC_ICL(**logistic_icl_params)
+        icl_log.fit(X=X_train, y=y_train, feature_names=Phi_names, verbose=1)
+
+        print(icl_log.__repr__())
+        print('zero_one: {0}'.format(zero_one_loss(X_test, y_test, icl_log)))
+        print('hinge: {0}'.format(hinge_loss(X_test, y_test, icl_log)))
+        print('logloss: {0}'.format(log_loss(X_test, y_test, icl_log)))
+    elif test == "Synthetic":
+        k, n, p = 3, 10000, 1000
+        rng = np.random.default_rng(random_state)
+        X = rng.standard_normal((n, p))
+        feature_names = np.array(['X_{0}'.format(i) for i in range(p)])
+        support = range(k)
+        beta = np.zeros(p, dtype=float)
+        signs = rng.choice([-1.0, 1.0], size=k)
+        mags = rng.uniform(0.5, 1.5, size=k)
+        beta[support] = signs * mags
+        eta_no_b = X @ beta
+        b = float(-np.mean(eta_no_b))
+        eta = eta_no_b + b
+        p1 = 1.0 / (1.0 + np.exp(-np.clip(eta, -50, 50)))
+        y = rng.binomial(1, p1, size=n).astype(int)
+
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
+
+        logistic_icl_params = {
+            "s": 10,
+            "so": AdaptiveLASSO(gamma=1, fit_intercept=False),
+            "k": k,
+            "pool_reset": False,
+            "track_intermediates": False,
+            "max_iter": 1000,
+            "tol": 1e-3,
+            "eps": 1e-6,
+            "damping": 0.8,
+            "prec": 3
+        }
+
+        icl_log = LOGISTIC_ICL(**logistic_icl_params)
+        icl_log.fit(X=X_train, y=y_train, feature_names=feature_names, verbose=1)
+
+        print(icl_log.__repr__())
+        print('zero_one: {0}'.format(zero_one_loss(X_test, y_test, icl_log)))
+        print('hinge: {0}'.format(hinge_loss(X_test, y_test, icl_log)))
+        print('logloss: {0}'.format(log_loss(X_test, y_test, icl_log)))
+
+        print('True Coef: {0}'.format(beta[:k]))
+        print('True intercept: {0}'.format(b))
+        eta_test = icl_log.decision_function(X_test)  # log-odds
+        p_test = 1.0 / (1.0 + np.exp(-eta_test))
+        print('Bayes error: {0}'.format(np.mean(np.minimum(p_test, 1-p_test))))
+    elif test == 'bandgap':
+        path = os.path.join('/'.join(os.getcwd().split('/')[:-1]), 'icol_exp', 'Input', 'data_HTE.csv')
+        df = pd.read_csv(path)
+        y = df['Y_oxygenate'].values
+        X = df.drop(columns=['material_and_condition', 'Y_oxygenate'])
+        feature_names = X.columns
+        X = X.values
+
+        rung = 2
+        small = ['sin', 'cos', 'log', 'abs', 'sqrt', 'cbrt', 'sq', 'cb', 'inv']
+        big = ['six_pow', 'exp', 'add', 'mul', 'div', 'abs_diff']
+        small = [(op, range(rung)) for op in small]
+        big = [(op, range(1)) for op in big]
+        ops = small+big
+
+        FE = FeatureExpansion(rung=rung, ops=ops)
+        Phi_names, Phi_symbols, Phi_ = FE.expand(X=X, names=feature_names, check_pos=True, verbose=True)
+
+        X_train, X_test, y_train, y_test = train_test_split(Phi_, y, test_size=0.2, random_state=random_state)
+        for i, s in enumerate([1,2,3,4,5,6,7,8,9,10,20,30,40,50,60,70,80,90,100,200,300,400]):
+            icl = ICL(s=s, so=AdaptiveLASSO(gamma=1, fit_intercept=False), k=5, fit_intercept=True,
+                      normalize=True, optimize_k=True, track_intermediates=False)
+
+            icl.fit(X_train, y_train, feature_names=Phi_names, verbose=0)
+            y_test_hat = icl.predict(X_test)
+            score = r2(y_test, y_test_hat)
+            print('model={0}, s={2}, r2={1}'.format(icl.__repr__(), score, s))
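
Note: taken together, the new class is a damped IRLS (iteratively reweighted least squares) wrapper around ICL. Each pass forms the working response z = eta + (y - p)/w with weights w = p(1 - p), solves a weighted, centered least-squares problem with a freshly cloned ICL, and blends the new solution into the old one. A condensed numpy-only sketch of that loop, with np.linalg.lstsq standing in for the sparse ICL solve (nothing below is icol API):

# Minimal damped-IRLS sketch mirroring LOGISTIC_ICL.fit; plain least squares
# replaces the inner ICL solve, everything else follows the diff above.
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((500, 3))
true_beta = np.array([1.0, -2.0, 0.5])
y = rng.binomial(1, 1.0 / (1.0 + np.exp(-(X @ true_beta))))

beta, b, damping, eps = np.zeros(3), 0.0, 0.5, 1e-12
for it in range(100):
    eta = X @ beta + b
    p = np.clip(1.0 / (1.0 + np.exp(-eta)), eps, 1 - eps)
    w = np.clip(p * (1.0 - p), eps, 1.0)           # IRLS row weights
    z = eta + (y - p) / w                          # working response
    xbar = (w[:, None] * X).sum(axis=0) / w.sum()  # weighted centring folds
    zbar = (w * z).sum() / w.sum()                 # the intercept out of the solve
    s = np.sqrt(w)
    beta_new, *_ = np.linalg.lstsq(s[:, None] * (X - xbar), s * (z - zbar), rcond=None)
    b_new = zbar - xbar @ beta_new
    beta_next = beta + damping * (beta_new - beta)  # damped update, as in the diff
    b_next = b + damping * (b_new - b)
    delta = np.linalg.norm(beta_next - beta) + abs(b_next - b)
    beta, b = beta_next, b_next
    if delta <= 1e-8:
        break

print(np.round(beta, 2), np.round(b, 2))  # close to [ 1. -2.  0.5] and 0

The damping factor trades speed for stability: damping=1 would take full IRLS steps, while the fractional values used in the __main__ examples (0.5 and 0.8) help keep the sparse inner solver from oscillating between supports across iterations.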