icol 0.8.4__tar.gz → 0.8.5__tar.gz

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: icol
-Version: 0.8.4
+Version: 0.8.5
 Summary: Iterative Correlation Learning implementation
 Author-email: Simon Teshuva <simon.teshuva@gmail.com>
 License: MIT
@@ -1070,7 +1070,7 @@ class FeatureExpansion:
                                 new_op_symbols = sym_vect(idx2)
                                 new_op_names = str_vectorize(new_op_symbols)
                                 X_i = X[:, idx1]
-                                new_op_X = X_i[:, np.newaxis]*X[:, idx2]
+                                new_op_X = op_np(X_i[:, np.newaxis], X[:, idx2]) #X_i[:, np.newaxis]*X[:, idx2]
                                 new_names, new_symbols, new_X = self.add_new(new_names=new_names, new_symbols=new_symbols, new_X=new_X,
                                                                              new_name=new_op_names, new_symbol=new_op_symbols, new_X_i=new_op_X, verbose=verbose)
             if not(new_names is None):
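
Note on the change above: 0.8.4 hard-coded an elementwise product for every binary operator, so 'div', 'add', 'sub' and 'abs_diff' silently produced products; 0.8.5 dispatches to the operator's own NumPy implementation. A minimal sketch of the broadcasting pattern the fixed line relies on (names are illustrative, not from the package):

    import numpy as np

    X = np.arange(12, dtype=float).reshape(4, 3)
    X_i = X[:, 0]                 # one column, shape (4,)
    idx2 = range(2)               # candidate partner columns
    op_np = np.subtract           # any binary ufunc; 0.8.4 always multiplied

    # (4, 1) broadcast against (4, 2) -> one new feature per partner column
    new_op_X = op_np(X_i[:, np.newaxis], X[:, idx2])
    print(new_op_X.shape)         # (4, 2)
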
@@ -1285,10 +1285,12 @@ def log_loss(X, y, model):
 sci = lambda x, sig=3: f"{float(x):.{sig}e}"
 
 if __name__ == "__main__":
-    test = "Synthetic"
+    test = "bandgap"
     random_state = 0
     np.random.seed(random_state)
     from sklearn.model_selection import train_test_split
+    from sklearn.metrics import r2_score as r2
+
     import pandas as pd
     import os
 
@@ -1330,8 +1332,7 @@ if __name__ == "__main__":
         print(icl_log.__repr__())
         print('zero_one: {0}'.format(zero_one_loss(X_test, y_test, icl_log)))
         print('hinge: {0}'.format(hinge_loss(X_test, y_test, icl_log)))
-        print('logloss: {0}'.format(log_loss(X_test, y_test, icl_log)))
-
+        print('logloss: {0}'.format(log_loss(X_test, y_test, icl_log)))
     elif test=="Synthetic":
         k,n,p=3,10000,1000
         rng = np.random.default_rng(random_state)
@@ -1376,3 +1377,30 @@ if __name__ == "__main__":
         eta_test = icl_log.decision_function(X_test) # log-odds
         p_test = 1.0 / (1.0 + np.exp(-eta_test))
         print('Bayes error: {0}'.format(np.mean(np.minimum(p_test, 1-p_test))))
+    elif test=='bandgap':
+        path = os.path.join('/'.join(os.getcwd().split('/')[:-1]), 'icol_exp', 'Input', 'data_HTE.csv')
+        df = pd.read_csv(path)
+        y = df['Y_oxygenate'].values
+        X = df.drop(columns=['material_and_condition', 'Y_oxygenate'])
+        feature_names = X.columns
+        X = X.values
+
+        rung = 2
+        small = ['sin', 'cos', 'log', 'abs', 'sqrt', 'cbrt', 'sq', 'cb', 'inv']
+        big = ['six_pow', 'exp', 'add', 'mul', 'div', 'abs_diff']
+        small = [(op, range(rung)) for op in small]
+        big = [(op, range(1)) for op in big]
+        ops = small+big
+
+        FE = FeatureExpansion(rung=rung, ops=ops)
+        Phi_names, Phi_symbols, Phi_ = FE.expand(X=X, names=feature_names, check_pos=True, verbose=True)
+
+        X_train, X_test, y_train, y_test = train_test_split(Phi_, y, test_size=0.2, random_state=random_state)
+        for i, s in enumerate([1,2,3,4,5,6,7,8,9,10,20,30,40,50,60,70,80,90,100,200,300,400]):
+            icl = ICL(s=s, so=AdaptiveLASSO(gamma=1, fit_intercept=False), k=5, fit_intercept=True,
+                      normalize=True, optimize_k=True, track_intermediates=False)
+
+            icl.fit(X_train, y_train, feature_names=Phi_names, verbose=0)
+            y_test_hat = icl.predict(X_test)
+            score = r2(y_test, y_test_hat)
+            print('model={0}, s={2}, r2={1}'.format(icl.__repr__(), score, s))
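
The new 'bandgap' branch sweeps the screening size s over an AdaptiveLASSO-based ICL and reports held-out R^2. A hedged, self-contained sketch of the same pipeline on synthetic data (the data_HTE.csv input above is not bundled with the package; ICL and AdaptiveLASSO are the classes icol.py already exercises here, and the target function below is illustrative):

    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import r2_score as r2

    rng = np.random.default_rng(0)
    X = rng.uniform(0.1, 2.0, size=(200, 5))      # positive, so check_pos holds
    y = X[:, 0] * X[:, 1] + np.sqrt(X[:, 2])

    ops = [(op, range(2)) for op in ['log', 'sqrt', 'sq', 'inv']] \
        + [(op, range(1)) for op in ['mul', 'div']]
    FE = FeatureExpansion(rung=2, ops=ops)
    names, symbols, Phi = FE.expand(X=X, names=['x%d' % i for i in range(5)],
                                    check_pos=True)

    Phi_tr, Phi_te, y_tr, y_te = train_test_split(Phi, y, random_state=0)
    for s in [1, 5, 10]:
        icl = ICL(s=s, so=AdaptiveLASSO(gamma=1, fit_intercept=False), k=5,
                  fit_intercept=True, normalize=True, optimize_k=True,
                  track_intermediates=False)
        icl.fit(Phi_tr, y_tr, feature_names=names, verbose=0)
        print(s, r2(y_te, icl.predict(Phi_te)))
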
@@ -0,0 +1,1176 @@
+import warnings
+warnings.filterwarnings('ignore')
+
+from time import time
+from copy import deepcopy
+from itertools import combinations, permutations
+
+import numpy as np
+import sympy as sp
+
+from sklearn.linear_model import lars_path, Ridge, Lars
+from sklearn.preprocessing import PolynomialFeatures
+from sklearn.base import clone
+from sklearn.model_selection import train_test_split
+
+from sklearn.metrics import mean_squared_error, log_loss, zero_one_loss, hinge_loss, roc_auc_score
+from sklearn.linear_model import LogisticRegression
+
+def rmse(y_true, y_pred):
+    return np.sqrt(np.mean((y_true - y_pred) ** 2))
+
+def corr(X, g):
+    sigma_X = np.std(X, axis=0)
+    sigma_Y = np.std(g)
+
+    XY = X*g.reshape(-1, 1)
+    E_XY = np.mean(XY, axis=0)
+    E_X = np.mean(X, axis=0)
+    E_Y = np.mean(g)
+    cov = E_XY - E_X*E_Y
+    sigma = sigma_X*sigma_Y
+    pearsons = cov/sigma
+    absolute_pearsons = np.abs(pearsons)
+    absolute_pearsons[np.isnan(absolute_pearsons)] = 0  # setting all rows of constants to have 0 correlation
+    absolute_pearsons[np.isinf(absolute_pearsons)] = 0  # setting all rows of constants to have 0 correlation
+    absolute_pearsons[np.isneginf(absolute_pearsons)] = 0
+
+    return absolute_pearsons
+
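
corr() is the screening score: the absolute Pearson correlation of each column of X with a target vector g (a residual or gradient), with constant columns mapped to 0. A quick, illustrative consistency check against np.corrcoef, assuming the function above is in scope (both sides use the population std, so the ddof factors cancel):

    import numpy as np

    rng = np.random.default_rng(0)
    X = rng.standard_normal((100, 4))
    g = rng.standard_normal(100)
    ref = np.array([abs(np.corrcoef(X[:, j], g)[0, 1]) for j in range(4)])
    # corr as defined above; should agree to floating-point error
    assert np.allclose(corr(X, g), ref)
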
+def squared(X, y, model):
+    y_hat = model.predict(X).ravel()
+    res = y - y_hat
+    return corr(X, res)
+
+def df_log_loss(X, y, model):
+    eta = model.decision_function(X).ravel()  # real-valued score
+    p = 1.0 / (1.0 + np.exp(-np.clip(eta, -30, 30)))
+    g = y - p
+    return corr(X, g)
+
+def _sigmoid(z, clp=30):
+    z = np.clip(z, -clp, clp)
+    return 1.0 / (1.0 + np.exp(-z))
+
+OBJECTIVE_DICT = {
+    'squared': squared,
+    'logistic': df_log_loss
+}
+
+LOSS_DICT = {
+    'squared': rmse,
+    'zero_one': zero_one_loss,
+    'hinge': hinge_loss,
+    'logloss': log_loss,
+    'logistic': log_loss
+}
+
+OP_DICT = {
+    'sin': {
+        'op': sp.sin,
+        'op_np': np.sin,
+        'inputs': 1,
+        'commutative': True,
+        'cares_units': False
+    },
+    'cos': {
+        'op': sp.cos,
+        'op_np': np.cos,
+        'inputs': 1,
+        'commutative': True,
+        'cares_units': False
+    },
+    'log': {
+        'op': sp.log,
+        'op_np': np.log,
+        'inputs': 1,
+        'commutative': True,
+        'cares_units': False
+    },
+    'exp': {
+        'op': sp.exp,
+        'op_np': np.exp,
+        'inputs': 1,
+        'commutative': True,
+        'cares_units': False
+    },
+    'abs': {
+        'op': sp.Abs,
+        'op_np': np.abs,
+        'inputs': 1,
+        'commutative': True,
+        'cares_units': False
+    },
+    'sqrt': {
+        'op': sp.sqrt,
+        'op_np': np.sqrt,
+        'inputs': 1,
+        'commutative': True,
+        'cares_units': False
+    },
+    'cbrt': {
+        'op': lambda x: sp.Pow(x, sp.Rational(1, 3)),
+        'op_np': lambda x: np.power(x, 1/3),
+        'inputs': 1,
+        'commutative': True,
+        'cares_units': False
+    },
+    'sq': {
+        'op': lambda x: sp.Pow(x, 2),
+        'op_np': lambda x: np.power(x, 2),
+        'inputs': 1,
+        'commutative': True,
+        'cares_units': False
+    },
+    'cb': {
+        'op': lambda x: sp.Pow(x, 3),
+        'op_np': lambda x: np.power(x, 3),
+        'inputs': 1,
+        'commutative': True,
+        'cares_units': False
+    },
+    'six_pow': {
+        'op': lambda x: sp.Pow(x, 6),
+        'op_np': lambda x: np.power(x, 6),
+        'inputs': 1,
+        'commutative': True,
+        'cares_units': False
+    },
+    'inv': {
+        'op': lambda x: 1/x,
+        'op_np': lambda x: 1/x,
+        'inputs': 1,
+        'commutative': True,
+        'cares_units': False
+    },
+    'mul': {
+        'op': sp.Mul,
+        'op_np': np.multiply,
+        'inputs': 2,
+        'commutative': True,
+        'cares_units': False
+    },
+    'div': {
+        'op': lambda x, y: sp.Mul(x, 1/y),
+        'op_np': lambda x, y: np.multiply(x, 1/y),
+        'inputs': 2,
+        'commutative': False,
+        'cares_units': False
+    },
+    'add': {
+        'op': sp.Add,
+        'op_np': lambda x, y: x+y,
+        'inputs': 2,
+        'commutative': True,
+        'cares_units': False
+    },
+    'sub': {
+        'op': lambda x, y: sp.Add(x, -y),
+        'op_np': lambda x, y: x-y,
+        'inputs': 2,
+        'commutative': False,
+        'cares_units': False
+    },
+    'abs_diff': {
+        'op': lambda x, y: sp.Abs(sp.Add(x, -y)),
+        'op_np': lambda x, y: np.abs(x-y),
+        'inputs': 2,
+        'commutative': True,
+        'cares_units': False
+    },
+}
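
Each OP_DICT entry pairs a sympy operator (for naming and simplification) with its NumPy counterpart (for the data matrix), plus arity and commutativity metadata. An illustrative sketch of registering an extra unary operator in the same schema ('tanh' is not part of the released dict):

    import numpy as np
    import sympy as sp

    OP_DICT['tanh'] = {
        'op': sp.tanh,         # symbolic form, used to build feature names
        'op_np': np.tanh,      # numeric form, applied column-wise to X
        'inputs': 1,
        'commutative': True,   # irrelevant for unary ops, kept for uniformity
        'cares_units': False
    }
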
+
+class FeatureExpansion:
+    def __init__(self, ops, rung, printrate=1000):
+        self.ops = ops
+        self.rung = rung
+        self.printrate = printrate
+        self.prev_print = 0
+        for i, op in enumerate(self.ops):
+            if type(op) == str:
+                self.ops[i] = (op, range(rung))
+
+    def remove_redundant_features(self, symbols, names, X):
+        sorted_idxs = np.argsort(names)
+        for i, idx in enumerate(sorted_idxs):
+            if i == 0:
+                unique = [idx]
+            elif names[idx] != names[sorted_idxs[i-1]]:
+                unique += [idx]
+        unique_original_order = np.sort(unique)
+
+        return symbols[unique_original_order], names[unique_original_order], X[:, unique_original_order]
+
+    def expand(self, X, names=None, verbose=False, f=None, check_pos=False):
+        n, p = X.shape
+        if (names is None) or (len(names) != p):
+            names = ['x_{0}'.format(i) for i in range(X.shape[1])]
+
+        if check_pos == False:
+            symbols = sp.symbols(' '.join(name.replace(' ', '.') for name in names))
+        else:
+            symbols = []
+            for i, name in enumerate(names):
+                name = name.replace(' ', '.')
+                if np.all(X[:, i] > 0):
+                    sym = sp.symbols(name, real=True, positive=True)
+                else:
+                    sym = sp.symbols(name, real=True)
+                symbols.append(sym)
+
+        symbols = np.array(symbols)
+        names = np.array(names)
+
+        if verbose: print('Estimating the creation of around {0} features'.format(self.estimate_workload(p=p, max_rung=self.rung, verbose=verbose>2)))
+
+        names, symbols, X = self.expand_aux(X=X, names=names, symbols=symbols, crung=0, prev_p=0, verbose=verbose)
+
+        if not(f is None):
+            import pandas as pd
+            df = pd.DataFrame(data=X, columns=names)
+            df['y'] = y
+            df.to_csv(f)
+
+        return names, symbols, X
+
+    def estimate_workload(self, p, max_rung, verbose=False):
+        p0 = 0
+        p1 = p
+        for rung in range(max_rung):
+            if verbose: print('Applying rung {0} expansion'.format(rung))
+            new_u, new_bc, new_bn = 0, 0, 0
+            for (op, rung_range) in self.ops:
+                if rung in rung_range:
+                    if verbose: print('Applying {0} to {1} features will result in approximately '.format(op, p1-p0))
+                    if OP_DICT[op]['inputs'] == 1:
+                        new_u += p1
+                        if verbose: print('{0} new features'.format(p1))
+                    elif OP_DICT[op]['commutative'] == True:
+                        new_bc += (1/2)*(p1 - p0 + 1)*(p0 + p1 + 2)
+                        if verbose: print('{0} new features'.format((1/2)*(p1 - p0 + 1)*(p0 + p1 + 2)))
+                    else:
+                        new_bn += (p1 - p0 + 1)*(p0 + p1 + 2)
+                        if verbose: print('{0} new features'.format((p1 - p0 + 1)*(p0 + p1 + 2)))
+            p0 = p1
+            p1 = p1 + new_u + new_bc + new_bn
+            if verbose: print('For a total of {0} features by rung {1}'.format(p1, rung))
+        return p1
+
+    def add_new(self, new_names, new_symbols, new_X, new_name, new_symbol, new_X_i, verbose=False):
+        valid = (np.isnan(new_X_i).sum(axis=0) + np.isposinf(new_X_i).sum(axis=0) + np.isneginf(new_X_i).sum(axis=0)) == 0
+        if new_names is None:
+            new_names = np.array(new_name[valid])
+            new_symbols = np.array(new_symbol[valid])
+            new_X = np.array(new_X_i[:, valid])
+        else:
+            new_names = np.concatenate((new_names, new_name[valid]))
+            new_symbols = np.concatenate((new_symbols, new_symbol[valid]))
+            new_X = np.hstack([new_X, new_X_i[:, valid]])
+        # if (verbose > 1) and not(new_names is None) and (len(new_names) % self.printrate == 0): print('Created {0} features so far'.format(len(new_names)))
+        if (verbose > 1) and not(new_names is None) and (len(new_names) - self.prev_print >= self.printrate):
+            self.prev_print = len(new_names)
+            elapsed = np.round(time() - self.start_time, 2)
+            print('Created {0} features so far in {1} seconds'.format(len(new_names), elapsed))
+        return new_names, new_symbols, new_X
+
+    def expand_aux(self, X, names, symbols, crung, prev_p, verbose=False):
+
+        str_vectorize = np.vectorize(str)
+
+        def simplify_nested_powers(expr):
+            # Replace (x**n)**(1/n) with x
+            def flatten_pow_chain(e):
+                if isinstance(e, sp.Pow) and isinstance(e.base, sp.Pow):
+                    base, inner_exp = e.base.args
+                    outer_exp = e.exp
+                    combined_exp = inner_exp * outer_exp
+                    if sp.simplify(combined_exp) == 1:
+                        return base
+                    return sp.Pow(base, combined_exp)
+                elif isinstance(e, sp.Pow) and sp.simplify(e.exp) == 1:
+                    return e.base
+                return e
+            # Apply recursively
+            return expr.replace(
+                lambda e: isinstance(e, sp.Pow),
+                flatten_pow_chain
+            )
+
+        if crung == 0:
+            self.start_time = time()
+            symbols, names, X = self.remove_redundant_features(X=X, names=names, symbols=symbols)
+        if crung==self.rung:
+            if verbose: print('Completed {0} rounds of feature transformations'.format(self.rung))
+            return symbols, names, X
+        else:
+            if verbose: print('Applying round {0} of feature transformations'.format(crung+1))
+            # if verbose: print('Estimating the creation of {0} features this iteration'.format(self.estimate_workload(p=X.shape[1], max_rung=1)))
+
+            new_names, new_symbols, new_X = None, None, None
+
+            for (op_key, rung_range) in self.ops:
+                if crung in rung_range:
+                    if verbose>1: print('Applying operator {0} to {1} features'.format(op_key, X.shape[1]))
+                    op_params = OP_DICT[op_key]
+                    op_sym, op_np, inputs, comm = op_params['op'], op_params['op_np'], op_params['inputs'], op_params['commutative']
+                    if inputs == 1:
+                        sym_vect = np.vectorize(op_sym)
+                        new_op_symbols = sym_vect(symbols[prev_p:])
+                        new_op_X = op_np(X[:, prev_p:])
+                        new_op_names = str_vectorize(new_op_symbols)
+                        new_names, new_symbols, new_X = self.add_new(new_names=new_names, new_symbols=new_symbols, new_X=new_X,
+                                                                     new_name=new_op_names, new_symbol=new_op_symbols, new_X_i=new_op_X, verbose=verbose)
+                    elif inputs == 2:
+                        for idx1 in range(prev_p, X.shape[1]):
+                            sym_vect = np.vectorize(lambda idx2: op_sym(symbols[idx1], symbols[idx2]))
+                            idx2 = range(idx1 if comm else X.shape[1])
+                            if len(idx2) > 0:
+                                new_op_symbols = sym_vect(idx2)
+                                new_op_names = str_vectorize(new_op_symbols)
+                                X_i = X[:, idx1]
+                                new_op_X = X_i[:, np.newaxis]*X[:, idx2]
+                                new_names, new_symbols, new_X = self.add_new(new_names=new_names, new_symbols=new_symbols, new_X=new_X,
+                                                                             new_name=new_op_names, new_symbol=new_op_symbols, new_X_i=new_op_X, verbose=verbose)
+            if not(new_names is None):
+                names = np.concatenate((names, new_names))
+                symbols = np.concatenate((symbols, new_symbols))
+                prev_p = X.shape[1]
+                X = np.hstack([X, new_X])
+            else:
+                prev_p = X.shape[1]
+
+            if verbose: print('After applying rounds {0} of feature transformations there are {1} features'.format(crung+1, X.shape[1]))
+            if verbose: print('Removing redundant features leaves... ', end='')
+            symbols, names, X = self.remove_redundant_features(X=X, names=names, symbols=symbols)
+            if verbose: print('{0} features'.format(X.shape[1]))
+
+            return self.expand_aux(X=X, names=names, symbols=symbols, crung=crung+1, prev_p=prev_p, verbose=verbose)
+
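
The expansion is combinatorial in the rung count, which is why expand() prints estimate_workload() before doing any work. A standalone sketch of the same recurrence for 8 base features, two rungs, and one unary plus one commutative binary operator active at every rung:

    p0, p1 = 0, 8
    for rung in range(2):
        unary = p1                                   # one new column per column
        binary = (p1 - p0 + 1) * (p0 + p1 + 2) // 2  # commutative pairs
        p0, p1 = p1, p1 + unary + binary
    print(p1)  # rough feature-count estimate after two rungs
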
+class generalised_SIS:
+    def __init__(self, s, obj='squared'):
+        self.s=s
+        self.obj=obj
+
+    def __str__(self):
+        return 'SIS(s={0}, obj={1})'.format(self.s, self.obj)
+
+    def __repr__(self):
+        return self.__str__()
+
+    def __call__(self, X, y, model, pool):
+        scores = OBJECTIVE_DICT[self.obj](X=X, y=y, model=model)
+        idxs = np.argsort(scores)[::-1]
+
+        pool_set = set(pool)
+        chosen = []
+        for j in idxs:
+            if j not in pool_set:
+                chosen.append(j)
+                if len(chosen) == self.s:
+                    break
+
+        chosen = np.array(chosen, dtype=int)
+        return scores[chosen], chosen
+
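
The screening step simply takes the s highest-scoring feature indices that are not already in the pool. A minimal illustration of generalised_SIS.__call__'s selection rule:

    import numpy as np

    scores = np.array([0.9, 0.1, 0.8, 0.7])
    pool = {0}
    s = 2
    chosen = [j for j in np.argsort(scores)[::-1] if j not in pool][:s]
    print(chosen)  # [2, 3]
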
+class LOGISTIC_LASSO:
+    def __init__(self, C_grid=np.logspace(-4, 2, 100), solver="saga",
+                 class_weight=None, max_iter=5000, tol=1e-4, eps_nnz=1e-12,
+                 clp=30, random_state=None):
+        self.C_grid = np.sort(np.asarray(C_grid, dtype=float))
+        self.solver = solver
+        self.class_weight = class_weight
+        self.max_iter = max_iter
+        self.tol = tol
+        self.eps_nnz = eps_nnz
+        self.random_state = random_state
+        self.clp = clp
+
+        self.models = np.array([LogisticRegression(C=c,
+            solver=self.solver, class_weight=self.class_weight,
+            max_iter=self.max_iter, tol=self.tol, random_state=random_state,
+            penalty='l1', l1_ratio=1, fit_intercept=False,
+            ) for c in self.C_grid], dtype=object)
+
+    def get_params(self, deep=True):
+        return {
+            "C_grid": self.C_grid,
+            "solver": self.solver,
+            "class_weight": self.class_weight,
+            "max_iter": self.max_iter,
+            "tol": self.tol,
+            "eps_nnz": self.eps_nnz,
+            "random_state": self.random_state,
+            "penalty": "l1",
+            "l1_ratio": 1,
+            "fit_intercept": False,
+            'clp': self.clp
+        }
+
+    def __str__(self):
+        params = self.get_params()
+        params_str = ", ".join(f"{k}={params[k]!r}" for k in sorted(params))
+        return f"LogisticLasso({params_str})"
+
+    def fit(self, X, y, d, feature_names=None, verbose=False):
+        self.feature_names = ['X_{0}'.format(i) for i in range(X.shape[1])] if feature_names is None else feature_names
+        best_idx = 0
+        for i, model in enumerate(self.models):
+            if verbose: print('Fitting model {0} of {1} with C={2} and has '.format(i, len(self.models), model.C), end='')
+            model.fit(X, y)
+            nnz = self._count_nnz(model.coef_)
+            if verbose: print('{0} nonzero terms'.format(nnz))
+            if nnz<=d:
+                best_idx = i
+            else:
+                break
+
+        self.model_idx = best_idx
+        self.model = self.models[self.model_idx]
+        self.coef_ = self.model.coef_.ravel()
+        self.coef_idx_ = np.arange(len(self.coef_))[np.abs(np.ravel(self.coef_)) > self.eps_nnz]
+        return self
+
+    def _count_nnz(self, coef):
+        return int(np.sum(
+            np.abs(np.ravel(coef)) > self.eps_nnz
+        ))
+
+    def __repr__(self, prec=3):
+        coef = self.model.coef_.ravel()
+        return ''.join([('+' if c > 0 else '') + sci(c, sig=prec) + '(' + self.feature_names[i] + ')' for i, c in enumerate(coef) if (np.abs(coef[i]) > self.eps_nnz)])
+
+    def decision_function(self, X):
+        return np.dot(X, self.model.coef_.ravel())
+
+    def predict_proba(self, X):
+        z = self.decision_function(X)
+        z = np.clip(z, -self.clp, self.clp)  # numerical stability
+        p1 = 1.0 / (1.0 + np.exp(-z))
+        p0 = 1.0 - p1
+        return np.column_stack([p0, p1])
+
+    def predict(self, X, threshold=0.5):
+        proba = self.predict_proba(X)
+        p1 = proba[:, 1]
+        return (p1 >= threshold).astype(int)
+
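
LOGISTIC_LASSO.fit walks the sorted C grid from strongest to weakest penalty and keeps the last (densest) model whose support still fits the budget d, stopping at the first fit that exceeds it. The rule in isolation, with an illustrative stand-in for the per-C support sizes:

    import numpy as np

    nnz_per_C = np.array([0, 1, 1, 2, 3, 5, 9])  # nonzeros as C grows
    d = 3
    best_idx = 0
    for i, nnz in enumerate(nnz_per_C):
        if nnz <= d:
            best_idx = i
        else:
            break
    print(best_idx)  # 4 -> the largest C with at most 3 nonzero coefficients
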
+class GENERALISED_ICL:
+    def __init__(self, sis, so, k, fit_intercept=True, normalize=True, pool_reset=False, optimize_k=True, track_intermediates=False, clp=30):
+        self.sis = sis
+        self.so = so
+        self.k = int(k)
+
+        self.fit_intercept = bool(fit_intercept)
+        self.normalize = bool(normalize) and (self.sis.obj.lower() != 'logistic')
+        self.pool_reset = bool(pool_reset)
+        self.optimize_k = bool(optimize_k)
+        self.track_intermediates = bool(track_intermediates)
+        self.clp = clp
+
+        self.pool_ = None
+        self.feature_names_ = None
+        self.intercept_ = 0.0
+        self.coef_ = None
+
+        self.beta_ = None
+        self.beta_idx_ = None
+        self.beta_sparse_ = None
+        self.feature_names_sparse_ = None
+
+        if self.optimize_k or self.track_intermediates: self.intermediates = np.empty(shape=(self.k, 5), dtype=object)
+
+    def get_params(self, deep=True):
+        params = {
+            "sis": self.sis,
+            "so": self.so,
+            "k": self.k,
+            "fit_intercept": self.fit_intercept,
+            "normalize": self.normalize,
+            "pool_reset": self.pool_reset,
+            "optimize_k": self.optimize_k,
+            "track_intermediates": self.track_intermediates,
+            'clp': self.clp
+        }
+
+        return params
+
+    def __str__(self):
+        return 'ICL({0})'.format(self.get_params())
+
+    def __repr__(self, prec=3):
+        ret = []
+        for i, name in enumerate(self.feature_names_sparse_):
+            ret += [('+' if self.coef_[0, i] > 0 else '') +
+                    str(np.format_float_scientific(self.coef_[0, i], precision=prec, unique=False))
+                    + ' (' + str(name) + ')' + '\n']
+        ret += [('+' if self.intercept_>0 else '') + str(np.format_float_scientific(self.intercept_, precision=prec, unique=False))]
+
+        return ''.join(ret)
+
+    def solve_norm_coef(self, X, y):
+        n, p = X.shape
+        obj = self.sis.obj.lower()
+
+        # Logistic: no y-normalization; with your init guard, normalize is already False.
+        if obj == "logistic":
+            self.a_x = X.mean(axis=0) if self.fit_intercept else np.zeros(p)
+            self.b_x = X.std(axis=0)
+            self.b_x = np.where(self.b_x == 0, 1.0, self.b_x)
+            self.a_y = 0.0
+            self.b_y = 1.0
+            return self
+        # Squared (regression): optionally normalize
+        if self.fit_intercept:
+            a_x = X.mean(axis=0)
+            a_y = float(np.mean(y))
+        else:
+            a_x = np.zeros(p)
+            a_y = 0.0
+
+        if self.normalize:
+            b_x = X.std(axis=0)
+            b_y = float(np.std(y))
+            # avoid division by zero for constant columns / constant y
+            b_x = np.where(b_x == 0, 1.0, b_x)
+            b_y = 1.0 if b_y == 0 else b_y
+        else:
+            b_x = np.ones(p)
+            b_y = 1.0
+
+        self.a_x, self.b_x, self.a_y, self.b_y = a_x, b_x, a_y, b_y
+        return self
+
+    def normalize_Xy(self, X, y):
+        obj = self.sis.obj.lower()
+
+        Xn = (X - self.a_x) / self.b_x
+
+        if obj == "logistic":
+            # keep y in {0,1}
+            yn = y
+        else:
+            yn = (y - self.a_y) / self.b_y
+
+        return Xn, yn
+
+    def coef(self):
+        """
+        Set self.coef_ (on original feature scale) and self.intercept_.
+        Uses self.beta_sparse_ and self.beta_idx_ (support).
+        """
+        obj = self.sis.obj.lower()
+
+        if self.beta_idx_ is None or len(self.beta_idx_) == 0:
+            self.coef_ = np.zeros((1, 0))
+            # intercept_ already set in loop; for squared you may want a_y too
+            return self
+
+        idx = np.asarray(self.beta_idx_, dtype=int)
+        beta_s = np.asarray(self.beta_sparse_, dtype=float).ravel()
+
+        # Logistic: no y scaling; coefficients are on the X scale after X normalization.
+        # With your current design logistic has normalize=False, so this is just the native scale.
+        if obj == "logistic":
+            if self.normalize:
+                coef = beta_s / self.b_x[idx]
+                self.coef_ = coef.reshape(1, -1)
+                if self.fit_intercept:
+                    self.intercept_ = float(self.intercept_ - self.a_x[idx] @ coef)
+                else:
+                    self.intercept_ = 0.0
+            else:
+                self.coef_ = beta_s.reshape(1, -1)
+                if not self.fit_intercept:
+                    self.intercept_ = 0.0
+            return self
+        # Squared regression: if we normalized, undo it.
+        if self.normalize:
+            coef = beta_s * (self.b_y / self.b_x[idx])
+            self.coef_ = coef.reshape(1, -1)
+            if self.fit_intercept:
+                self.intercept_ = self.a_y - float(self.coef_ @ self.a_x[idx].reshape(-1, 1))
+            else:
+                self.intercept_ = 0.0
+        else:
+            self.coef_ = beta_s.reshape(1, -1)
+            # intercept_ should already be set in loop; ensure it exists
+            if not self.fit_intercept:
+                self.intercept_ = 0.0
+
+        return self
+
+    def _set_x_transform(self, X):
+        p = X.shape[1]
+        if not self.fit_intercept and not self.normalize:
+            self.a_x = np.zeros(p)
+            self.b_x = np.ones(p)
+            return
+
+        if self.normalize:
+            self.a_x = X.mean(axis=0) if self.fit_intercept else np.zeros(p)
+            self.b_x = X.std(axis=0)
+            self.b_x = np.where(self.b_x == 0, 1.0, self.b_x)
+        else:
+            # if not normalizing, don't change X at all
+            self.a_x = np.zeros(p)
+            self.b_x = np.ones(p)
+
+    def _transform_X(self, X):
+        return (X - self.a_x) / self.b_x
+
+    def filter_invalid_cols(self, X):
+        nans = np.isnan(X).sum(axis=0) > 0
+        infs = np.isinf(X).sum(axis=0) > 0
+        ninfs = np.isneginf(X).sum(axis=0) > 0
+
+        nanidx = np.where(nans==True)[0]
+        infidx = np.where(infs==True)[0]
+        ninfidx = np.where(ninfs==True)[0]
+
+        bad_cols = np.hstack([nanidx, infidx, ninfidx])
+        bad_cols = np.unique(bad_cols)
+
+        return bad_cols
+
+    def _maybe_filter_X(self, X):
+        X = np.asarray(X)
+        # If already filtered, don't delete again
+        if hasattr(self, "p_filtered_") and X.shape[1] == self.p_filtered_:
+            return X
+        return np.delete(X, self.bad_col, axis=1)
+
+    def _sigmoid_stable(self, z):
+        # stable sigmoid without hard clipping
+        out = np.empty_like(z, dtype=float)
+        pos = z >= 0
+        out[pos] = 1.0 / (1.0 + np.exp(-z[pos]))
+        ez = np.exp(z[~pos])
+        out[~pos] = ez / (1.0 + ez)
+        return out
+
+    def update_intercept(self, X, y, beta=None, n_steps=50, eps=1e-12, tol=1e-10):
+        if not self.fit_intercept:
+            self.intercept_ = 0.0
+            return self
+
+        y = np.asarray(y).ravel()
+
+        if self.sis.obj.lower() == "squared":
+            if beta is None:
+                self.intercept_ = float(np.mean(y))
+            else:
+                xb = np.asarray(X @ beta).ravel()
+                self.intercept_ = float(np.mean(y - xb))
+            return self
+
+        if self.sis.obj.lower() != "logistic":
+            raise ValueError(f"Unknown objective {self.sis.obj.lower()}")
+
+        # logistic
+        if beta is None:
+            pbar = float(np.mean(y))
+            pbar = min(max(pbar, eps), 1.0 - eps)
+            self.intercept_ = float(np.log(pbar / (1.0 - pbar)))
+            return self
+
+        xb = np.asarray(X @ beta).ravel()
+        b = float(getattr(self, "intercept_", 0.0))  # warm start
+
+        # Newton-Raphson on b: solve sum(y - sigmoid(xb+b)) = 0
+        for _ in range(int(n_steps)):
+            eta = xb + b
+            p = self._sigmoid_stable(eta)
+
+            g = np.sum(y - p)          # gradient of log-likelihood wrt b
+            h = np.sum(p * (1.0 - p))  # negative second derivative (positive)
+
+            # If h is tiny, Newton becomes unstable / ineffective -> break to bisection
+            if h <= eps:
+                break
+
+            step = g / h
+            b_new = b + step
+
+            if abs(b_new - b) < tol:
+                b = b_new
+                self.intercept_ = float(b)
+                return self
+
+            b = b_new
+
+        # --- Bisection fallback (monotone root find) ---
+        # f(b) = sum(y - sigmoid(xb+b)) is strictly decreasing in b.
+        def f(bb):
+            return float(np.sum(y - self._sigmoid_stable(xb + bb)))
+
+        # Build a bracket that spans the root.
+        # Start near current b and expand.
+        lo, hi = b - 1.0, b + 1.0
+        flo, fhi = f(lo), f(hi)
+
+        # We want flo >= 0 and fhi <= 0 (or vice versa); expand until sign change
+        expand = 0
+        while flo * fhi > 0 and expand < 60:
+            lo -= 2.0 ** expand
+            hi += 2.0 ** expand
+            flo, fhi = f(lo), f(hi)
+            expand += 1
+
+        # If still no sign change, just keep current b (pathological separation)
+        if flo * fhi > 0:
+            self.intercept_ = float(b)
+            return self
+
+        # Bisection
+        for _ in range(80):
+            mid = 0.5 * (lo + hi)
+            fmid = f(mid)
+            if abs(fmid) < 1e-12:
+                b = mid
+                break
+            if flo * fmid > 0:
+                lo, flo = mid, fmid
+            else:
+                hi, fhi = mid, fmid
+            if abs(hi - lo) < 1e-10:
+                b = 0.5 * (lo + hi)
+                break
+        self.intercept_ = float(b)
+        return self
+
+    def refit_logistic_intercept(self, xb, y, b0=0.0, max_iter=50, tol=1e-10):
+        b = b0
+        for _ in range(max_iter):
+            eta = xb + b
+            # stable sigmoid
+            p = np.where(
+                eta >= 0,
+                1.0 / (1.0 + np.exp(-eta)),
+                np.exp(eta) / (1.0 + np.exp(eta)),
+            )
+            g = np.mean(y - p)
+            h = np.mean(p * (1 - p))
+            if h < 1e-12:
+                break
+            step = g / h
+            b += step
+            if abs(step) < tol:
+                break
+        return b
+
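
For the logistic objective the intercept solves sum(y - sigmoid(X @ beta + b)) = 0, which update_intercept attacks with Newton steps and a bisection fallback. A hedged check of the beta-free special case, where the root is the log-odds of the base rate:

    import numpy as np

    y = np.array([0, 0, 1, 1, 1], dtype=float)
    xb = np.zeros_like(y)               # beta = 0, so eta is just b
    b = 0.0
    for _ in range(50):                 # Newton on the intercept alone
        p = 1.0 / (1.0 + np.exp(-(xb + b)))
        g = np.sum(y - p)
        h = np.sum(p * (1.0 - p))
        b += g / h
    print(b, np.log(y.mean() / (1.0 - y.mean())))  # both ~0.405
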
+    def fitting(self, X, y, feature_names=None, verbose=False, track_pool=False, opt_k=None):
+        self.feature_names_ = feature_names
+        n, p = X.shape
+        stopping = self.k if opt_k is None else opt_k
+        if verbose: print('Stopping after {0} iterations'.format(stopping))
+
+        pool_ = set()
+        if track_pool: self.pool = []
+        beta = np.zeros(X.shape[1], dtype=float)  # empty model coefficients
+        self.update_intercept(X, y, beta=None)
+
+        res = y
+        i = 0
+        IC = np.infty
+        while i < stopping:
+            self.update_intercept(X, y, beta=beta)
+
+            if verbose: print('.', end='')
+
+            p, sis_i = self.sis(X=X, y=y, model=self, pool=list(pool_))
+            pool_old = deepcopy(pool_)
+            pool_.update(sis_i)
+            pool_lst = list(pool_)
+            if track_pool: self.pool = pool_lst
+            if str(self.so) == 'EffAdaLASSO(gamma=1)':
+                self.so(X=X, y=y, d=i+1, feature_names=feature_names[pool_lst], idx_old=list(pool_old), idx_new=sis_i, verbose=verbose)
+            else:
+                self.so.fit(X=X[:, pool_lst], y=y, d=i+1, feature_names=feature_names[pool_lst], verbose=verbose)
+            beta_i = self.so.coef_
+
+            beta = np.zeros(X.shape[1], dtype=float)
+            beta[pool_lst] = beta_i
+
+            # NOW update intercept using the newly solved beta
+            self.update_intercept(X, y, beta=beta)
+
+            if self.optimize_k or self.track_intermediates:
+                idx = np.nonzero(beta)[0]
+                if self.normalize:
+                    coef = (beta[idx].reshape(1, -1)*self.b_y/self.b_x[idx].reshape(1, -1))
+                    if self.fit_intercept:
+                        intercept_ = self.a_y - coef.dot(self.a_x[idx])
+                else:
+                    coef = beta[idx]
+                    if self.fit_intercept:
+                        intercept_ = self.intercept_
+                if len(coef.shape) > 1:
+                    coef = coef[0]
+                expr = ''.join([('+' if float(c) >= 0 else '') + str(np.round(float(c), 3)) + str(self.feature_names_[idx][q]) for q, c in enumerate(coef)])
+                if verbose: print('Model after {0} iterations: {1}'.format(i, expr))
+
+                self.intermediates[i, 0] = deepcopy(idx)
+                self.intermediates[i, 1] = coef  # deepcopy(beta[idx])
+                self.intermediates[i, 2] = intercept_ if self.fit_intercept else 0
+                self.intermediates[i, 3] = self.feature_names_[idx]
+                self.intermediates[i, 4] = expr
+
+            if self.pool_reset:
+                idx = np.abs(beta_i) > 0
+                beta_i = beta_i[idx]
+                pool_lst = np.array(pool_lst)[idx]
+                pool_lst = pool_lst.ravel().tolist()
+                pool_ = set(pool_lst)
+
+            self.beta_ = beta
+            self.beta_idx_ = list(np.nonzero(self.beta_)[0])
+            self.beta_sparse_ = self.beta_[self.beta_idx_]
+            self.feature_names_sparse_ = np.array(self.feature_names_)[self.beta_idx_]
+            self.coef()
+
+            i += 1
+        if self.optimize_k or self.track_intermediates: self.intermediates = self.intermediates[:, :i]
+
+        if verbose: print()
+
+        self.beta_ = beta
+        self.update_intercept(X, y, beta=beta)
+
+        self.beta_idx_ = list(np.nonzero(self.beta_)[0])
+        self.beta_sparse_ = self.beta_[self.beta_idx_]
+        self.feature_names_sparse_ = np.array(self.feature_names_)[self.beta_idx_]
+
+        return self
+
+    def fit(self, X, y, val_size=0.1, feature_names=None, timer=False, verbose=False, track_pool=False, random_state=None):
+        if verbose: print('removing invalid features')
+        self.bad_col = self.filter_invalid_cols(X)
+        X_ = np.delete(X, self.bad_col, axis=1)
+        self.p_filtered_ = X_.shape[1]
+
+        have_valid_names = not(feature_names is None) and X.shape[1] == len(feature_names)
+        feature_names_ = np.delete(np.array(feature_names), self.bad_col) if have_valid_names else np.array(['X_{0}'.format(i) for i in range(X_.shape[1])])
+
+        if verbose: print('Feature normalisation')
+        self.solve_norm_coef(X_, y)
+        X_, y_ = self.normalize_Xy(X_, y)
+
+        if verbose: print('Fitting ICL model')
+        if timer: start=time()
+        if self.optimize_k == False:
+            self.fitting(X=X_, y=y_, feature_names=feature_names_, verbose=verbose, track_pool=track_pool)
+        else:
+            if verbose: print('Finding optimal model size')
+            X_train, X_val, y_train, y_val = train_test_split(
+                X_, y_, test_size=val_size, random_state=random_state
+            )
+
+            self.fitting(X=X_train, y=y_train, feature_names=feature_names_, verbose=verbose, track_pool=track_pool)
+
+            best_k, best_loss = 0, np.inf
+
+            for kk in range(self.intermediates.shape[0]):  # number of fitted iterations
+                idx = self.intermediates[kk, 0]
+                coef = np.asarray(self.intermediates[kk, 1]).ravel()
+                inter = float(self.intermediates[kk, 2])
+
+                # raw score
+                eta_val = (X_val[:, idx] @ coef) + inter
+
+                if self.sis.obj == "squared":
+                    # regression prediction
+                    y_pred = eta_val
+                    loss_val = rmse(y_val.ravel(), y_pred.ravel())
+
+                elif self.sis.obj == "logistic":
+                    # classification probability for class 1
+                    eta_val = np.clip(eta_val, -30, 30)
+                    p1 = 1.0 / (1.0 + np.exp(-eta_val))
+                    loss_val = log_loss(y_val.ravel(), p1.ravel())
+
+                else:
+                    raise ValueError(f"Unknown objective '{self.sis.obj}'")
+
+                if loss_val < best_loss:
+                    best_k, best_loss = kk + 1, loss_val
+
+            if verbose: print(f'refitting with k={best_k} (val loss={best_loss})')
+            self.fitting(X=X_, y=y_, feature_names=feature_names_, verbose=verbose, track_pool=track_pool, opt_k=best_k)
+
+        if timer: self.fit_time=time()-start
+        if timer and verbose: print(self.fit_time)
+
+        self.beta_so_ = self.beta_sparse_
+        self.feature_names = self.feature_names_sparse_
+        obj = self.sis.obj.lower()
+
+        if obj == "squared":
+            coef_hat, _, _, _ = np.linalg.lstsq(
+                a=X_[:, self.beta_idx_],
+                b=y_.reshape(-1, 1),
+                rcond=None
+            )
+            self.beta_sparse_ = coef_hat.ravel()
+        elif obj == "logistic":
+            if self.beta_idx_ is None or len(self.beta_idx_) == 0:
+                # intercept-only fallback (no features selected)
+                self.beta_sparse_ = np.zeros(0, dtype=float)
+                # keep intercept_ from the iterative updates
+            else:
+                # eta = X_[:, self.beta_idx_] @ self.beta_so_
+                # self.intercept = self.refit_logistic_intercept(xb=eta, y=y, b0=self.intercept_)
+                lr = LogisticRegression(penalty=None, solver='lbfgs', fit_intercept=True)
+                lr.fit(X_[:, self.beta_idx_], y_)
+                coef_s = lr.coef_.ravel()
+                self.intercept_ = float(lr.intercept_[0])
+                self.beta_sparse_ = coef_s
+
+        else:
+            raise ValueError(f"Unknown objective '{self.sis.obj.lower()}'")
+
+        self.beta_ = np.zeros(X_.shape[1], dtype=float)
+        if len(self.beta_idx_) > 0:
+            self.beta_[self.beta_idx_] = self.beta_sparse_
+
+        if verbose: print('Inverse Transform of Feature Space')
+        self.coef()
+
+        return self
+
+    def decision_function(self, X):
+        X_ = self._maybe_filter_X(X)
+
+        if self.beta_idx_ is None or len(self.beta_idx_) == 0:
+            return np.full(X_.shape[0], self.intercept_, dtype=float)
+
+        coef = self.coef_.ravel()
+        eta = X_[:, self.beta_idx_] @ coef + self.intercept_
+
+        return eta
+
+    def predict_proba(self, X):
+        eta = self.decision_function(X)
+
+        # numerical stability
+        eta = np.clip(eta, -30, 30)
+        p1 = 1.0 / (1.0 + np.exp(-eta))
+        p0 = 1.0 - p1
+
+        return np.column_stack([p0, p1])
+
+    def predict(self, X, threshold=0.5):
+        obj = self.sis.obj.lower()
+        if obj == "squared":
+            return self.decision_function(X)
+        elif obj == "logistic":
+            p1 = self.predict_proba(X)[:, 1]
+            return (p1 >= threshold).astype(int)
+        else:
+            raise ValueError(f"Unknown objective '{self.sis.obj.lower()}'")
+
+    def predict_score(self, X):
+        return self.decision_function(X)
+
+    def negative_gradient(self, X, y):
+        if self.sis.obj.lower() == "squared":
+            return y - self.decision_function(X)
+
+        elif self.sis.obj.lower() == "logistic":
+            eta = self.decision_function(X)
+            p = 1.0 / (1.0 + np.exp(-np.clip(eta, -self.clp, self.clp)))
+            return y - p
+
+        else:
+            raise ValueError(f"Unknown objective {self.sis.obj.lower()}")
+
+sci = lambda x, sig=3: f"{float(x):.{sig}e}"
+
+if __name__ == "__main__":
+    from sklearn.model_selection import train_test_split
+    import pandas as pd
+    import os
+
+    random_state = 0
+    np.random.seed(random_state)
+
+    test_num = 1
+    test = "SYNTHETIC" if test_num == 0 else ("DIABETES" if test_num == 1 else "EARTHQUAKE")
+
+    if test == "DIABETES":
+        df = pd.read_csv(os.path.join(os.getcwd(), "Input", "pima.csv"))
+        df["DIABETES"] = df["DIABETES"].map({"Y": 1, "N": 0})
+        y = df['DIABETES'].values
+        X = df.drop(columns=['DIABETES'])
+        feature_names = X.columns
+        X = X.values
+
+        rung = 2
+        small = ['sin', 'cos', 'log', 'abs', 'sqrt', 'cbrt', 'sq', 'cb', 'inv']
+        big = ['six_pow', 'exp', 'add', 'mul', 'div', 'abs_diff']
+
+        small = ['log', 'sqrt', 'cbrt', 'sq', 'cb', 'inv']
+        big = ['mul', 'div']
+
+        small = [(op, range(rung)) for op in small]
+        big = [(op, range(1)) for op in big]
+        ops = small+big
+
+        FE = FeatureExpansion(rung=rung, ops=ops)
+        Phi_names, Phi_symbols, Phi_ = FE.expand(X=X, names=feature_names, check_pos=True, verbose=True)
+        X_train, X_test, y_train, y_test = train_test_split(Phi_, y, test_size=0.2, random_state=random_state)
+
+        s = 20
+        k = 5
+        so = LOGISTIC_LASSO()
+        sis = generalised_SIS(s=s, obj='logistic')
+        model = GENERALISED_ICL(sis=sis, so=so, k=5, optimize_k=False, normalize=True)
+
+        model.fit(X_train, y_train, feature_names=Phi_names, verbose=True)
+        print(model.__repr__())
+        print('zero_one: {0}'.format(zero_one_loss(y_test, model.predict(X_test))))
+        print('logloss: {0}'.format(log_loss(y_test, model.predict_proba(X_test)[:, 1])))
+        print('auc: {0}'.format(roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])))
+
+        p = model.predict_proba(X_test)[:, 1]
+        print("min/max:", p.min(), p.max())
+        print("quantiles:", np.quantile(p, np.linspace(start=0, stop=1, num=50)))
+        print("mean:", p.mean())
+        print("predicted positives @0.5:", np.mean(p >= 0.5))
+
+    elif test == "SYNTHETIC":
+        k, n, p = 5, 10000, 1000
+        rng = np.random.default_rng(random_state)
+        X = rng.standard_normal((n, p))
+        feature_names = np.array(['X_{0}'.format(i) for i in range(p)])
+        support = range(k)
+        beta = np.zeros(p, dtype=float)
+        signs = rng.choice([-1.0, 1.0], size=k)
+        mags = rng.uniform(0.5, 1.5, size=k)
+        beta[support] = signs * mags
+        eta_no_b = X @ beta
+        b = float(-np.mean(eta_no_b))
+        eta = eta_no_b + b
+        p1 = 1.0 / (1.0 + np.exp(-np.clip(eta, -50, 50)))
+        y = rng.binomial(1, p1, size=n).astype(int)
+
+        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2)
+
+        s = 2
+        so = LOGISTIC_LASSO()
+        sis = generalised_SIS(s=s, obj='logistic')
+        model = GENERALISED_ICL(sis=sis, so=so, k=k, optimize_k=False, normalize=False)
+
+        model.fit(X_train, y_train, feature_names=feature_names, verbose=True)
+        print(model.__repr__())
+
+        print('True Coef: {0}'.format(beta[:k]))
+        print('True intercept: {0}'.format(b))
+    elif test == "EARTHQUAKE":
+        import pickle
+        from sklearn.model_selection import GroupKFold
+
+        start_total = time()
+
+        path = os.path.join(os.getcwd(), 'Input', 'Features_new.pkl')
+        df = pickle.load(open(os.path.join(path), "rb"))
+
+        random_state = 0
+        rng = np.random.default_rng(random_state)
+
+        use_groups = 5
+
+        all_ids = df["ID"].unique()
+        chosen_ids = rng.choice(all_ids, size=use_groups, replace=False)
+        print("Chosen IDs:", chosen_ids)
+
+        df_sub = df[df["ID"].isin(chosen_ids)].copy()
+        print("Subset shape:", df_sub.shape)
+        print("Class balance:", df_sub["aftershocksyn"].mean())
+
+        y = df_sub['aftershocksyn'].values
+        X = df_sub.drop(columns=['ID', 'aftershocksyn'])
+        feature_names_raw = X.columns
+        X = X.values
+
+        groups = df_sub["ID"].to_numpy()
+        unique_events = np.unique(groups)
+
+        gkf = GroupKFold(n_splits=len(unique_events))
+
+        verbose = 2
+        rung = 1
+        # small = ['sin', 'cos', 'log', 'abs', 'sqrt', 'cbrt', 'sq', 'cb', 'inv']
+        # big = ['exp', 'six_pow', 'mul', 'div', 'abs_diff', 'add']
+
+        small = ['sin', 'cos', 'log', 'abs', 'sqrt', 'cbrt', 'sq', 'cb', 'inv']
+        big = ['exp', 'six_pow', 'mul', 'div', 'abs_diff', 'add']
+        small = [(op, range(rung)) for op in small]
+        big = [(op, range(1)) for op in big]
+        ops = small+big
+        check_pos = True
+
+        FE = FeatureExpansion(ops=ops, rung=rung)
+        Phi_names, Phi_symbols, Phi = FE.expand(X=X, names=feature_names_raw, verbose=verbose, check_pos=check_pos)
+        Phi.shape
+
+        s = 20
+        k = 3
+        so = LOGISTIC_LASSO()
+        sis = generalised_SIS(s=s, obj='logistic')
+        icl_base = GENERALISED_ICL(sis=sis, so=so, k=5, optimize_k=False, normalize=False)
+
+        aucs = []
+        coefs = []  # optional: store coefficients to see recovery stability
+
+        models = []
+
+        for fold, (train_idx, test_idx) in enumerate(gkf.split(Phi, y, groups)):
+            print(f"Fold {fold+1}/{len(unique_events)}")
+
+            X_train, X_test = Phi[train_idx], Phi[test_idx]
+            y_train, y_test = y[train_idx], y[test_idx]
+
+            n, p = X_train.shape
+            print(n, p)
+
+            model = GENERALISED_ICL(sis=generalised_SIS(s=s, obj='logistic'), so=LOGISTIC_LASSO(C_grid=np.logspace(-4, 2, 250)), k=k, optimize_k=False, normalize=False)
+            # model = clone(icl_base)
+            start_fit = time()
+            model.fit(X_train, y_train, feature_names=Phi_names, verbose=verbose>1)
+            fit_time = time() - start_fit
+            total_time = time() - start_total
+            print(model.__repr__())
+            models.append(model.__repr__())
+            print('Fitted in {0} seconds, total time elapsed {1} seconds'.format(np.round(fit_time, 4), np.round(total_time, 4)))
+
+            y_prob = model.predict_proba(X_test)[:, 1]
+            auc = roc_auc_score(y_test, y_prob)
+            aucs.append(auc)
+
+            # Optional: store coefficients for recovery analysis
+            if hasattr(model, "coef_"):
+                coefs.append(model.coef_.copy())
+
+        print("Mean AUC:", np.mean(aucs))
+        print("Std AUC :", np.std(aucs))
+        for model in models:
+            print(model)
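
The new icol/logistic_icol.py module (this file) combines the pieces above: FeatureExpansion builds the candidate pool, generalised_SIS screens it against the gradient of the chosen loss, LOGISTIC_LASSO solves the sparse subproblem, and GENERALISED_ICL iterates the two. A hedged end-to-end sketch mirroring the SYNTHETIC branch, assuming the names are importable from the module as laid out in SOURCES.txt below:

    import numpy as np
    from sklearn.model_selection import train_test_split
    from icol.logistic_icol import (GENERALISED_ICL, LOGISTIC_LASSO,
                                    generalised_SIS)

    rng = np.random.default_rng(0)
    n, p, k = 2000, 50, 3
    X = rng.standard_normal((n, p))
    beta = np.zeros(p)
    beta[:k] = [1.0, -1.2, 0.8]
    y = rng.binomial(1, 1.0 / (1.0 + np.exp(-(X @ beta))))

    model = GENERALISED_ICL(sis=generalised_SIS(s=2, obj='logistic'),
                            so=LOGISTIC_LASSO(), k=k,
                            optimize_k=False, normalize=False)
    names = np.array(['X_%d' % i for i in range(p)])
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
    model.fit(X_tr, y_tr, feature_names=names)
    print(model.__repr__())   # sparse formula; compare against beta[:3]
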
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: icol
-Version: 0.8.4
+Version: 0.8.5
 Summary: Iterative Correlation Learning implementation
 Author-email: Simon Teshuva <simon.teshuva@gmail.com>
 License: MIT
@@ -3,6 +3,7 @@ README.md
 pyproject.toml
 icol/__init__.py
 icol/icol.py
+icol/logistic_icol.py
 icol.egg-info/PKG-INFO
 icol.egg-info/SOURCES.txt
 icol.egg-info/dependency_links.txt
@@ -1,6 +1,6 @@
 [project]
 name = "icol"
-version = "0.8.4"
+version = "0.8.5"
 description = "Iterative Correlation Learning implementation"
 authors = [
     { name = "Simon Teshuva", email = "simon.teshuva@gmail.com" }
7 files without changes