icol 0.1.6__py3-none-any.whl → 0.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
icol/icol.py CHANGED
@@ -8,9 +8,10 @@ from itertools import combinations, permutations
  import numpy as np
  import sympy as sp

- from sklearn.linear_model import lars_path
+ from sklearn.linear_model import lars_path, Ridge, Lars
  from sklearn.preprocessing import PolynomialFeatures
  from sklearn.base import clone
+ from sklearn.model_selection import train_test_split

  from sklearn.metrics import mean_squared_error

@@ -21,10 +22,135 @@ def LL(res):
      n = len(res)
      return n*np.log(np.sum(res**2)/n)

+ def initialize_ols(D, y, init_idx):
+     """
+     Fit initial OLS solution on selected columns of D.
+
+     Parameters
+     ----------
+     D : (n, d) ndarray
+         Full dictionary matrix.
+     y : (n,) ndarray
+         Response vector.
+     init_idx : list[int]
+         Indices of columns from D to use initially.
+
+     Returns
+     -------
+     beta : (p,) ndarray
+         OLS coefficients for selected columns.
+     A_inv : (p, p) ndarray
+         Inverse Gram matrix for selected columns.
+     XT : (p, n) ndarray
+         Transposed design matrix of selected columns.
+     active_idx : list[int]
+         Current indices of D included in the model.
+     """
+     X = D[:, init_idx]
+     A = X.T @ X
+     try:
+         A_inv = np.linalg.inv(A)
+     except np.linalg.LinAlgError:
+         A_inv = np.linalg.pinv(A)
+     beta = A_inv @ (X.T @ y)
+     XT = X.T
+     return beta, A_inv, XT, list(init_idx)
+
+ def sweep_update_from_D(beta, A_inv, XT, active_idx, D, y, new_idx):
+     # Generated with ChatGPT using the commands;
+     # 1. write me a function which takes in an n by p dimension matrix X, for which we already have an OLS solution, beta.
+     #    Additionally, a second input is a data matrix Z with n rows and q columns.
+     #    Add the Z matrix of columns to the OLS solution using SWEEP
+     # 2. Are we also able to efficiently update the gram and its inverse with this procedure for X augmented with Z
+     # 3. Ok, imagine that I need to update my SWEEP solution multiple times.
+     #    Adjust the inputs and return values so that everything can be used again in the next SWEEP update.
+     #    Then update the function to make use of these previous computations
+     # 4. Lets make some changes for the sake of indexing. Imagine that we have a large matrix D, with d columns.
+     #    Through some selection procedure we select p of those columns to form an initial OLS solution.
+     #    We then iteratively select p new columns and incorporate those into the ols solution using sweep.
+     #    Update the code to reflect this change while also tracking the indices of columns in the original D matrix
+     #    and their mapping to the respective betas.
+     """
+     Update OLS solution by adding new columns from D.
+
+     Parameters
+     ----------
+     beta : (p,) ndarray
+         Current OLS coefficients.
+     A_inv : (p, p) ndarray
+         Inverse Gram matrix for current features.
+     XT : (p, n) ndarray
+         Transposed design matrix for current features.
+     active_idx : list[int]
+         Current indices of columns in D that are in the model.
+     D : (n, d) ndarray
+         Full dictionary matrix.
+     y : (n,) ndarray
+         Response vector.
+     new_idx : list[int]
+         Indices of new columns in D to add.
+
+     Returns
+     -------
+     beta_new : (p+q,) ndarray
+         Updated OLS coefficients.
+     A_tilde_inv : (p+q, p+q) ndarray
+         Updated inverse Gram matrix.
+     XT_new : (p+q, n) ndarray
+         Updated design matrix transpose.
+     active_idx_new : list[int]
+         Updated indices of active columns in D.
+     """
+     p = beta.shape[0]
+     Z = D[:, new_idx]  # n x q
+     q = Z.shape[1]
+
+     # Cross products
+     B = XT @ Z    # p x q
+     C = Z.T @ Z   # q x q
+     yZ = Z.T @ y  # q x 1
+
+     # Schur complement
+     S = C - B.T @ (A_inv @ B)
+
+     # Solve for new coefficients (numerically stable)
+     rhs = yZ - B.T @ beta
+     try:
+         beta_Z = np.linalg.solve(S, rhs)
+     except np.linalg.LinAlgError:
+         beta_Z = np.linalg.pinv(S) @ rhs
+
+     # Update old coefficients
+     beta_X_new = beta - A_inv @ (B @ beta_Z)
+     beta_new = np.concatenate([beta_X_new, beta_Z])
+
+     # Update Gram inverse
+     try:
+         S_inv = np.linalg.inv(S)  # small q x q
+     except np.linalg.LinAlgError:
+         S_inv = np.linalg.pinv(S)
+
+     top_left = A_inv + A_inv @ B @ S_inv @ B.T @ A_inv
+     top_right = -A_inv @ B @ S_inv
+     bottom_left = -S_inv @ B.T @ A_inv
+     bottom_right = S_inv
+
+     A_tilde_inv = np.block([
+         [top_left, top_right],
+         [bottom_left, bottom_right]
+     ])
+
+     # Update XT and active indices
+     XT_new = np.vstack([XT, Z.T])
+     active_idx_new = active_idx + list(new_idx)
+
+     return beta_new, A_tilde_inv, XT_new, active_idx_new
+
  IC_DICT = {
      'AIC': lambda res, k: LL(res) + 2*k,
      'HQIC': lambda res, k: LL(res) + np.log(np.log(len(res)))*k,
-     'AIC': lambda res, k: LL(res) + 2*k,
+     'BIC': lambda res, k, n: LL(res) + 2*k*np.log(n),
      'CAIC': lambda res, k: LL(res) + (np.log(len(res))+1)*k,
      'AICc': lambda res, k: LL(res) + 2*k + 2*k*(k+1)/(len(res)-k-1)
  }
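
For orientation: `sweep_update_from_D` applies the standard block-inverse (Schur-complement) identity, so the incrementally updated coefficients agree exactly with a full OLS refit on the union of active columns, while only a small q-by-q system is solved per update. A minimal editorial sketch of that invariant (not from the package; it assumes the two functions above are importable from `icol.icol`):

import numpy as np
from icol.icol import initialize_ols, sweep_update_from_D  # assumed import path

rng = np.random.default_rng(0)
D = rng.normal(size=(50, 8))                        # full dictionary, d=8 columns
y = D[:, 0] - 2*D[:, 3] + 0.01*rng.normal(size=50)

# Initialize on two columns, then sweep two more in.
beta, A_inv, XT, active = initialize_ols(D, y, init_idx=[0, 1])
beta, A_inv, XT, active = sweep_update_from_D(beta, A_inv, XT, active, D, y, new_idx=[3, 5])

# The incremental solution matches a full refit on all four active columns.
beta_full, *_ = np.linalg.lstsq(D[:, active], y, rcond=None)
assert np.allclose(beta, beta_full)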
@@ -168,7 +294,7 @@ class PolynomialFeaturesICL:

      def get_feature_names_out(self):
          return self.PolynomialFeatures.get_feature_names_out()
-
+
  class BSS:
      def __init__(self):
          pass
@@ -232,13 +358,97 @@ class BSS:
          beta_ret = np.zeros(p)
          beta_ret[list(best_comb)] = beta.reshape(1, -1)
          return beta_ret
-
+
+ class EfficientAdaptiveLASSO:
+     def __init__(self, gamma=1, fit_intercept=False, default_d=5, rcond=-1, alpha=0):
+         self.gamma = gamma
+         self.fit_intercept = fit_intercept
+         self.default_d = default_d
+         self.rcond=rcond
+         self.alpha=alpha
+         self.A_inv = None
+         self.XT = None
+         self.beta_ols = None
+         self.active_idx = None
+
+     def __str__(self):
+         return ('EffAda' if self.gamma != 0 else '') + ('LASSO') + ('(gamma={0})'.format(self.gamma) if self.gamma != 0 else '')
+
+     def __repr__(self):
+         return self.__str__()
+
+     def get_params(self, deep=False):
+         return {'gamma': self.gamma,
+                 'fit_intercept': self.fit_intercept,
+                 'default_d': self.default_d,
+                 'rcond': self.rcond}
+
+     def set_default_d(self, d):
+         self.default_d = d
+
+     def __call__(self, X, y, d, idx_old = None, idx_new=None, verbose=False):
+
+         self.set_default_d(d)
+         nonancols = np.isnan(X).sum(axis=0)==0
+         noinfcols = np.isinf(X).sum(axis=0)==0
+         valcols = np.logical_and(nonancols, noinfcols)
+         idx_ala = list(idx_new) + list(idx_old)
+
+         if np.abs(self.gamma)<1e-10:
+             beta_ols = np.ones(X.shape[1])
+             w_hat = np.ones(X.shape[1])
+             X_star_star = X.copy()
+         else:
+             X_valcols = X[:, valcols]
+             if not idx_old:
+                 self.beta_ols, self.A_inv, self.XT, self.active_idx = initialize_ols(X_valcols, y, init_idx=idx_new)
+             else:
+                 self.beta_ols, self.A_inv, self.XT, self.active_idx = sweep_update_from_D(beta = self.beta_ols, A_inv=self.A_inv,
+                                                                                           XT=self.XT, active_idx=self.active_idx, D=X, y=y,
+                                                                                           new_idx=idx_new)
+
+             w_hat = 1/np.power(np.abs(self.beta_ols), self.gamma)
+             X_star_star = np.zeros_like(X_valcols[:, idx_ala])
+             for j in range(X_star_star.shape[1]):  # vectorise
+                 X_j = X_valcols[:, j]/w_hat[j]
+                 X_star_star[:, j] = X_j
+
+         _, _, coefs, _ = lars_path(X_star_star, y.ravel(), return_n_iter=True, max_iter=d, method='lasso')
+         # alphas, active, coefs = lars_path(X_star_star, y.ravel(), method='lasso')
+         try:
+             beta_hat_star_star = coefs[:, d]
+         except IndexError:  # in the event that a solution with d components cant be found, use the next largest.
+             beta_hat_star_star = coefs[:, -1]
+
+         beta_hat_star_n_old_new = np.array([beta_hat_star_star[j]/w_hat[j] for j in range(len(beta_hat_star_star))])
+         # beta_hat_star_n = np.zeros(X.shape[1])
+         # beta_hat_star_n[idx_ala] = beta_hat_star_n_old_new
+
+         # beta_hat_star_n[valcols] = beta_hat_star_n_valcol
+         # ret = beta_hat_star_n.reshape(1, -1).squeeze()
+         return beta_hat_star_n_old_new.squeeze()
+
+     def fit(self, X, y, verbose=False):
+         self.mu = y.mean() if self.fit_intercept else 0
+         beta = self.__call__(X=X, y=y-self.mu, d=self.default_d, verbose=verbose)
+         self.beta = beta.reshape(-1, 1)
+
+     def predict(self, X):
+         return np.dot(X, self.beta) + self.mu
+
+     def s_max(self, k, n, p, c1=1, c0=0):
+         if self.gamma==0:
+             return c1*(p/(k**2)) + c0
+         else:
+             return c1*min(np.power(p, 1/2)/k, np.power(p*n, 1/3)/k) + c0
+
  class AdaptiveLASSO:
-     def __init__(self, gamma=1, fit_intercept=False, default_d=5, rcond=-1):
+     def __init__(self, gamma=1, fit_intercept=False, default_d=5, rcond=-1, alpha=0):
          self.gamma = gamma
          self.fit_intercept = fit_intercept
          self.default_d = default_d
          self.rcond=rcond
+         self.alpha=0

      def __str__(self):
          return ('Ada' if self.gamma != 0 else '') + ('LASSO') + ('(gamma={0})'.format(self.gamma) if self.gamma != 0 else '')
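
The class above caches `beta_ols`, `A_inv`, `XT` and `active_idx` between calls, so each screening round only pays the SWEEP cost of the newly added columns rather than refitting the pilot OLS from scratch. A hedged sketch of the intended call pattern (indices and data are illustrative, not from the package):

import numpy as np
from icol.icol import EfficientAdaptiveLASSO  # assumed import path

rng = np.random.default_rng(1)
X = rng.normal(size=(300, 40))
y = X[:, 3] - 2*X[:, 11] + 0.05*rng.normal(size=300)

so = EfficientAdaptiveLASSO(gamma=1)
# First call: idx_old is empty, so the pilot OLS state is initialized.
beta_1 = so(X, y, d=1, idx_old=[], idx_new=[3, 5])
# Later calls: only the newly screened columns are swept into the cached state.
beta_2 = so(X, y, d=2, idx_old=[3, 5], idx_new=[11, 17])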
@@ -255,21 +465,26 @@ class AdaptiveLASSO:
      def set_default_d(self, d):
          self.default_d = d

-     def __call__(self, X, y, d, rcond=None, verbose=False):
+     def __call__(self, X, y, d, verbose=False):

          self.set_default_d(d)

+         nonancols = np.isnan(X).sum(axis=0)==0
+         noinfcols = np.isinf(X).sum(axis=0)==0
+         valcols = np.logical_and(nonancols, noinfcols)
          if np.abs(self.gamma)<1e-10:
              beta_hat = np.ones(X.shape[1])
              w_hat = np.ones(X.shape[1])
              X_star_star = X.copy()
          else:
-             beta_hat, _, _, _ = np.linalg.lstsq(X, y, rcond=self.rcond)
+
+             X_valcols = X[:, valcols]
+             beta_hat, _, _, _ = np.linalg.lstsq(X_valcols, y, rcond=self.rcond)

              w_hat = 1/np.power(np.abs(beta_hat), self.gamma)
-             X_star_star = np.zeros_like(X)
+             X_star_star = np.zeros_like(X_valcols)
              for j in range(X_star_star.shape[1]):  # vectorise
-                 X_j = X[:, j]/w_hat[j]
+                 X_j = X_valcols[:, j]/w_hat[j]
                  X_star_star[:, j] = X_j

          _, _, coefs, _ = lars_path(X_star_star, y.ravel(), return_n_iter=True, max_iter=d, method='lasso')
@@ -278,7 +493,10 @@ class AdaptiveLASSO:
              beta_hat_star_star = coefs[:, d]
          except IndexError:
              beta_hat_star_star = coefs[:, -1]
-         beta_hat_star_n = np.array([beta_hat_star_star[j]/w_hat[j] for j in range(len(beta_hat_star_star))])
+
+         beta_hat_star_n_valcol = np.array([beta_hat_star_star[j]/w_hat[j] for j in range(len(beta_hat_star_star))])
+         beta_hat_star_n = np.zeros(X.shape[1])
+         beta_hat_star_n[valcols] = beta_hat_star_n_valcol
          return beta_hat_star_n.reshape(1, -1).squeeze()

      def fit(self, X, y, verbose=False):
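
Both LASSO classes follow the two-step adaptive LASSO recipe: form weights w_j = 1/|beta_j|^gamma from a pilot least-squares fit, run LARS-lasso on the reweighted columns, then divide the coefficients by the same weights. A self-contained sketch of that recipe using only numpy and scikit-learn (illustrative data; not the package's exact code path):

import numpy as np
from sklearn.linear_model import lars_path

rng = np.random.default_rng(2)
X = rng.normal(size=(200, 20))
y = 3*X[:, 2] - 1.5*X[:, 7] + 0.1*rng.normal(size=200)

gamma, d = 1, 2
beta_ols, *_ = np.linalg.lstsq(X, y, rcond=None)
w = 1/np.abs(beta_ols)**gamma          # adaptive weights
X_w = X / w                            # column j scaled by 1/w_j
_, _, coefs = lars_path(X_w, y, max_iter=d, method='lasso')
beta = coefs[:, -1] / w                # undo the scaling
print(np.flatnonzero(beta))            # expect columns 2 and 7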
@@ -295,6 +513,27 @@ class AdaptiveLASSO:
          else:
              return c1*min(np.power(p, 1/2)/k, np.power(p*n, 1/3)/k) + c0

+ class LARS:
+     def __init__(self, default_d=None):
+         self.default_d=default_d
+
+     def __repr__(self):
+         return 'Lars'
+
+     def __str__(self):
+         return 'Lars'
+
+     def set_default_d(self, default_d):
+         self.default_d = default_d
+
+     def get_params(self, deep=False):
+         return {'default_d': self.default_d}
+
+     def __call__(self, X, y, d, verbose=False):
+         self.lars = Lars(fit_intercept=False, fit_path=False, verbose=verbose, n_nonzero_coefs=d, copy_X=True)
+         self.lars.fit(X, y)
+         return self.lars.coef_
+
  class ThresholdedLeastSquares:
      def __init__(self, default_d=None):
          self.default_d=default_d
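
`LARS` is a thin wrapper over scikit-learn's `Lars`, but the point is the shared interface: every sparsifying operator (SO) here is a callable `so(X, y, d)` returning a coefficient vector with at most d nonzeros, which is the only contract `ICL.fitting` relies on. A short sketch (assuming the classes are importable from `icol.icol`):

import numpy as np
from icol.icol import LARS, AdaptiveLASSO  # assumed import path

rng = np.random.default_rng(3)
X = rng.normal(size=(100, 15))
y = 2*X[:, 4] - X[:, 9]

for so in (LARS(), AdaptiveLASSO(gamma=1)):
    beta = so(X, y, d=2)               # identical call signature for every SO
    print(str(so), np.flatnonzero(beta))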
@@ -368,38 +607,38 @@ class SIS:
          return best_corr, best_idxs

  class ICL:
-     def __init__(self, s, so, d, fit_intercept=True, normalize=True, pool_reset=False, information_criteria=None): #, track_intermediates=False):
+     def __init__(self, s, so, k, fit_intercept=True, normalize=True, pool_reset=False, optimize_k=False, track_intermediates=False):
          self.s = s
          self.sis = SIS(n_sis=s)
          self.so = so
-         self.d = d
+         self.k = k
          self.fit_intercept = fit_intercept
          self.normalize=normalize
          self.pool_reset = pool_reset
-         self.information_criteria = information_criteria if information_criteria in IC_DICT.keys() else None
-         # self.track_intermediates = track_intermediates
-
+         self.optimize_k = optimize_k
+         self.track_intermediates = track_intermediates
+
      def get_params(self, deep=False):
          return {'s': self.s,
                  'so': self.so,
-                 'd': self.d,
+                 'k': self.k,
                  'fit_intercept': self.fit_intercept,
                  'normalize': self.normalize,
                  'pool_reset': self.pool_reset,
-                 'information_criteria': self.information_criteria
+                 'self.optimize_k': self.optimize_k
                  }

      def __str__(self):
-         return 'SISSO(n_sis={0}, SO={1}, d={2})'.format(self.s, str(self.so), self.d)
+         return 'ICL(n_sis={0}, SO={1}, k={2})'.format(self.s, str(self.so), self.k)

      def __repr__(self, prec=3):
          ret = []
          for i, name in enumerate(self.feature_names_sparse_):
-             ret += [('+' if self.coef_[0, i] > 0 else '') + str(np.round(self.coef_[0, i], prec)) + str(name)]
-         ret += ['+' + str(float(np.round(self.intercept_, prec)))]
+             ret += [('+' if self.coef_[0, i] > 0 else '') +
+                     str(np.format_float_scientific(self.coef_[0, i], precision=prec, unique=False))
+                     + ' (' + str(name) + ')' + '\n']
+         ret += [('+' if self.intercept_>0 else '') + str(float(np.round(self.intercept_, prec)))]
          return ''.join(ret)
-
-         # return '+'.join(['{0}({1})'.format(str(np.round(b, 3)), self.feature_names_sparse_[i]) for i, b in enumerate(self.coef_) if np.abs(b) > 0]+[str(self.intercept_)])

      def solve_norm_coef(self, X, y):
          n, p = X.shape
@@ -440,32 +679,37 @@ class ICL:

          return bad_cols

-     def fitting(self, X, y, feature_names=None, verbose=False, track_pool=False, track_intermediates=False):
+     def fitting(self, X, y, feature_names=None, verbose=False, track_pool=False, opt_k = None):
          self.feature_names_ = feature_names
          n,p = X.shape
+         stopping = self.k if opt_k is None else opt_k
+         if verbose: print('Stopping after {0} iterations'.format(stopping))

          pool_ = set()
          if track_pool: self.pool = []
-         if track_intermediates: self.intermediates = np.empty(shape=(self.d, 5), dtype=object)
+         if self.optimize_k or self.track_intermediates: self.intermediates = np.empty(shape=(self.k, 5), dtype=object)
+
          res = y
          i = 0
          IC = np.infty
-         cont = True
-         while i < self.d and cont:
+         while i < stopping:
              self.intercept_ = np.mean(res).squeeze()
              if verbose: print('.', end='')

              p, sis_i = self.sis(X=X, res=res, pool=list(pool_), verbose=verbose)
+             pool_old = deepcopy(pool_)
              pool_.update(sis_i)
              pool_lst = list(pool_)
-
              if track_pool: self.pool = pool_lst
-             beta_i = self.so(X=X[:, pool_lst], y=y, d=i+1, verbose=verbose)
+             if str(self.so) == 'EffAdaLASSO(gamma=1)':
+                 beta_i = self.so(X=X, y=y, d=i+1, idx_old = list(pool_old), idx_new=sis_i, verbose=verbose)
+             else:
+                 beta_i = self.so(X=X[:, pool_lst], y=y, d=i+1, verbose=verbose)

              beta = np.zeros(shape=(X.shape[1]))
              beta[pool_lst] = beta_i

-             if track_intermediates:
+             if self.optimize_k or self.track_intermediates:
                  idx = np.nonzero(beta)[0]
                  if self.normalize:
                      coef = (beta[idx].reshape(1, -1)*self.b_y/self.b_x[idx].reshape(1, -1))
@@ -474,7 +718,7 @@ class ICL:
                      coef = beta[idx]
                  intercept_ = self.intercept_
                  coef = coef[0]
-                 expr = ''.join([('+' if float(c) >= 0 else '') + str(np.round(float(c), 3)) + self.feature_names_[idx][q] for q, c in enumerate(coef)])
+                 expr = ''.join([('+' if float(c) >= 0 else '') + str(np.round(float(c), 3)) + str(self.feature_names_[idx][q]) for q, c in enumerate(coef)])
                  if verbose: print('Model after {0} iterations: {1}'.format(i, expr))

                  self.intermediates[i, 0] = deepcopy(idx)
@@ -491,14 +735,9 @@ class ICL:
              pool_ = set(pool_lst)

              res = (y.reshape(1, -1) - (np.dot(X, beta).reshape(1, -1)+self.intercept_) ).T
-             if not(self.information_criteria is None):
-                 IC_old = IC
-                 IC = IC_DICT[self.information_criteria](res=res, k=i+1)
-                 if verbose: print('{0}={1}'.format(self.information_criteria, IC))
-                 cont = IC < IC_old

              i += 1
-         if track_intermediates: self.intermediates = self.intermediates[:, :i]
+         if self.optimize_k or self.track_intermediates: self.intermediates = self.intermediates[:, :i]

          if verbose: print()
@@ -511,7 +750,7 @@ class ICL:

          return self

-     def fit(self, X, y, feature_names=None, timer=False, verbose=False, track_pool=False, track_intermediates=False):
+     def fit(self, X, y, val_size=0.1, feature_names=None, timer=False, verbose=False, track_pool=False, random_state=None):
          if verbose: print('removing invalid features')
          self.bad_col = self.filter_invalid_cols(X)
          X_ = np.delete(X, self.bad_col, axis=1)
@@ -522,9 +761,27 @@ class ICL:
          self.solve_norm_coef(X_, y)
          X_, y_ = self.normalize_Xy(X_, y)

-         if verbose: print('Fitting SISSO model')
+         if verbose: print('Fitting ICL model')
          if timer: start=time()
-         self.fitting(X=X_, y=y_, feature_names=feature_names_, verbose=verbose, track_pool = track_pool, track_intermediates=track_intermediates)
+         if self.optimize_k == False:
+             self.fitting(X=X_, y=y_, feature_names=feature_names_, verbose=verbose, track_pool = track_pool)
+         else:
+             if verbose: print('Finding optimal model size')
+             X_train, X_val, y_train, y_val = train_test_split(X_, y_, test_size=val_size, random_state=random_state)
+             self.fitting(X=X_train, y=y_train, feature_names=feature_names_, verbose=verbose, track_pool = track_pool)
+             best_k, best_e2 = 0, np.infty
+             for k in range(self.k):
+                 idx = self.intermediates[k, 0]
+                 coef = self.intermediates[k, 1]
+                 inter = self.intermediates[k, 2]
+                 X_pred = np.delete(X_val, self.bad_col, axis=1)
+                 y_hat = (np.dot(X_pred[:, idx], coef.squeeze()) + inter).reshape(-1, 1)
+                 e2_val = rmse(y_hat, y_val)
+                 if e2_val < best_e2:
+                     best_k, best_e2 = k+1, e2_val
+             if verbose: print('refitting with k={0}'.format(best_k))
+             self.fitting(X=X_, y=y_, feature_names=feature_names_, verbose=verbose, track_pool = track_pool, opt_k = best_k)
+
          if timer: self.fit_time=time()-start
          if timer and verbose: print(self.fit_time)
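
In short, `optimize_k=True` now splits off a validation set, fits once on the training portion, scores each of the k nested intermediate models on the held-out portion, and refits on all of the data with the best size. A hedged usage sketch (data and hyperparameters are illustrative only):

import numpy as np
from icol.icol import ICL, AdaptiveLASSO  # assumed import path

rng = np.random.default_rng(4)
X = rng.normal(size=(500, 50))
y = X[:, 0] + 2*X[:, 1] + 0.1*rng.normal(size=500)
names = np.array(['x_{0}'.format(i) for i in range(50)])

icl = ICL(s=5, so=AdaptiveLASSO(gamma=1), k=5, optimize_k=True)
icl.fit(X, y, val_size=0.2, random_state=0, feature_names=names)
y_hat = icl.predict(X)                 # final model, refit with the selected k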

@@ -544,9 +801,40 @@ class ICL:
          X_ = np.delete(X, self.bad_col, axis=1)
          return (np.dot(X_[:, self.beta_idx_], self.coef_.squeeze()) + self.intercept_).reshape(-1, 1)

+     def predict_ensemble(self, X):
+         y_hat = np.zeros(shape=(X.shape[0], self.k))
+         for k in range(self.k):
+             idx = self.intermediates[k, 0]
+             coef = self.intermediates[k, 1]
+             inter = self.intermediates[k, 2]
+             X_pred = np.delete(X, self.bad_col, axis=1)
+             y_hat[:, k]=(np.dot(X_pred[:, idx], coef) + inter).reshape(-1, 1).squeeze()
+         return y_hat
+
+     def repr_ensemble(self, prec=3):
+         ret = []
+         for k in range(self.k):
+             idx = self.intermediates[k, 0]
+             coef = self.intermediates[k, 1]
+             inter = self.intermediates[k, 2]
+             feat = self.intermediates[k, 3]
+             model_k = []
+             for i, name in enumerate(feat):
+                 model_k += [('+' if coef[i] > 0 else '') +
+                             str(np.format_float_scientific(coef[i], precision=prec, unique=False))
+                             + ' (' + str(name) + ')' + '\n']
+             model_k += [('+' if inter > 0 else '') + str(float(np.round(inter, prec)))]
+             model_k = ''.join(model_k)
+             ret += [model_k]
+         return ';\n\n'.join(ret)
+
      def score(self, X, y, scorer=rmse):
          return scorer(self.predict(X), y)

+     def score_ensemble(self, X, y):
+         y_hat_ens = self.predict_ensemble(X)
+         return np.mean((y_hat_ens - y.reshape(-1,1))**2, axis=0)
+
  class BOOTSTRAP:
      def __init__(self, X, y=None, random_state=None):
          self.X = X
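
`predict_ensemble` returns one column of predictions per model size, so all k nested models can be scored in a single pass; `repr_ensemble` prints one expression per size. Continuing the illustrative setup from the previous sketch, with `track_intermediates=True` so that all k intermediate models are retained:

icl = ICL(s=5, so=AdaptiveLASSO(gamma=1), k=5, track_intermediates=True)
icl.fit(X, y, feature_names=names)
y_hat_all = icl.predict_ensemble(X)    # shape (n_samples, k): one column per size
mse_per_k = icl.score_ensemble(X, y)   # one mean squared error per model size
print(icl.repr_ensemble())             # one printed expression per size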
@@ -630,20 +918,12 @@ class FeatureExpansion:
          self.ops = ops
          self.rung = rung
          self.printrate = printrate
-
-     def __call__(self, X, feature_names=None, verbose=False, f=None):
-         if verbose: print('Prepping Symbols')
-         if feature_names is None: feature_names = sp.symbols(' '.join(['x_{0}'.format(i) for i in range(X.shape[1])]))
-         if verbose: print('Performing Feature Expansion')
-         if verbose: print('Estimating the creation of {0} features with duplicates'.format(self.extimate_workload(X=X, max_rung=self.rung)))
-         spnames, names, X_ = self.FE_aux(X=X, feature_names=feature_names, rung=self.rung, max_rung=self.rung, prev_start = -1, verbose=verbose)
-         if verbose: print('Created {0} features, now removing duplicate features'.format(X_.shape[1]))
-         spnames, names, X_ = self.remove_redundant_features(spnames, names, X_)
-         if f:
-             pass
-         return spnames, names, X_
-
-     def remove_redundant_features(self, spnames, names, X_):
+         self.prev_print = 0
+         for i, op in enumerate(self.ops):
+             if type(op) == str:
+                 self.ops[i] = (op, range(rung))
+
+     def remove_redundant_features(self, symbols, names, X):
          sorted_idxs = np.argsort(names)
          for i, idx in enumerate(sorted_idxs):
              if i == 0:
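
Note the new constructor convention in the hunk above: entries of `ops` may be bare strings or `(name, rung_range)` tuples, and bare strings are normalized to apply at every rung. For instance (illustrative):

ops = ['sqrt', ('mul', range(1))]
# After FeatureExpansion(rung=2, ops=ops) runs __init__, self.ops becomes
# [('sqrt', range(0, 2)), ('mul', range(0, 1))]:
# 'sqrt' applies at both rungs, 'mul' only at the first.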
@@ -651,28 +931,83 @@ class FeatureExpansion:
              elif names[idx] != names[sorted_idxs[i-1]]:
                  unique += [idx]
          unique_original_order = np.sort(unique)
-         return spnames[unique_original_order], names[unique_original_order], X_[:, unique_original_order]
-
-     def extimate_workload(self, X, max_rung):
-         rung = max_rung
-         p = X.shape[1]
-         p_prev = X.shape[1]
-         unary = 0
-         binary = 0
-         for op in self.ops:
-             if OP_DICT[op]['inputs'] == 1:
-                 unary += 1
-             elif OP_DICT[op]['inputs'] == 2:
-                 binary += 1
-         while rung > 0:
-             new_unary = unary*(p-p_prev) if rung != max_rung else unary*p
-             new_binary = int(binary*(p-p_prev)*(p-1)) if rung != max_rung else int(binary*p*(p-1)/2)
-             p_prev = p
-             p = p + new_unary + new_binary
-             rung -= 1
-         return p
-
-     def FE_aux(self, X, feature_names, prev_start, rung=0, max_rung=0, verbose=False):
+
+         return symbols[unique_original_order], names[unique_original_order], X[:, unique_original_order]
+
+     def expand(self, X, names=None, verbose=False, f=None, check_pos=False):
+         n, p = X.shape
+         if (names is None) or (len(names) != p):
+             names = ['x_{0}'.format(i) for i in range(X.shape[1])]
+
+         if check_pos == False:
+             symbols = sp.symbols(' '.join(name.replace(' ', '.') for name in names))
+         else:
+             symbols = []
+             for i, name in enumerate(names):
+                 name = name.replace(' ', '.')
+                 if np.all(X[:, i] > 0):
+                     sym = sp.symbols(name, real=True, positive=True)
+                 else:
+                     sym = sp.symbols(name, real=True)
+                 symbols.append(sym)
+
+         symbols = np.array(symbols)
+         names = np.array(names)
+
+         if verbose: print('Estimating the creation of around {0} features'.format(self.estimate_workload(p=p, max_rung=self.rung, verbose=verbose>2)))
+
+         names, symbols, X = self.expand_aux(X=X, names=names, symbols=symbols, crung=0, prev_p=0, verbose=verbose)
+         if not(f is None):
+             import pandas as pd
+             df = pd.DataFrame(data=X, columns=names)
+             df['y'] = y
+             df.to_csv(f)
+
+         return names, symbols, X
+
+     def estimate_workload(self, p, max_rung, verbose=False):
+         p0 = 0
+         p1 = p
+         for rung in range(max_rung):
+             if verbose: print('Applying rung {0} expansion'.format(rung))
+             new_u, new_bc, new_bn = 0, 0, 0
+             for (op, rung_range) in self.ops:
+                 if rung in rung_range:
+                     if verbose: print('Applying {0} to {1} features will result in approximately '.format(op, p1-p0))
+                     if OP_DICT[op]['inputs'] == 1:
+                         new_u += p1
+                         if verbose: print('{0} new features'.format(p1))
+                     elif OP_DICT[op]['commutative'] == True:
+                         new_bc += (1/2)*(p1 - p0 + 1)*(p0 + p1 + 2)
+                         if verbose: print('{0} new features'.format((1/2)*(p1 - p0 + 1)*(p0 + p1 + 2)))
+                     else:
+                         new_bn += (p1 - p0 + 1)*(p0 + p1 + 2)
+                         if verbose: print('{0} new features'.format((p1 - p0 + 1)*(p0 + p1 + 2)))
+             p0 = p1
+             p1 = p1 + new_u + new_bc + new_bn
+             if verbose: print('For a total of {0} features by rung {1}'.format(p1, rung))
+         return p1
+
+     def add_new(self, new_names, new_symbols, new_X, new_name, new_symbol, new_X_i, verbose=False):
+         valid = (np.isnan(new_X_i).sum(axis=0) + np.isposinf(new_X_i).sum(axis=0) + np.isneginf(new_X_i).sum(axis=0)) == 0
+         if new_names is None:
+             new_names = np.array(new_name[valid])
+             new_symbols = np.array(new_symbol[valid])
+             new_X = np.array(new_X_i[:, valid])
+         else:
+             new_names = np.concatenate((new_names, new_name[valid]))
+             new_symbols = np.concatenate((new_symbols, new_symbol[valid]))
+             new_X = np.hstack([new_X, new_X_i[:, valid]])
+         # if (verbose > 1) and not(new_names is None) and (len(new_names) % self.printrate == 0): print('Created {0} features so far'.format(len(new_names)))
+         if (verbose > 1) and not(new_names is None) and (len(new_names) - self.prev_print >= self.printrate):
+             self.prev_print = len(new_names)
+             elapsed = np.round(time() - self.start_time, 2)
+             print('Created {0} features so far in {1} seconds'.format(len(new_names), elapsed))
+         return new_names, new_symbols, new_X
+
+     def expand_aux(self, X, names, symbols, crung, prev_p, verbose=False):
+
+         str_vectorize = np.vectorize(str)

          def simplify_nested_powers(expr):
              # Replace (x**n)**(1/n) with x
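
`estimate_workload` gives a closed-form pre-flight count: per rung, each active unary op contributes roughly p1 new columns, and each binary op roughly (1/2)(p1-p0+1)(p0+p1+2) combinations (commutative) or (p1-p0+1)(p0+p1+2) (otherwise), each involving at least one new column. A quick sanity check, assuming `OP_DICT` defines 'sqrt' and 'mul' as in the demo at the end of the file:

from icol.icol import FeatureExpansion  # assumed import path

ops = [('sqrt', range(2)), ('mul', range(1))]
FE = FeatureExpansion(rung=2, ops=ops)
print(FE.estimate_workload(p=10, max_rung=2))  # rough count, duplicates included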
@@ -693,82 +1028,76 @@ class FeatureExpansion:
                  flatten_pow_chain
              )

-         # if rung == max_rung:
-         #     feature_names = np.array(feature_names)
-         #     sympy_names = np.array([str(name) for name in feature_names])
-
-         if rung <= 0:
-             return (np.array(feature_names),
-                     np.array(
-                         [str(sp.simplify(simplify_nested_powers(name)))for name in feature_names]),
-                     X)
+         if crung == 0:
+             self.start_time = time()
+             symbols, names, X = self.remove_redundant_features(X=X, names=names, symbols=symbols)
+         if crung==self.rung:
+             if verbose: print('Completed {0} rounds of feature transformations'.format(self.rung))
+             return symbols, names, X
          else:
-             if verbose: print('Creating rung {0} features'.format(max_rung - rung+1))
-             new_names = ()
-             for op_key in self.ops:
-                 if OP_DICT[op_key]['inputs'] == 1:
-                     for i in range(prev_start, len(feature_names)):
-                         if verbose and ((len(feature_names) + len(new_names)) % self.printrate == 0): print('Created {0} Features'.format(len(feature_names) + len(new_names)))
-                         if len(new_names) == 0:
-                             new_X = OP_DICT[op_key]['op_np'](X[:, i]).reshape(X.shape[0], 1)
-                         else:
-                             new_X = np.hstack([new_X, OP_DICT[op_key]['op_np'](X[:, i]).reshape(X.shape[0], 1)])
-                         new_names += (OP_DICT[op_key]['op'](feature_names[i]), )
-                         if verbose>1: print(new_names[-1])
-                 elif OP_DICT[op_key]['inputs'] == 2:
-                     pairings = combinations if OP_DICT[op_key]['commutative'] else permutations
-                     for idx1, idx2 in pairings(range(len(feature_names)), 2):
-                         if verbose and ((len(feature_names) + len(new_names)) % self.printrate == 0): print('Created {0} Features'.format(len(feature_names) + len(new_names)))
-                         # make sure at least one of the features if from the new features
-                         if idx1 >= prev_start or idx2 >= prev_start:
-                             new_col = OP_DICT[op_key]['op_np'](X[:, idx1], X[:, idx2]).reshape(X.shape[0], 1).reshape(X.shape[0], 1)
-                             new_X = new_col if len(new_names) == 0 else np.hstack([new_X,new_col])
-                             new_name = OP_DICT[op_key]['op'](feature_names[idx1],feature_names[idx2])
-                             new_names += (new_name, )
-                             if verbose > 1: print(new_name)
-             if new_names == ():
-                 return self.FE_aux(X = X, feature_names=feature_names, rung=rung-1, prev_start=len(feature_names), max_rung=max_rung, verbose=verbose)
+             if verbose: print('Applying round {0} of feature transformations'.format(crung+1))
+             # if verbose: print('Estimating the creation of {0} features this iteration'.format(self.estimate_workload(p=X.shape[1], max_rung=1)))
+
+             new_names, new_symbols, new_X = None, None, None
+
+             for (op_key, rung_range) in self.ops:
+                 if crung in rung_range:
+                     if verbose>1: print('Applying operator {0} to {1} features'.format(op_key, X.shape[1]))
+                     op_params = OP_DICT[op_key]
+                     op_sym, op_np, inputs, comm = op_params['op'], op_params['op_np'], op_params['inputs'], op_params['commutative']
+                     if inputs == 1:
+                         sym_vect = np.vectorize(op_sym)
+                         new_op_symbols = sym_vect(symbols[prev_p:])
+                         new_op_X = op_np(X[:, prev_p:])
+                         new_op_names = str_vectorize(new_op_symbols)
+                         new_names, new_symbols, new_X = self.add_new(new_names=new_names, new_symbols=new_symbols, new_X=new_X,
+                                                                      new_name=new_op_names, new_symbol=new_op_symbols, new_X_i=new_op_X, verbose=verbose)
+                     elif inputs == 2:
+                         for idx1 in range(prev_p, X.shape[1]):
+                             sym_vect = np.vectorize(lambda idx2: op_sym(symbols[idx1], symbols[idx2]))
+                             idx2 = range(idx1 if comm else X.shape[1])
+                             if len(idx2) > 0:
+                                 new_op_symbols = sym_vect(idx2)
+                                 new_op_names = str_vectorize(new_op_symbols)
+                                 X_i = X[:, idx1]
+                                 new_op_X = X_i[:, np.newaxis]*X[:, idx2]
+                                 new_names, new_symbols, new_X = self.add_new(new_names=new_names, new_symbols=new_symbols, new_X=new_X,
+                                                                              new_name=new_op_names, new_symbol=new_op_symbols, new_X_i=new_op_X, verbose=verbose)
+             if not(new_names is None):
+                 names = np.concatenate((names, new_names))
+                 symbols = np.concatenate((symbols, new_symbols))
+                 prev_p = X.shape[1]
+                 X = np.hstack([X, new_X])
              else:
-                 return self.FE_aux(X = np.hstack([X, new_X]), feature_names=feature_names+new_names, rung=rung-1, prev_start=len(feature_names), max_rung=max_rung, verbose=verbose)
-
+                 prev_p = X.shape[1]
+
+             if verbose: print('After applying rounds {0} of feature transformations there are {1} features'.format(crung+1, X.shape[1]))
+             if verbose: print('Removing redundant features leaves... ', end='')
+             symbols, names, X = self.remove_redundant_features(X=X, names=names, symbols=symbols)
+             if verbose: print('{0} features'.format(X.shape[1]))
+
+             return self.expand_aux(X=X, names=names, symbols=symbols, crung=crung+1, prev_p=prev_p, verbose=verbose)
+
  if __name__ == "__main__":
+     from sklearn.model_selection import train_test_split
      random_state = 0
-     n = 100
-     p = 10
-     rung = 3
-     s = 5
-     d = 4
-
      np.random.seed(random_state)
-     X_train = np.random.normal(size=(n, p))
-
-     y = lambda X: X[:, 0] + 2*X[:, 1]**2 - X[:, 0]*X[:, 1] + 3*X[:, 2]**3
-     y_train = y(X_train)
-
-     # Initialise and fit the ICL model
-     FE = PolynomialFeaturesICL(rung=rung, include_bias=False)
-     so = AdaptiveLASSO(gamma=1, fit_intercept=False)
-     information_criteria='BIC'
-
-     X_train_transformed = FE.fit_transform(X_train, y)
-     feature_names = FE.get_feature_names_out()
-
-     icl = ICL(s=s, so=so, d=d, fit_intercept=True, normalize=True, pool_reset=False, information_criteria=information_criteria)
-     icl.fit(X_train_transformed, y_train, feature_names=feature_names, verbose=True, track_intermediates=True)
-
-     # Compute the train and test error and print the model to verify that we have reproduced the data generating function
-     print(icl)
-     print(icl.__repr__())
-
-     y_hat_train = icl.predict(X_train_transformed)
-
-     print("Train rmse: " + str(rmse(y_hat_train, y_train)))
-
-     X_test = np.random.normal(size=(100*n, p))
-     X_test_transformed = FE.transform(X_test)
-     y_test = y(X_test)
-     y_hat_test = icl.predict(X_test_transformed)
-     print("Test rmse: " + str(rmse(y_hat_test, y_test)))
-     print("k={0}".format(len(icl.coef_[0])))
-
-     # print(icl.intermediates)
+     n, p = 10000, 10
+     X = np.random.random(size=(n,p))
+     y = np.sqrt(X[:, 0]) - np.cbrt(X[:, 0]) + X[:, 0]**3 - np.log(X[:, 0]) + np.sin(X[:, 0]) + 1
+     names = ['X_{0}'.format(i) for i in range(p)]
+
+     rung = 1
+     small = ['sin', 'cos', 'log', 'abs', 'sqrt', 'cbrt', 'sq', 'cb', 'inv']
+     big = ['six_pow', 'exp', 'add', 'mul', 'div', 'abs_diff']
+     small = [(op, range(rung)) for op in small]
+     big = [(op, range(1)) for op in big]
+     ops = small+big
+
+     FE = FeatureExpansion(rung=rung, ops=ops)
+     Phi_names, Phi_symbols, Phi_ = FE.expand(X=X, names=names, check_pos=True, verbose=True)
+     X_train, X_test, y_train, y_test = train_test_split(Phi_, y, test_size=0.2, random_state=random_state)
+     for i, s in enumerate([5]):
+         icl = ICL(s=s, so=AdaptiveLASSO(gamma=1), k=5, fit_intercept=True, normalize=True, optimize_k=False, track_intermediates=True)
+         icl.fit(X=X_train, y=y_train, feature_names = Phi_names, verbose=False)
+         print(icl.repr_ensemble())
icol-0.7.4.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: icol
- Version: 0.1.6
+ Version: 0.7.4
  Summary: Iterative Correlation Learning implementation
  Author-email: Simon Teshuva <simon.teshuva@gmail.com>
  License: MIT
@@ -13,7 +13,7 @@ Requires-Dist: scikit-learn>=1.2.2
  # icol
  ** Iterative Correlation Learning in Python **

- `icol` allows one to fit extremly sparse linear models from very high dimensional datasets in a computationally efficient manner. Given a feature transformation, it can also be used to fit Symbolic Regression models
+ `icol` allows one to fit extremly sparse linear models from very high dimensional datasets in a computationally efficient manner. We also include two feature expansion methods, allowing icol to be used as a Symbolic Regression tool.

  ---
icol-0.7.4.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
+ icol/__init__.py,sha256=nnhJPjnFCpho8OB-5q-Mq8J91EeCV_o3KVO-lLC8tQY,173
+ icol/icol.py,sha256=59HIf4VKznrTKMVI46iz6eRXGLvvSfbGS1lQoLlJT1c,42179
+ icol-0.7.4.dist-info/LICENSE,sha256=aD00NFSvGfojy-IWFmtKpeSg262O0dWzmsfXAaT0xuk,1070
+ icol-0.7.4.dist-info/METADATA,sha256=ZE20mOaTldgxJtiMOHyVOsh23VjDIKk8r_Tmo8JHwGM,1977
+ icol-0.7.4.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+ icol-0.7.4.dist-info/top_level.txt,sha256=OKisIKQUWtt2x-hxR53qbTr2AR3kdeRfTChIdmn2sDY,5
+ icol-0.7.4.dist-info/RECORD,,
icol-0.1.6.dist-info/RECORD DELETED
@@ -1,7 +0,0 @@
- icol/__init__.py,sha256=nnhJPjnFCpho8OB-5q-Mq8J91EeCV_o3KVO-lLC8tQY,173
- icol/icol.py,sha256=Yh3xf64Z4vjo0aFiHUgTxAhVylNtZbyWHe3_4b6fnN8,28387
- icol-0.1.6.dist-info/LICENSE,sha256=aD00NFSvGfojy-IWFmtKpeSg262O0dWzmsfXAaT0xuk,1070
- icol-0.1.6.dist-info/METADATA,sha256=CexfevglpUbzgZUrINQ5GW38fj1YJsh2_GPwFO00SNs,1960
- icol-0.1.6.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
- icol-0.1.6.dist-info/top_level.txt,sha256=OKisIKQUWtt2x-hxR53qbTr2AR3kdeRfTChIdmn2sDY,5
- icol-0.1.6.dist-info/RECORD,,