icol 0.8.5__py3-none-any.whl → 0.9.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- icol/feature_expansion.py +321 -0
- icol/icol.py +2 -622
- icol/logistic_icol.py +230 -956
- {icol-0.8.5.dist-info → icol-0.9.2.dist-info}/METADATA +1 -1
- icol-0.9.2.dist-info/RECORD +9 -0
- icol-0.8.5.dist-info/RECORD +0 -8
- {icol-0.8.5.dist-info → icol-0.9.2.dist-info}/LICENSE +0 -0
- {icol-0.8.5.dist-info → icol-0.9.2.dist-info}/WHEEL +0 -0
- {icol-0.8.5.dist-info → icol-0.9.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
|
|
3
|
+
from time import time
|
|
4
|
+
from copy import deepcopy
|
|
5
|
+
from itertools import combinations, permutations
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import sympy as sp
|
|
9
|
+
|
|
10
|
+
from sklearn.preprocessing import PolynomialFeatures
|
|
11
|
+
from sklearn.base import clone
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Registry of the primitive operators available for feature expansion.
# Each entry maps an operator key (as referenced by FeatureExpansion.ops) to:
#   'op'          - the sympy implementation, used to build symbolic feature
#                   expressions and, via str(), feature names,
#   'op_np'       - the numpy implementation, used to compute feature values,
#   'inputs'      - arity: 1 for unary operators, 2 for binary,
#   'commutative' - for binary ops, whether op(a, b) == op(b, a); the
#                   expansion uses this to skip mirrored argument pairs.
#                   Unary ops are all tagged True (the flag is unused for them),
#   'cares_units' - False for every entry here; presumably a hook for
#                   unit-aware expansion -- TODO confirm against callers.
OP_DICT = {
    'sin': {
        'op': sp.sin,
        'op_np': np.sin,
        'inputs': 1,
        'commutative': True,
        'cares_units': False
    },
    'cos': {
        'op': sp.cos,
        'op_np': np.cos,
        'inputs': 1,
        'commutative': True,
        'cares_units': False
    },
    # NOTE: np.log of non-positive values yields nan/-inf; such columns are
    # filtered out downstream (see FeatureExpansion.add_new's validity mask).
    'log': {
        'op': sp.log,
        'op_np': np.log,
        'inputs': 1,
        'commutative': True,
        'cares_units': False
    },
    'exp': {
        'op': sp.exp,
        'op_np': np.exp,
        'inputs': 1,
        'commutative': True,
        'cares_units': False
    },
    'abs': {
        'op': sp.Abs,
        'op_np': np.abs,
        'inputs': 1,
        'commutative': True,
        'cares_units': False
    },
    'sqrt': {
        'op': sp.sqrt,
        'op_np': np.sqrt,
        'inputs': 1,
        'commutative': True,
        'cares_units': False
    },
    # Cube root. NOTE(review): np.power(x, 1/3) returns nan for negative x
    # (fractional power of a negative base), unlike np.cbrt which is real-valued;
    # nan columns are later dropped by add_new, so negative inputs silently
    # lose their cbrt feature -- confirm this is intended.
    'cbrt': {
        'op': lambda x: sp.Pow(x, sp.Rational(1, 3)),
        'op_np': lambda x: np.power(x, 1/3),
        'inputs': 1,
        'commutative': True,
        'cares_units': False
    },
    # Square.
    'sq': {
        'op': lambda x: sp.Pow(x, 2),
        'op_np': lambda x: np.power(x, 2),
        'inputs': 1,
        'commutative': True,
        'cares_units': False
    },
    # Cube.
    'cb': {
        'op': lambda x: sp.Pow(x, 3),
        'op_np': lambda x: np.power(x, 3),
        'inputs': 1,
        'commutative': True,
        'cares_units': False
    },
    # Sixth power.
    'six_pow': {
        'op': lambda x: sp.Pow(x, 6),
        'op_np': lambda x: np.power(x, 6),
        'inputs': 1,
        'commutative': True,
        'cares_units': False
    },
    # Reciprocal. NOTE: 1/x for x == 0 produces inf (with a numpy warning);
    # affected columns are dropped by add_new's validity mask.
    'inv': {
        'op': lambda x: 1/x,
        'op_np': lambda x: 1/x,
        'inputs': 1,
        'commutative': True,
        'cares_units': False
    },
    'mul': {
        'op': sp.Mul,
        'op_np': np.multiply,
        'inputs': 2,
        'commutative': True,
        'cares_units': False
    },
    # Division, expressed as x * (1/y) in both backends.
    'div': {
        'op': lambda x, y: sp.Mul(x, 1/y),
        'op_np': lambda x, y: np.multiply(x, 1/y),
        'inputs': 2,
        'commutative': False,
        'cares_units': False
    },
    'add': {
        'op': sp.Add,
        'op_np': lambda x, y: x+y,
        'inputs': 2,
        'commutative': True,
        'cares_units': False
    },
    'sub': {
        'op': lambda x, y: sp.Add(x, -y),
        'op_np': lambda x, y: x-y,
        'inputs': 2,
        'commutative': False,
        'cares_units': False
    },
    # |x - y|: symmetric in its arguments, hence commutative.
    'abs_diff': {
        'op': lambda x, y: sp.Abs(sp.Add(x, -y)),
        'op_np': lambda x, y: np.abs(x-y),
        'inputs': 2,
        'commutative': True,
        'cares_units': False
    },
}
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class PolynomialFeaturesICL:
    """Thin adapter around sklearn's PolynomialFeatures.

    Translates the (rung, include_bias) vocabulary used by the ICL feature
    expansion code into sklearn's (degree, include_bias) parameters, and
    delegates the fit/transform API to the wrapped transformer.
    """

    def __init__(self, rung, include_bias=False):
        # 'rung' plays the role of the polynomial degree.
        self.rung = rung
        self.include_bias = include_bias
        self.PolynomialFeatures = PolynomialFeatures(degree=self.rung, include_bias=self.include_bias)

    def __str__(self):
        return 'PolynomialFeatures(degree={0}, include_bias={1})'.format(self.rung, self.include_bias)

    def __repr__(self):
        return self.__str__()

    def fit(self, X, y=None):
        """Fit the wrapped transformer on X; returns self for chaining."""
        self.PolynomialFeatures.fit(X, y)
        return self

    def transform(self, X):
        """Return the polynomial feature expansion of X."""
        return self.PolynomialFeatures.transform(X)

    def fit_transform(self, X, y=None):
        """Fit on X and return its polynomial feature expansion."""
        return self.PolynomialFeatures.fit_transform(X, y)

    def get_feature_names_out(self, input_features=None):
        """Return the names of the expanded output features.

        input_features : optional sequence of input feature names, forwarded
            to sklearn so output names can be expressed in terms of the
            caller's column names. Defaults to None (sklearn's x0, x1, ...),
            so existing zero-argument callers are unaffected.
        """
        # Previously this wrapper dropped sklearn's input_features parameter,
        # making it impossible to get human-readable output names through it.
        return self.PolynomialFeatures.get_feature_names_out(input_features)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
class FeatureExpansion:
    """Iteratively expands a feature matrix by applying the operators in
    OP_DICT for a fixed number of rounds ("rungs").

    Each feature is tracked three ways in parallel: a numeric column of X,
    a sympy expression (symbols), and its string form (names). Duplicate
    features are detected by string-name equality after every round.
    """

    def __init__(self, ops, rung, printrate=1000):
        # ops: list of operator keys (str) or (key, rung_range) pairs;
        #   a bare string is normalized below to apply at every rung.
        # rung: total number of expansion rounds to perform.
        # printrate: minimum number of newly created features between
        #   progress printouts (only used when verbose > 1).
        self.ops = ops
        self.rung = rung
        self.printrate = printrate
        self.prev_print = 0  # feature count at the last progress printout
        for i, op in enumerate(self.ops):
            if type(op) == str:
                # Normalize to (op_key, rungs_at_which_it_applies).
                self.ops[i] = (op, range(rung))

    def remove_redundant_features(self, symbols, names, X):
        """Drop features whose string name duplicates an earlier feature.

        Sorts indices by name, keeps the first index of every run of equal
        names, then restores the surviving columns to their original order.
        Returns the filtered (symbols, names, X) -- note the symbols-first
        return order, which differs from the argument-name order callers
        might expect elsewhere.
        """
        sorted_idxs = np.argsort(names)
        for i, idx in enumerate(sorted_idxs):
            if i == 0:
                unique = [idx]
            elif names[idx] != names[sorted_idxs[i-1]]:
                unique += [idx]
        unique_original_order = np.sort(unique)

        return symbols[unique_original_order], names[unique_original_order], X[:, unique_original_order]

    def expand(self, X, y=None, names=None, verbose=False, f=None, check_pos=False):
        """Run the full expansion on X.

        names: optional per-column feature names; regenerated as x_0..x_{p-1}
            when missing or of the wrong length.
        f: optional path -- when given, the expanded matrix plus y is written
            to CSV via pandas.
        check_pos: when True, columns that are strictly positive get sympy
            symbols with the positive=True assumption (enables e.g. sqrt/log
            simplifications); otherwise plain symbols are used.
        """
        # n (row count) is unused; only p is needed here.
        n, p = X.shape
        if (names is None) or (len(names) != p):
            names = ['x_{0}'.format(i) for i in range(X.shape[1])]

        if check_pos == False:
            # Spaces are not valid in sympy symbol names; replace with '.'.
            symbols = sp.symbols(' '.join(name.replace(' ', '.') for name in names))
        else:
            symbols = []
            for i, name in enumerate(names):
                name = name.replace(' ', '.')
                if np.all(X[:, i] > 0):
                    sym = sp.symbols(name, real=True, positive=True)
                else:
                    sym = sp.symbols(name, real=True)
                symbols.append(sym)

        symbols = np.array(symbols)
        names = np.array(names)

        if verbose: print('Estimating the creation of around {0} features'.format(self.estimate_workload(p=p, max_rung=self.rung, verbose=verbose>2)))

        # NOTE(review): expand_aux returns (symbols, names, X), so this
        # unpacking swaps the two -- after this line 'names' holds the sympy
        # symbols and 'symbols' holds the strings. The tuple returned below
        # therefore carries them in (symbols, names, X) content order despite
        # the variable names. Confirm against callers before changing.
        names, symbols, X = self.expand_aux(X=X, names=names, symbols=symbols, crung=0, prev_p=0, verbose=verbose)

        if not(f is None):
            # Local import: pandas is only needed for the optional CSV dump.
            import pandas as pd
            df = pd.DataFrame(data=X, columns=names)
            df['y'] = y
            df.to_csv(f)

        return names, symbols, X

    def estimate_workload(self, p, max_rung, verbose=False):
        """Estimate (roughly) how many features the expansion will create.

        Tracks p0 (feature count before the previous rung) and p1 (current
        count) and, per operator, adds: p1 for unary ops, ~half the pair
        count for commutative binary ops, and the full pair count otherwise.
        Returns the estimated total after max_rung rungs.
        """
        p0 = 0
        p1 = p
        for rung in range(max_rung):
            if verbose: print('Applying rung {0} expansion'.format(rung))
            new_u, new_bc, new_bn = 0, 0, 0
            for (op, rung_range) in self.ops:
                if rung in rung_range:
                    if verbose: print('Applying {0} to {1} features will result in approximately '.format(op, p1-p0))
                    if OP_DICT[op]['inputs'] == 1:
                        new_u += p1
                        if verbose: print('{0} new features'.format(p1))
                    elif OP_DICT[op]['commutative'] == True:
                        # Unordered pairs among the features new since last rung.
                        new_bc += (1/2)*(p1 - p0 + 1)*(p0 + p1 + 2)
                        if verbose: print('{0} new features'.format((1/2)*(p1 - p0 + 1)*(p0 + p1 + 2)))
                    else:
                        # Ordered pairs: twice the commutative estimate.
                        new_bn += (p1 - p0 + 1)*(p0 + p1 + 2)
                        if verbose: print('{0} new features'.format((p1 - p0 + 1)*(p0 + p1 + 2)))
            p0 = p1
            p1 = p1 + new_u + new_bc + new_bn
            if verbose: print('For a total of {0} features by rung {1}'.format(p1, rung))
        return p1

    def add_new(self, new_names, new_symbols, new_X, new_name, new_symbol, new_X_i, verbose=False):
        """Append freshly generated features to the accumulators.

        Columns containing any nan/+inf/-inf value are dropped (the 'valid'
        mask), which is how failed operations (log of negatives, division by
        zero, ...) are silently discarded. Accumulators start as None and are
        initialized on first use. Returns the updated
        (new_names, new_symbols, new_X).
        """
        # A column is valid only if it contains no nan, +inf, or -inf.
        valid = (np.isnan(new_X_i).sum(axis=0) + np.isposinf(new_X_i).sum(axis=0) + np.isneginf(new_X_i).sum(axis=0)) == 0
        if new_names is None:
            new_names = np.array(new_name[valid])
            new_symbols = np.array(new_symbol[valid])
            new_X = np.array(new_X_i[:, valid])
        else:
            new_names = np.concatenate((new_names, new_name[valid]))
            new_symbols = np.concatenate((new_symbols, new_symbol[valid]))
            new_X = np.hstack([new_X, new_X_i[:, valid]])
        # if (verbose > 1) and not(new_names is None) and (len(new_names) % self.printrate == 0): print('Created {0} features so far'.format(len(new_names)))
        # Throttled progress report: fire once every >= printrate new features.
        if (verbose > 1) and not(new_names is None) and (len(new_names) - self.prev_print >= self.printrate):
            self.prev_print = len(new_names)
            elapsed = np.round(time() - self.start_time, 2)
            print('Created {0} features so far in {1} seconds'.format(len(new_names),elapsed))
        return new_names, new_symbols, new_X

    def expand_aux(self, X, names, symbols, crung, prev_p, verbose=False):
        """One recursion level = one rung of expansion.

        prev_p is the feature count before the previous rung: operators are
        only applied to combinations involving at least one feature created
        in the previous rung, avoiding re-deriving older combinations.
        Returns (symbols, names, X) at the base case -- see the NOTE(review)
        in expand() about the unpack order there.
        """

        str_vectorize = np.vectorize(str)

        # NOTE(review): defined but never called anywhere in this class --
        # dead code, or intended for future use; confirm before removing.
        def simplify_nested_powers(expr):
            # Replace (x**n)**(1/n) with x
            def flatten_pow_chain(e):
                if isinstance(e, sp.Pow) and isinstance(e.base, sp.Pow):
                    base, inner_exp = e.base.args
                    outer_exp = e.exp
                    combined_exp = inner_exp * outer_exp
                    if sp.simplify(combined_exp) == 1:
                        return base
                    return sp.Pow(base, combined_exp)
                elif isinstance(e, sp.Pow) and sp.simplify(e.exp) == 1:
                    return e.base
                return e
            # Apply recursively
            return expr.replace(
                lambda e: isinstance(e, sp.Pow),
                flatten_pow_chain
            )

        if crung == 0:
            # First call: start the progress clock and dedupe the raw inputs.
            self.start_time = time()
            symbols, names, X = self.remove_redundant_features(X=X, names=names, symbols=symbols)
        if crung==self.rung:
            # Base case: all rounds completed.
            if verbose: print('Completed {0} rounds of feature transformations'.format(self.rung))
            return symbols, names, X
        else:
            if verbose: print('Applying round {0} of feature transformations'.format(crung+1))
            # if verbose: print('Estimating the creation of {0} features this iteration'.format(self.estimate_workload(p=X.shape[1], max_rung=1)))

            new_names, new_symbols, new_X = None, None, None

            for (op_key, rung_range) in self.ops:
                if crung in rung_range:
                    if verbose>1: print('Applying operator {0} to {1} features'.format(op_key, X.shape[1]))
                    op_params = OP_DICT[op_key]
                    op_sym, op_np, inputs, comm = op_params['op'], op_params['op_np'], op_params['inputs'], op_params['commutative']
                    if inputs == 1:
                        # Unary: apply only to features created last rung.
                        sym_vect = np.vectorize(op_sym)
                        new_op_symbols = sym_vect(symbols[prev_p:])
                        new_op_X = op_np(X[:, prev_p:])
                        new_op_names = str_vectorize(new_op_symbols)
                        new_names, new_symbols, new_X = self.add_new(new_names=new_names, new_symbols=new_symbols, new_X=new_X,
                                                                     new_name=new_op_names, new_symbol=new_op_symbols, new_X_i=new_op_X, verbose=verbose)
                    elif inputs == 2:
                        # Binary: pair each new feature (idx1) with partners idx2.
                        for idx1 in range(prev_p, X.shape[1]):
                            sym_vect = np.vectorize(lambda idx2: op_sym(symbols[idx1], symbols[idx2]))
                            # Commutative ops only pair with strictly earlier
                            # features (no self-pairing; 'sq' covers x*x);
                            # non-commutative ops pair with every feature.
                            idx2 = range(idx1 if comm else X.shape[1])
                            if len(idx2) > 0:
                                new_op_symbols = sym_vect(idx2)
                                new_op_names = str_vectorize(new_op_symbols)
                                X_i = X[:, idx1]
                                new_op_X = op_np(X_i[:, np.newaxis], X[:, idx2]) #X_i[:, np.newaxis]*X[:, idx2]
                                new_names, new_symbols, new_X = self.add_new(new_names=new_names, new_symbols=new_symbols, new_X=new_X,
                                                                             new_name=new_op_names, new_symbol=new_op_symbols, new_X_i=new_op_X, verbose=verbose)
            if not(new_names is None):
                # Append this rung's surviving features; the old width becomes
                # the next rung's prev_p.
                names = np.concatenate((names, new_names))
                symbols = np.concatenate((symbols, new_symbols))
                prev_p = X.shape[1]
                X = np.hstack([X, new_X])
            else:
                prev_p = X.shape[1]

            if verbose: print('After applying rounds {0} of feature transformations there are {1} features'.format(crung+1, X.shape[1]))
            if verbose: print('Removing redundant features leaves... ', end='')
            symbols, names, X = self.remove_redundant_features(X=X, names=names, symbols=symbols)
            if verbose: print('{0} features'.format(X.shape[1]))

            return self.expand_aux(X=X, names=names, symbols=symbols, crung=crung+1, prev_p=prev_p, verbose=verbose)
|
|
321
|
+
|