DaeFinder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Manu Jayadharan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,46 @@
1
+ Metadata-Version: 2.2
2
+ Name: DaeFinder
3
+ Version: 0.1.0
4
+ Summary: A Python package to discover Differential Algebraic Equations from data.
5
+ Home-page: https://github.com/mjayadharan/DAE-FINDER_dev
6
+ Author: Manu Jayadharan
7
+ Author-email: manu.jayadharan@gmail.com
8
+ License: MIT
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.7
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: numpy
16
+ Requires-Dist: scipy
17
+ Requires-Dist: pandas
18
+ Requires-Dist: sympy
19
+ Requires-Dist: scikit-learn
20
+ Requires-Dist: matplotlib
21
+ Requires-Dist: joblib
22
+ Dynamic: author
23
+ Dynamic: author-email
24
+ Dynamic: classifier
25
+ Dynamic: description
26
+ Dynamic: description-content-type
27
+ Dynamic: home-page
28
+ Dynamic: license
29
+ Dynamic: requires-dist
30
+ Dynamic: requires-python
31
+ Dynamic: summary
32
+
33
+ # DaeFinder
34
+
35
+ DaeFinder is a Python package to discover Differential Algebraic Equations (DAEs) from data.
36
+
37
+ ## Features
38
+ - Solve and analyze toy enzyme kinetics models.
39
+ - Smooth noisy data and calculate derivatives.
40
+ - Generate polynomial features for regression models.
41
+ - Support for sparse feature coupling.
42
+
43
+ ## Installation
44
+ Install the package via pip:
45
+ ```bash
46
+ pip install DaeFinder
@@ -0,0 +1,7 @@
1
+ daeFinder/__init__.py,sha256=ooXd2okg64x6gwxGFXZb5cJG0iTqSwS6XJg1U5wjewo,26
2
+ daeFinder/dae_finder.py,sha256=qOx-Zt5lg4v4SidbapTGbEjPTU4erMCp3XXzePNN9Ew,47943
3
+ DaeFinder-0.1.0.dist-info/LICENSE,sha256=GqjANvOy8FvtSb2tRaa_ylQAeBGInvXIEVkju5WBKlI,1072
4
+ DaeFinder-0.1.0.dist-info/METADATA,sha256=PVvHmbiv33xQkG82kYWf0yuTZzBVSrN9EwoLHt-59KY,1259
5
+ DaeFinder-0.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
6
+ DaeFinder-0.1.0.dist-info/top_level.txt,sha256=1Q7Anr1UEe0tZ_RZ4owQqqTUDsnXqHzNcrL8hrICwGU,10
7
+ DaeFinder-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (75.8.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ daeFinder
daeFinder/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .dae_finder import *
@@ -0,0 +1,1066 @@
1
+ import itertools
2
+
3
+ from sklearn.preprocessing import PolynomialFeatures
4
+ from sklearn.base import BaseEstimator, TransformerMixin
5
+ from sklearn.base import MultiOutputMixin, RegressorMixin
6
+ from sklearn import linear_model
7
+ from sklearn.preprocessing import StandardScaler
8
+ from sklearn.utils.validation import (
9
+ FLOAT_DTYPES,
10
+ _check_feature_names_in,
11
+ check_is_fitted,
12
+ )
13
+
14
+ import pandas as pd
15
+ import numpy as np
16
+ import warnings
17
+ import operator
18
+ from copy import deepcopy
19
+ from itertools import permutations
20
+
21
+ from scipy.integrate import odeint
22
+ from scipy import interpolate
23
+ from scipy.sparse import coo_array
24
+
25
+
26
+ import sympy
27
+ from sympy import prod, Poly
28
+
29
+ import matplotlib.pyplot as plt
30
+
31
+ from joblib import delayed, Parallel
32
+
33
+
34
+ """Data Generation functions
35
+ """
36
+
37
+
38
def toyEnzRHS(y, t, k_rates):
    """
    Right-hand side of the four-state toy enzyme kinetics ODE system.

    :param y: current state, unpacked as (S, E, ES, P).
    :param t: time; present for the ODE-solver callback signature, not used in the rates.
    :param k_rates: mapping providing the rate constants under the keys 'k', 'kr' and 'kcat'.
    :return: list [dS/dt, dE/dt, dES/dt, dP/dt].
    """
    substrate, enzyme, bound, _product = y
    k_on = k_rates['k']
    k_off = k_rates['kr']
    k_cat = k_rates['kcat']

    substrate_rate = k_off * bound - k_on * enzyme * substrate
    enzyme_rate = (k_off + k_cat) * bound - k_on * substrate * enzyme
    bound_rate = k_on * enzyme * substrate - (k_off + k_cat) * bound
    product_rate = k_cat * bound

    return [substrate_rate, enzyme_rate, bound_rate, product_rate]
48
+
49
+
50
def solveToyEnz(init_cond, k_rates, solvedT, tsID, print_to_file=False):
    """
    Integrate the toy enzyme kinetics system with scipy's odeint.

    :param init_cond: mapping with the initial values under the keys "S", "E", "ES" and "P".
    :param k_rates: mapping with the rate constants under 'k', 'kr' and 'kcat'
                    (a dict or a pandas Series both work).
    :param solvedT: sequence of time points handed to odeint.
    :param tsID: string identifier appended to the output file name.
    :param print_to_file: if True, the solution is also written with np.savetxt to
                          'data/toyEnzData_<rates>_<tsID>.txt'.
    :return: the array returned by odeint, with columns ordered (S, E, ES, P).
    """
    y0 = [init_cond["S"], init_cond["E"],
          init_cond["ES"], init_cond["P"]]
    sol = odeint(lambda y, t: toyEnzRHS(y, t, k_rates), y0, solvedT)

    if print_to_file:
        # `values` is a method on a dict but a plain array attribute on a pandas Series.
        # Without the call, a dict would put "<built-in method values ...>" into the file name.
        rate_values = k_rates.values
        if callable(rate_values):
            rate_values = tuple(rate_values())
        # Drop the surrounding parentheses and all whitespace so the label is file-name friendly.
        paramID = "".join(str(rate_values).strip("()").split())
        np.savetxt('data/toyEnzData_' + paramID + '_' + tsID + '.txt', sol)
    return sol
59
+
60
+
61
def toyMM_RHS(y, t, k_rates, IC):
    """
    Right-hand side of the reduced two-state (S, P) model.

    :param y: current state, unpacked as (S, P).
    :param t: time; present for the ODE-solver callback signature, not used in the rates.
    :param k_rates: mapping providing the rate constants under 'k', 'kr' and 'kcat'.
    :param IC: mapping of initial conditions; only IC["E"] is read.
    :return: list [dS/dt, dP/dt]; both entries share one rate expression with opposite signs.
    """
    substrate, _product = y
    enzyme_total = IC["E"]
    k_on = k_rates['k']
    k_off = k_rates['kr']
    k_cat = k_rates['kcat']

    numerator = k_on * k_cat * enzyme_total * substrate
    denominator = k_off + k_cat + k_on * substrate

    return [-numerator / denominator, numerator / denominator]
75
+
76
+
77
def solveMM(init_cond, k_rates, solvedT, tsID, print_to_scr = False, print_to_file=False):
    """
    Integrate the reduced (S, P) model and reconstruct E and ES from the substrate trajectory.

    :param init_cond: mapping with the initial values under the keys "S", "P" and "E".
    :param k_rates: mapping with the rate constants under 'k', 'kr' and 'kcat'
                    (a dict or a pandas Series both work).
    :param solvedT: sequence of time points handed to odeint.
    :param tsID: string identifier appended to the output file name.
    :param print_to_scr: if True, the initial conditions and rates are printed before solving.
    :param print_to_file: if True, the result is also written with np.savetxt under 'data/'.
    :return: array with columns ordered (S, E, ES, P).
    """
    if print_to_scr:
        print("Solving for Initial Conditions: {} \n and k_rates: {}".format(init_cond, k_rates))
    y0 = [init_cond["S"], init_cond["P"]]
    E_0 = init_cond["E"]
    k, kr, kcat = k_rates['k'], k_rates['kr'], k_rates['kcat']
    sol = odeint(lambda y, t: toyMM_RHS(y, t, k_rates, init_cond), y0, solvedT)

    # ES is computed algebraically from S; E is the remainder of the initial enzyme amount.
    ES_sol = k * E_0 * sol[:, 0] / (kr + kcat + k * sol[:, 0])
    E_sol = E_0 - ES_sol

    final_sol = np.column_stack((sol[:, 0], E_sol, ES_sol, sol[:, 1]))

    if print_to_file:
        def _values_label(mapping):
            # `values` is a method on a dict but an array attribute on a pandas Series.
            # Without the call, a dict would put "<built-in method values ...>" into the file name.
            vals = mapping.values
            return str(list(vals()) if callable(vals) else vals)

        np.savetxt('data/MM_Data_' + 'k_' + _values_label(k_rates) + '__'
                   + _values_label(init_cond) + '_' + tsID + '.txt',
                   final_sol)
    return final_sol
96
+
97
def plotToyEnz(solT, sol, title = ""):
    """
    Plot the four solution columns of `sol` against `solT` and show the figure.

    :param solT: values for the horizontal axis.
    :param sol: 2-D array; columns 0..3 are drawn with the labels S(t), E(t), ES(t), P(t).
    :param title: figure title.
    """
    curve_specs = (
        ('-ob', 'S(t)'),
        ('-og', 'E(t)'),
        ('-or', 'ES(t)'),
        ('-ok', 'P(t)'),
    )
    for column, (line_format, curve_label) in enumerate(curve_specs):
        plt.plot(solT, sol[:, column], line_format, label=curve_label, ms=3)
    plt.legend(loc='best')
    plt.xlabel('t')
    plt.grid()
    plt.title(title)
    plt.show()
    return
108
+
109
def plotToy_MM(solT, sol, title =""):
    """
    Plot the first two columns of `sol` against `solT` and show the figure.

    :param solT: values for the horizontal axis.
    :param sol: 2-D array; column 0 is drawn as S(t) and column 1 as P(t).
    :param title: figure title.
    """
    curve_specs = (
        ('-ob', 'S(t)'),
        ('-ok', 'P(t)'),
    )
    for column, (line_format, curve_label) in enumerate(curve_specs):
        plt.plot(solT, sol[:, column], line_format, label=curve_label, ms=3)
    plt.legend(loc='best')
    plt.xlabel('t')
    plt.title(title)
    plt.grid()
    plt.show()
    return
120
+
121
def add_noise_to_df(data_df, noise_perc, make_copy=True,
                    random_seed = None, method= "std"):
    """
    Add zero-mean Gaussian noise to every column of a data frame.

    For method "std", the noise added to a column has standard deviation
    noise_perc/100 * (standard deviation of that column).

    :param data_df: pandas df with columns representing features.
    :param noise_perc: noise level in percent of each column's standard deviation.
    :param make_copy: if True, a deep copy is modified and returned; otherwise data_df
                      itself is modified in place and returned.
    :param random_seed: optional seed passed to np.random.seed. Any value other than None
                        is honoured, including 0.
    :param method: only "std" adds noise; with any other value the data is returned unchanged.
    :return: pandas df with the noisy data.
    """
    # `is not None` rather than truthiness, so that a seed of 0 is not silently ignored.
    if random_seed is not None:
        np.random.seed(random_seed)

    data_df_new = deepcopy(data_df) if make_copy else data_df

    if method == "std":
        std_features = data_df_new.std()
        for feature in data_df_new:
            noise_level = std_features[feature] * noise_perc/100
            data_df_new[feature] += np.random.normal(loc=0.0, scale=noise_level,
                                                     size=data_df_new[feature].shape)

    return data_df_new
142
+
143
+
144
def get_der_names(feature_list, get_list=False):
    """
    Build the derivative label "d(<feature>) /dt" for every feature name.

    :param feature_list: ['A', 'B', 'C'] or any iterable of strings.
    :param get_list: if True, only the labels are returned as a list, in input order;
                     otherwise a dictionary keyed by the original names is returned.
    :return: ['d(A) /dt', ...] or {'A': 'd(A) /dt', ...}.
    """
    name_label_pairs = [(name, "d(" + name + ") /dt") for name in feature_list]
    if get_list:
        return [label for _, label in name_label_pairs]
    return dict(name_label_pairs)
154
+
155
+
156
def der_matrix_calculator(data_matrix, delta_t, rename_feat=True):
    """
    Approximate the time derivative of every column with a first-order forward difference.

    The rows are assumed to be evenly spaced by delta_t; each output row is
    (f(t + delta_t) - f(t)) / delta_t.

    :param data_matrix: pd.DataFrame with features.
    :param delta_t: time difference between subsequent data points; must exceed 1e-10.
    :param rename_feat: if True, the columns are renamed with the derivative labels
                        produced by get_der_names.
    :return: pd.DataFrame with len = len(data_matrix)-1.
    """
    assert delta_t > 1.e-10, "delta_t cannot be too small or negative"

    # Both slices are re-indexed from 0 so that the subtraction aligns row i+1 with row i.
    later_rows = data_matrix.iloc[1:].reset_index(drop=True)
    earlier_rows = data_matrix.iloc[:-1].reset_index(drop=True)
    forward_diff = (later_rows - earlier_rows) / delta_t

    if not rename_feat:
        return forward_diff

    forward_diff.rename(columns=get_der_names(data_matrix.columns), inplace=True)
    return forward_diff
174
+
175
+
176
def der_label(feature, der=1):
    """
    Return the column label used for the `der`-th derivative of `feature`.

    :param feature: feature name.
    :param der: derivative order; 0 returns the feature unchanged, 1 gives "d(x) /dt",
                any other value gives "d^n(x) /dt^n".
    :return: the label.
    """
    if der == 0:
        return feature
    if der == 1:
        return "d({}) /dt".format(feature)
    return "d^{}({}) /dt^{}".format(der, feature, der)
183
+
184
+
185
def smooth_data(data_matrix,
                domain_var="t",
                smooth_method ="spline",
                s_param_=None,
                noise_perc=0,
                derr_order=1,
                eval_points=None,
                num_time_points=0,
                silent =True):
    """
    Smooth every feature column with a smoothing spline and evaluate the fit and its derivatives.

    :param data_matrix: Data matrix to smoothen. nxp data frame structure is assumed where n is the
                        number of data points and p is the number of features (predictors).
    :param domain_var: Domain variable with respect to which the data is smoothened. Default is "t" (time).
    :param smooth_method: Numerical method used for smoothening. Only "spline" is supported.
    :param s_param_: smoothing parameter passed as `s` to scipy's splrep for every feature. If None,
                     a per-feature value num_time_points * (0.01 * noise_perc * std(feature))**2 is used.
                     An explicit 0 is honoured.
    :param noise_perc: optional estimate of the noise to signal ratio in %; only used when s_param_ is None.
    :param derr_order: number of derivatives to evaluate, wrt the domain variable, after smoothening.
    :param eval_points: optional points at which the smoothened data and derivatives are evaluated.
                        If None or empty, num_time_points evenly spaced points between the first and
                        the last value of the domain variable are used.
    :param num_time_points: number of evaluation points; 0 means len(data_matrix).
    :param silent: if False, a message is printed before returning.
    :return: pd.DataFrame whose first column is the domain variable followed, for each feature, by the
             smoothened values and their derivatives up to derr_order.
    :raises ValueError: if smooth_method is not supported.
    """
    assert domain_var in data_matrix, "domain variable not found in the data matrix"
    if smooth_method != "spline":
        # Raising a bare string is itself a TypeError in Python 3; raise a real exception.
        raise ValueError("Smoothening type not supported")

    data_t = data_matrix[domain_var]
    if num_time_points == 0:
        num_time_points = len(data_matrix)
    if eval_points is None or len(eval_points) == 0:
        eval_points = np.linspace(data_t.iloc[0], data_t.iloc[-1], num_time_points)

    feature_matrix = data_matrix.drop(domain_var, axis=1)
    data_matrix_smooth = pd.DataFrame(eval_points, columns=[domain_var])

    # Column standard deviations are only needed for the noise-based default of `s`.
    feature_std = feature_matrix.std() if s_param_ is None else None

    smoothed_columns = []
    column_labels = []
    for feature in feature_matrix:
        if s_param_ is not None:
            s_value = s_param_
        else:
            s_value = num_time_points * (0.01 * noise_perc * feature_std[feature]) ** 2
        # Fit the spline once per feature and reuse it for every derivative order.
        tck = interpolate.splrep(data_t, feature_matrix[feature], s=s_value)
        for der_ind in range(derr_order + 1):
            smoothed_columns.append(interpolate.splev(eval_points, tck, der=der_ind))
            column_labels.append(der_label(feature, der_ind))

    if smoothed_columns:
        smoothened_df = pd.DataFrame(np.column_stack(smoothed_columns), columns=column_labels)
        data_matrix_smooth = pd.concat([data_matrix_smooth, smoothened_df], axis=1)

    if not silent:
        print("Returning the smoothened data")
    return data_matrix_smooth
252
+
253
def remove_paranth_from_feat(feature_list):
    """
    Strip square brackets from feature names.

    A name is only changed when it contains both "[" and "]"; in that case every
    occurrence of either character is removed. All other names are returned unchanged.

    :param feature_list: ["[E]", "[ES]"]
    :return: ["E", "ES"]
    """
    def strip_brackets(name):
        if "[" in name and "]" in name:
            return name.replace("[", "").replace("]", "")
        return name

    return [strip_brackets(name) for name in feature_list]
266
+
267
+
268
def poly_to_scipy(exp_list):
    """
    Rewrite monomial strings into Python expression syntax.

    Every space becomes the product sign "*" and every power symbol "^" becomes "**".

    :param exp_list: ["A^2", "A B^3"]
    :return: ["A**2", "A*B**3"]
    """
    converted = []
    for monomial in exp_list:
        with_products = monomial.replace(" ", "*")
        converted.append(with_products.replace("^", "**"))
    return converted
276
+
277
+
278
def get_factor_feat(factor_exp, feat_dict):
    """
    Return the names of the features that have factor_exp as a factor.

    A feature is selected when dividing its expression by factor_exp leaves a
    denominator equal to 1 according to sympy.fraction.

    factor_exp: sympy expression eg: [ES]**2
    feat_dict : {'[ES]*[S]^2': [ES]*[S]**2}
    """
    divisible_features = []
    for feat_name, feat_sym in feat_dict.items():
        _, denominator = sympy.fraction(feat_sym / factor_exp)
        if denominator == 1:
            divisible_features.append(feat_name)
    return divisible_features
285
+
286
+
287
def get_refined_lib(factor_exp, data_matrix_df_, candidate_library_, get_dropped_feat=False):
    """
    Refine a candidate library by removing every feature that has factor_exp as a factor.

    :param factor_exp: sympy expression eg. S*ES, or a list/set of such expressions.
    :param data_matrix_df_ (pd.DataFrame): data matrix containing all the state variables as column labels
    :param candidate_library_ (pd.DataFrame): candidate library that needs to be refined.
    :param get_dropped_feat: if True, both the dropped features and the refined library are returned,
                             else only the refined library is returned
    :return: the refined library, or the tuple (set of dropped feature names, refined library).
    """
    # Map each bracket-free state-variable name to the sympy symbol carrying the original label.
    # An explicit namespace is used instead of exec() into the function's locals: names created by
    # one exec() call are not visible to a later call from Python 3.13 on, and a single state
    # variable used to be bound to a one-element list instead of a Symbol.
    feat_list = list(data_matrix_df_.columns)
    symbol_namespace = dict(zip(remove_paranth_from_feat(feat_list), sympy.symbols(feat_list)))

    # Converting the monomials in the candidate library to sympy expressions.
    # NOTE(review): the column labels are evaluated as Python expressions (builtins disabled);
    # only use this with trusted column names.
    candid_features = remove_paranth_from_feat(poly_to_scipy(candidate_library_.columns))
    candid_feat_dict = {}
    for feat_name, feat_expr in zip(candidate_library_.columns, candid_features):
        candid_feat_dict[feat_name] = eval(feat_expr, {"__builtins__": {}}, symbol_namespace)

    factors = factor_exp if isinstance(factor_exp, (list, set)) else [factor_exp]
    dropped_feats = set()
    for factor_ in factors:
        dropped_feats.update(get_factor_feat(factor_, candid_feat_dict))

    refined_library = candidate_library_.drop(list(dropped_feats), axis=1)
    if get_dropped_feat:
        return (dropped_feats, refined_library)
    return refined_library
320
+
321
+
322
def get_simplified_equation(best_model_df, feature,
                            global_feature_list, coef_threshold,
                            intercept_threshold= 0.01,
                            intercept=0, simplified=True):
    """
    Turn the fitted model for one feature into sympy expressions for both sides of the relation.

    :param best_model_df: pd.DataFrame whose column `feature` holds the fitted coefficients, indexed
                          by the names of the right-hand-side library terms.
    :param feature: name of the fitted feature (left-hand side), a column of best_model_df.
    :param global_feature_list: names of the state variables appearing in the library terms.
    :param coef_threshold: coefficients with absolute value below this are set to zero.
    :param intercept_threshold: an intercept with absolute value below this is set to zero.
    :param intercept: intercept of the fitted model.
    :param simplified: if True, rhs/lhs is cancelled with sympy and the resulting numerator and
                       denominator are returned as 'rhs' and 'lhs' respectively.
    :return: dictionary {'lhs': sympy expression, 'rhs': sympy expression}.
    """
    # Map each bracket-free state-variable name to the sympy symbol carrying the original label.
    # An explicit namespace replaces exec() into the function's locals, which does not share
    # names between successive exec() calls from Python 3.13 on.
    global_feature_list = list(global_feature_list)
    symbol_namespace = dict(zip(remove_paranth_from_feat(global_feature_list),
                                sympy.symbols(global_feature_list)))

    def _to_sympy(expression_string):
        # NOTE(review): feature names are evaluated as Python expressions (builtins disabled);
        # only use this with trusted names.
        return eval(expression_string, {"__builtins__": {}}, symbol_namespace)

    model_lhs = feature
    model_lhs_sp_string = remove_paranth_from_feat(poly_to_scipy([model_lhs]))[0]

    # Intercept below the threshold is assigned to zero
    intercept = 0 if abs(intercept) < intercept_threshold else intercept

    model_column = best_model_df[model_lhs]
    # Work on a copy: thresholding the array returned by `.values` directly would either
    # overwrite the caller's data frame or fail when that array is read-only.
    model_coefs = model_column.values.copy()
    # Coefficients of features in the model below threshold are eliminated
    model_coefs[abs(model_coefs) < coef_threshold] = 0

    model_rhs_features = remove_paranth_from_feat(poly_to_scipy(model_column.keys()))

    rhs_terms = [str(coef) + "*" + rhs_feature
                 for coef, rhs_feature in zip(model_coefs, model_rhs_features)]
    rhs_string_sp_string = "+".join(rhs_terms) + "+" + str(intercept)

    result_dict = {'lhs': _to_sympy(model_lhs_sp_string),
                   'rhs': _to_sympy(rhs_string_sp_string)}

    if not simplified:
        return result_dict

    n, d = sympy.fraction(sympy.cancel(result_dict['rhs'] / result_dict['lhs']))
    result_dict['lhs'] = d
    result_dict['rhs'] = n
    return result_dict
361
+
362
+
363
def get_simplified_equation_list(best_model_df, global_feature_list,
                                 coef_threshold, intercept_threshold= 0.01,
                                 intercept_dict={}, simplified=True,
                                 feature_list_=[]):
    """
    Apply get_simplified_equation to several fitted features.

    :param best_model_df: pd.DataFrame with one column of fitted coefficients per feature.
    :param global_feature_list: names of the state variables, forwarded unchanged.
    :param coef_threshold: forwarded unchanged.
    :param intercept_threshold: forwarded unchanged.
    :param intercept_dict: {feature: intercept}; features missing from it use intercept 0.
    :param simplified: forwarded unchanged.
    :param feature_list_: features to process; when empty, every column of best_model_df is used.
    :return: {feature: result of get_simplified_equation for that feature}.
    """
    if len(feature_list_) == 0:
        feature_list = best_model_df.columns
    else:
        feature_list = deepcopy(feature_list_)
        assert set(feature_list) <= set(best_model_df.columns), \
            ("fit for some features missing from the best_model_df")

    equations = {}
    for feature in feature_list:
        equations[feature] = get_simplified_equation(
            best_model_df, feature,
            global_feature_list=global_feature_list,
            coef_threshold=coef_threshold,
            intercept_threshold=intercept_threshold,
            intercept=intercept_dict.get(feature, 0),
            simplified=simplified)

    return equations
384
+
385
def sympy_symb_to_feature_name(sympy_symb, library_feat_names):
    """
    Find the library feature name that corresponds to a monomial.

    The monomial is converted to a string, "**" is replaced by "^", and the string is split
    on "*". The factors are then joined with single spaces in every possible order until a
    name contained in library_feat_names is found.

    @param sympy_symb: monomial (anything whose str() has the form "A*B**2").
    @param library_feat_names: container of feature names of the form "A B^2".
    @return: the matching feature name, or None when the monomial is the constant "1".
    @raise Exception: when no ordering of the factors is found in library_feat_names.
    """
    monomial = str(sympy_symb).strip()
    if monomial == "1":
        return None

    factors = monomial.replace("**", "^").split("*")
    for ordering in permutations(factors):
        candidate = " ".join(ordering)
        if candidate in library_feat_names:
            return candidate

    raise Exception("No feature corresponding to {} exist in the given library_df".format(sympy_symb))
405
+
406
+
407
def construct_reduced_fit_list(full_feature_name_list, simplified_eqs,
                               sympy_format=False):
    """
    Collect, for every simplified equation, the monomials appearing on either side.

    :param full_feature_name_list: names of the library features, used to translate the sympy
                                   monomials back to feature names.
    :param simplified_eqs: dictionary whose values are dictionaries with the keys "lhs" and "rhs".
    :param sympy_format: if True, the monomials are returned as sympy expressions; otherwise each
                         one is mapped with sympy_symb_to_feature_name.
    :return: list with one entry per equation: the lhs monomials followed by the rhs monomials.
    """
    def monomials_or_empty(expression):
        # Best effort: a side that cannot be handled contributes no monomials and a warning.
        try:
            poly = Poly(expression)
            return [prod(x ** k for x, k in zip(poly.gens, mon)) for mon in poly.monoms()]
        except Exception as e:
            print("***Warning: exception occured while trying to find the monomials of {}: {}".format(expression, e))
            return []

    relation_list = []
    for simpl_eq in simplified_eqs.values():
        lhs, rhs = simpl_eq["lhs"], simpl_eq["rhs"]
        lhs_monomials = monomials_or_empty(lhs)
        rhs_monomials = monomials_or_empty(rhs)
        relation_list.append(lhs_monomials + rhs_monomials)

    if sympy_format:
        return relation_list

    return [[sympy_symb_to_feature_name(monomial, full_feature_name_list) for monomial in relation]
            for relation in relation_list]
436
+
437
+
438
def compare_models_(models_df_1, models_df_2, tol=1.e-5):
    """
    Utility function to compare the structure of two models. Note that models_df_1 and models_df_2
    should have the same column labels, index labels, and shape. The inputs are not modified.

    @param models_df_1: pd.DataFrame with columns = [LHS of model] index = terms in the RHS of model.
    @param models_df_2: pd.DataFrame with columns = [LHS of model] index = terms in the RHS of model.
    @param tol: tolerance used for comparing model structure; a term counts as present when the
                absolute value of its coefficient exceeds tol.
    @return: pd.DataFrame with the rows of the inputs plus one extra row "# incosistent terms".
             0 appears whenever the term structure matches between the two model df, +1 when a term
             is present in models_df_1 and absent in models_df_2, and -1 when a term is absent in
             models_df_1 and present in models_df_2. The extra row holds, per column, the number of
             terms that differ.
    """
    assert models_df_1.shape == models_df_2.shape, "both model dataframes should be of the same shape"
    assert all(models_df_1.columns == models_df_2.columns) and all(models_df_1.index == models_df_2.index)

    def _term_indicator(models_df):
        # 1.0 where a term is present, 0.0 where it is absent; missing values stay missing.
        # Built as a new frame so that the caller's coefficients are not overwritten.
        return (models_df.abs() > tol).astype(float).where(models_df.notna())

    model_diff_df = _term_indicator(models_df_1) - _term_indicator(models_df_2)

    model_diff_df.loc["# incosistent terms"] = abs(model_diff_df).sum()

    return model_diff_df
466
+
467
+ """
468
+ ------------------------------------------------------------------------------------
469
+ ------------------------------------------------------------------------------------
470
+ """
471
class PolyFeatureMatrix(BaseEstimator, TransformerMixin):
    """
    Generic class to create polynomial library terms. This class is a wrapper around
    sklearn's preprocessing.PolynomialFeatures class with support for pandas data frame.
    """
    def __init__(self, degree=2, interaction_only=False, include_bias=True, output_df=True):
        """
        :param degree: forwarded to PolynomialFeatures.
        :param interaction_only: forwarded to PolynomialFeatures.
        :param include_bias: forwarded to PolynomialFeatures.
        :param output_df: if True, transform returns a pd.DataFrame labelled with the generated
                          feature names; otherwise the raw output of PolynomialFeatures.
        """
        self.degree = degree
        self.interaction_only = interaction_only
        self.include_bias = include_bias
        self.output_df = output_df
        self.poly_feature = PolynomialFeatures(degree=self.degree,
                                               interaction_only=self.interaction_only,
                                               include_bias=self.include_bias)

    def fit(self, X, y=None):
        """Fit the wrapped PolynomialFeatures on X and return self; y is ignored."""
        # The wrapped transformer is created in __init__, so hyper-parameters changed afterwards
        # (set_params, attribute assignment, a parameter search) would otherwise never reach it.
        self.poly_feature.set_params(degree=self.degree,
                                     interaction_only=self.interaction_only,
                                     include_bias=self.include_bias)
        self.poly_feature.fit(X)
        return self

    def transform(self, X, y=None):
        """Return the polynomial features of X, as a DataFrame when output_df is True."""
        poly_data_matrix = self.poly_feature.transform(X)
        if not self.output_df:
            return poly_data_matrix
        return pd.DataFrame(poly_data_matrix, columns=self.poly_feature.get_feature_names_out())
496
+
497
+ """
498
+ ------------------------------------------------------------------------------------
499
+ ------------------------------------------------------------------------------------
500
+ """
501
class FeatureCouplingTransformer(TransformerMixin, BaseEstimator):
    """
    Transformer class for generating features (candidate library functions) derived from coupling between features.
    The coupling between features can either be implied from a sparsity matrix (preferred), or can be explicitly
    provided to the constructor.

    Coupling behavior of the features can be explicitly fed to the constructor using the coupling_func argument. If no
    coupling_func is provided, a second order interaction of the form feature_1*feature_2 is assumed.

    Examples
    ---------
    Case 1: No coupling_func is provided (so default interaction coupling is assumed)

    data_matrix_ = pd.DataFrame([[1,2,3], [4,5,6]], columns = ["t", "x", "y"])
    row  = np.array([0, 0, 1, 1])
    col  = np.array([0, 2, 2, 1])
    data = np.array([4, 5, 7, 5])
    sparsity_matrix = coo_array((data, (row, col)))
    coupling_transf = FeatureCouplingTransformer(sparsity_matrix)
    transformed_features = coupling_transf.fit_transform(data_matrix_)
    print(coupling_transf.get_feature_names_out())
    output: array(['t*t', 't*y', 'x*y', 'x*x'], dtype=object)

    Case 2: Coupling function is provided.

    data_matrix_ = pd.DataFrame([[1,2,3], [4,5,6]], columns = ["t", "x", "y"])
    row  = np.array([0, 0, 1, 1])
    col  = np.array([0, 2, 2, 1])
    data = np.array([4, 5, 7, 5])
    sparsity_matrix = coo_array((data, (row, col)))
    def coup_fun(x,y,i,j,k=0):
        return x-y-k
    coupling_transf = FeatureCouplingTransformer(sparsity_matrix,
                                                 coupling_func= coup_fun,
                                                 coupling_namer= lambda x,y,i,j,k : "{}-{}-{}".format(x,y,k),
                                                 coupling_func_args={"k":2})
    transformed_features = coupling_transf.fit_transform(data_matrix_)
    print(coupling_transf.get_feature_names_out())
    output: array(['t-t-2', 't-y-2', 'x-y-2', 'x-x-2'], dtype=object)

    """

    def __init__(self, sparsity_matrix=None, coupled_indices_list=None,
                 coupling_func=None, coupling_namer=None,
                 coupling_func_args=None, return_df=False):
        """
        Note that if coupled indices list is not explicitly given to the constructor, a valid sparsity matrix
        from which the coupled indices can be implied should be provided.

        @param sparsity_matrix: Sparsity matrix in the scipy.sparse.coo_array format (preferred over directly
                                providing coupled_indices_list)
        @param coupled_indices_list: List of tuples [(i,j)] which shows coupling between factors with indices i and j
        @param coupling_func: Custom function to define coupling between features. Note that the coupling_func function
                              should have arguments (feature_1_value,feature_2_value, i, j) as the first four arguments.
        @param coupling_namer: Custom function to name the feature corresponding to each coupling. Note that the
                               coupling_namer function should have arguments (feature_1_name,feature_2_name, i, j)
                               as the first four arguments.
        @param coupling_func_args: optional dictionary of keyword arguments passed to both the coupling_func and the
                                   coupling_namer functions. None (default) means no extra arguments.
        @param return_df: bool flag to output pandas DataFrame instead of numpy array. False by default
        """

        if not coupled_indices_list:
            assert isinstance(sparsity_matrix, coo_array), "FeatureCouplingTransformer only support sparsity matrix\
            in the scipy.sparse.coo_array format"
        self.sparsity_matrix = sparsity_matrix
        self.coupled_indices_list = coupled_indices_list
        if not coupling_func:
            # If coupling function is not given, it is defined as the interaction term feature_1*feature_2
            self.coupling_func = lambda x, y, i, j: x * y
        else:
            self.coupling_func = coupling_func

        if not coupling_namer:
            self.coupling_namer = lambda feature_1, feature_2, i, j: "{}*{}".format(feature_1, feature_2)
        else:
            self.coupling_namer = coupling_namer

        self.coupling_func_args = coupling_func_args
        self.return_df = return_df
        # n_features_in_ and feature_names_in_ are deliberately not created here: check_is_fitted
        # treats any attribute with a trailing underscore as proof that fit() has run.

    def _coupling_kwargs(self):
        """Return the extra keyword arguments for coupling_func / coupling_namer as a dictionary."""
        return self.coupling_func_args or {}

    def get_feature_names_out(self, input_features=None):
        """
        Get output feature names for transformation.

        @param input_features: - If `input_features is None`, then `feature_names_in_` is
                used as feature names in. If `feature_names_in_` is not defined,
                then the following input feature names are generated:
                `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
                - If `input_features` is an array-like, then `input_features` must
                match `feature_names_in_` if `feature_names_in_` is defined.
                It is recommended that the coupling between features are given using a sparsity matrix
                instead of coupling indices.
        @return: feature_names_out : ndarray of str objects
                Transformed feature names.
        """

        check_is_fitted(self)
        input_features = _check_feature_names_in(self, input_features)

        extra_kwargs = self._coupling_kwargs()
        feature_names = [self.coupling_namer(input_features[i], input_features[j], i, j, **extra_kwargs)
                         for i, j in self.coupled_indices_list]

        return np.asarray(feature_names, dtype=object)

    def fit(self, X, y=None):
        """
        Record the input layout and, if needed, derive the coupled index pairs.

        @param X: pandas DataFrame (its `columns` attribute is read).
        @param y: ignored.
        @return: self
        """

        self.n_features_in_ = X.shape[1]
        self.feature_names_in_ = X.columns if len(X.columns) > 0 else None
        if not self.coupled_indices_list:  # sparsity matrix gives the coupling indices
            assert max(self.sparsity_matrix.col.max(), self.sparsity_matrix.row.max()) <= self.n_features_in_ - 1, \
                "sparsity matrix has indices out of bound of the number of features"
            # Extracting the indices that has coupling with each other.
            self.coupled_indices_list = list(zip(self.sparsity_matrix.row, self.sparsity_matrix.col))

        return self

    def transform(self, X):
        """Transform data to output the coupled features

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data to transform, row by row.

        Returns
        -------
        XP : {ndarray, sparse matrix} of shape (n_samples, NS)
            The matrix of features, where `NS` is the number of non-zero
            connections implied from the sparsity matrix. NS = len(self.get_feature_names_out())
        """
        check_is_fitted(self)

        validation_kwargs = dict(order="F", dtype=FLOAT_DTYPES, reset=False, accept_sparse=("csr", "csc"))
        if hasattr(self, "_validate_data"):
            X = self._validate_data(X, **validation_kwargs)
        else:
            # Newer scikit-learn releases expose the validation as a function instead of a method.
            from sklearn.utils.validation import validate_data
            X = validate_data(self, X, **validation_kwargs)

        X_transpose = X.T
        extra_kwargs = self._coupling_kwargs()
        X_coupled = np.vstack([self.coupling_func(X_transpose[i], X_transpose[j], i, j, **extra_kwargs)
                               for i, j in self.coupled_indices_list]).T
        if self.return_df:
            return pd.DataFrame(X_coupled, columns=self.get_feature_names_out())

        return X_coupled
648
+
649
+ """
650
+ ------------------------------------------------------------------------------------
651
+ ------------------------------------------------------------------------------------
652
+ """
653
+
654
class AlgModelFinder(BaseEstimator):
    """
    Class that helps with finding algebraic relationships between features (columns)
    of a data matrix.

    - Several prebuilt model choices: "lasso", "RR" (ridge) and "LR" (plain
      linear regression).
    - Can work with custom models that support ``.fit()``, ``.score()``,
      ``.coef_``, ``.intercept_`` and ``.feature_names_in_``. Simply pass the
      custom model object to the constructor.
    - Choice to scale columns and scale back the fitted coefficients accordingly.
    - Selection of the best 'n' models using different metrics. "r2" (stored
      from fitting) and "mse" (on test data) are prebuilt.
    """

    def __init__(self, model_id="lasso",
                 custom_model=False,
                 custom_model_ob=None,
                 alpha=0.1,
                 fit_intercept=False
                 ):
        """
        model_id -> key of the prebuilt model to use: "lasso", "RR" or "LR".
        custom_model -> if True, ``custom_model_ob`` is used instead of a prebuilt model.
        custom_model_ob -> already-instantiated model object (used as-is).
        alpha -> regularisation strength passed to the prebuilt model (ignored for "LR").
        fit_intercept -> passed to the prebuilt model.
        """
        self.model_id_dict = {"lasso": linear_model.Lasso,
                              "RR": linear_model.Ridge,
                              "LR": linear_model.LinearRegression}
        if custom_model:
            assert custom_model_ob is not None, \
                "custom_model_ob must be provided when custom_model is True"
        else:
            assert model_id in self.model_id_dict, \
                "model_id {} is not supported. Only {} is supported".format(
                    model_id, set(self.model_id_dict))

        self.custom_model = custom_model
        self.model_id = model_id
        self.fit_intercept = fit_intercept
        self.alpha = alpha
        self.is_fit = False
        # {feature: R2_score} obtained from fitting each feature against rest
        self.r2_score_dict = {}
        # {feature: {library_term: coefficient}} in the (possibly scaled) fitting space
        self.__fitted_models = {}
        # {feature: intercept} in the (possibly scaled) fitting space
        self.__fitted_model_intercepts = {}
        self.fitted_models_unscaled = {}
        self.column_scaled = False

        no_contraint_models = {"LR"}
        self.custom_model_ob = custom_model_ob
        if custom_model:
            self.model = custom_model_ob
        elif self.model_id in no_contraint_models:
            # Instantiating the model object (no regularisation parameter)
            self.model = self.model_id_dict[self.model_id](fit_intercept=self.fit_intercept)
        else:
            self.model = self.model_id_dict[self.model_id](alpha=self.alpha,
                                                           fit_intercept=self.fit_intercept)
        self.column_scales = None

    def fit_and_score(self, feature_, X_scaled_, feature_to_library_map_, model=None):
        """
        Fit one feature against its library terms and score the fit on the same data.

        feature_ -> column of X_scaled_ used as the regression target.
        X_scaled_ -> data matrix (pd.DataFrame) holding target and library columns.
        feature_to_library_map_ -> {feature: library terms to regress against}.
        model -> optional estimator to fit instead of ``self.model``. The parallel
                 path of ``fit`` passes a private copy here so that concurrent
                 workers never fit the same estimator object.

        Returns (coefficients, intercept, score) where coefficients is a
        {library_term: coefficient} dict.
        """
        estimator = self.model if model is None else model
        possible_library_terms = feature_to_library_map_[feature_]
        X_features = X_scaled_[possible_library_terms]
        y_target = X_scaled_[feature_]

        estimator.fit(X=X_features, y=y_target)
        coefficients = dict(zip(estimator.feature_names_in_, estimator.coef_))
        intercept = estimator.intercept_
        score = estimator.score(X_features, y_target)
        return coefficients, intercept, score

    def fit(self,
            X,
            y=None,
            scale_columns=False,
            center_mean=False,
            features_to_fit=None,
            feature_to_library_map_=None,
            coupling_matrix=None,
            parallelize=False,
            num_cpu=4
            ):
        """
        Fit every requested feature against its candidate library terms.

        X -> Data matrix (pd.DataFrame), where each column represents one feature
             from the candidate library.
        y -> unused; accepted for API compatibility.
        scale_columns -> divide the columns by std to get a unit variance for columns.
        center_mean -> additionally subtract the column mean when scaling. A
                       constant column named '1' is restored to 1 afterwards.
        features_to_fit -> List of features to fit against the rest of the library
                           terms. Defaults to every column.
        feature_to_library_map_ -> optional {feature: library terms}. Features that
                                   are missing from the map are fit against every
                                   other column.
        coupling_matrix -> currently unused.
        parallelize -> fit the features with joblib shared-memory workers.
        num_cpu -> number of joblib jobs when parallelize is True.

        Returns self.
        """
        if self.fit_intercept:
            assert "1" not in X, ("Constant column should not be part of the data set if fit_intercept "
                                  "is set to True")
        # Only flag the estimator as fitted once everything below has succeeded.
        self.is_fit = False
        feature_to_library_map = ({} if feature_to_library_map_ is None
                                  else deepcopy(feature_to_library_map_))

        self.__fitted_models = {}
        self.__fitted_model_intercepts = {}
        self.r2_score_dict = {}
        if scale_columns:
            s_scaler = StandardScaler(with_std=True, with_mean=center_mean)
            X_scaled = pd.DataFrame(s_scaler.fit_transform(X), columns=s_scaler.feature_names_in_)
            # Making sure constant term is not removed after mean centering to zero
            if center_mean and '1' in X_scaled:
                X_scaled['1'] = 1
            self.column_scaled = True
            # Use the exact factors the scaler divided by (population std, ddof=0)
            # so that scaling coefficients/intercepts back is consistent with the
            # data the models were fitted on. pandas' X.std() defaults to ddof=1.
            self.column_scales = pd.Series(s_scaler.scale_, index=s_scaler.feature_names_in_)
            # The constant column must never be rescaled.
            self.column_scales['1'] = 1
        else:
            X_scaled = X
            # Drop scaling information left over from a previous scaled fit.
            self.column_scaled = False
            self.column_scales = None

        # Materialise once: the features are iterated several times below, and a
        # bare truth test is ambiguous for pandas Index / numpy arrays.
        features_to_fit = [] if features_to_fit is None else list(features_to_fit)
        if not features_to_fit:
            features_to_fit = list(X_scaled.columns)

        for feature in features_to_fit:
            # If feature to library map is not given, all the members of the universal
            # candidate library will be fit against the feature.
            if feature not in feature_to_library_map:
                possible_library_terms = X_scaled.columns.drop(feature, errors='ignore')
            else:
                possible_library_terms = feature_to_library_map[feature]
                assert set(possible_library_terms) <= set(X_scaled.columns), \
                    ("library terms for feature {} from feature_to_library_map is not found "
                     "in the universal X library".format(feature))
            feature_to_library_map[feature] = possible_library_terms

        if parallelize:
            # Workers share memory, so each task receives its own copy of the
            # estimator; fitting the single self.model concurrently would let one
            # worker read coefficients produced by another worker's fit.
            combined_fit_results_list = Parallel(n_jobs=num_cpu, require='sharedmem')(
                delayed(self.fit_and_score)(feature, X_scaled, feature_to_library_map,
                                            deepcopy(self.model))
                for feature in features_to_fit)
            combined_fit_results = dict(zip(features_to_fit, combined_fit_results_list))
        else:
            combined_fit_results = {
                feature: self.fit_and_score(feature, X_scaled, feature_to_library_map)
                for feature in features_to_fit}

        # Extracting separate dictionaries for coefficients, intercepts, and R² scores
        self.__fitted_models = {feature: result[0] for feature, result in combined_fit_results.items()}
        self.__fitted_model_intercepts = {feature: result[1] for feature, result in combined_fit_results.items()}
        r_2_dict_unsorted = {feature: result[2] for feature, result in combined_fit_results.items()}
        # Stored in ascending order of score.
        self.r2_score_dict = dict(sorted(r_2_dict_unsorted.items(), key=operator.itemgetter(1)))

        self.is_fit = True
        return self

    def best_models(self, num=0, X_test=None, metric="r2",
                    scale_coef=True):
        """
        Return the `num` best fitted models as a DataFrame (one column per model,
        one row per library term, plus a final row holding the metric value).

        num -> number of models to return; values below 1 return all fitted models.
        X_test -> pd.DataFrame of test data; required only when metric == "mse".
        metric -> "r2" uses the scores stored during fitting (higher is better);
                  "mse" is computed on X_test (lower is better).
        scale_coef -> if True, coefficients are scaled back to the original column
                      scales (only has an effect when fit used scale_columns).
        """
        assert self.is_fit, "Models need to be fit to data first"
        if num < 1:  # Output all possible models
            num = len(self.__fitted_models)
        metric_set = {"r2", "mse"}
        assert metric in metric_set, "metric {} is not supported. Only {} is supported".format(metric, metric_set)

        if metric == "r2":  # Use the already computed r_2 scores for selection
            sorted_r2_dict = dict(sorted(self.r2_score_dict.items(), key=operator.itemgetter(1), reverse=True))
            sorted_metric_series = pd.Series(sorted_r2_dict)
        else:  # metric == "mse"
            assert isinstance(X_test, pd.DataFrame) and len(X_test) > 0, "Test data test needed for calculating mse"
            predicted_df = self.predict_features(X_test=X_test,
                                                 feature_list=list(self.__fitted_models),
                                                 scale_coef=scale_coef)
            # Compare only the predicted features; subtracting the full X_test would
            # add NaN entries for columns that were never fitted. A fitted feature
            # that is absent from X_test still yields NaN and is ranked last.
            actual_df = X_test.reindex(columns=predicted_df.columns)
            mse_series = ((predicted_df - actual_df) ** 2).mean()
            sorted_metric_series = mse_series.sort_values(na_position='last')

        top_metrics = sorted_metric_series.iloc[:num]
        fitted_models = self.get_fitted_models(scale_coef=scale_coef)
        best_model_dict = {feature: fitted_models[feature] for feature in top_metrics.index}
        best_model_df = pd.DataFrame(best_model_dict)
        metric_label = metric + "- metric"
        best_model_df.loc[metric_label] = top_metrics.to_dict()
        return best_model_df

    def get_fitted_models(self, scale_coef=True):
        """
        Return {feature: {library_term: coefficient}} for every fitted feature.

        For a column scaled data matrix, the coefficients for
        lhs = Summation(coef * term) are scaled back to the original units as
        coef * (std_of_lhs / std_of_term) when scale_coef is True.
        """
        assert self.is_fit, "Models need to be fit to data first"
        if not (scale_coef and self.column_scaled):
            return self.__fitted_models

        return {
            lib_term: {term: coef * (self.column_scales[lib_term] / self.column_scales[term])
                       for term, coef in model_coefs.items()}
            for lib_term, model_coefs in self.__fitted_models.items()
        }

    def get_fitted_intercepts(self, scale_coef=True):
        """
        Return {feature: intercept} for every fitted feature.

        For a column scaled data matrix, the intercept is scaled back as
        std_of_lhs * intercept when scale_coef is True.
        NOTE(review): this rescaling does not account for the mean shift applied
        when fit was called with center_mean=True - confirm intended usage.
        """
        assert self.is_fit, "Models need to be fit to data first"
        if not (scale_coef and self.column_scaled):
            return self.__fitted_model_intercepts

        return {lib_term: intercept_ * self.column_scales[lib_term]
                for lib_term, intercept_ in self.__fitted_model_intercepts.items()}

    def predict_features(self, X_test, feature_list, scale_coef=True):
        """
        Function to predict the value of each feature in feature_list, where each feature is a
        linear function of columns of X_test.
        :param X_test: Data matrix in pd.DataFrame format.
        :param feature_list: list of features to be predicted. eg. ["E", "ES"]
        :param scale_coef: if True, coefficients and intercepts are scaled back to reflect the
                            initial column scaling of data during fitting.
        :return: pd.DataFrame with one column per entry of feature_list and the index of X_test.
        """
        assert self.is_fit, "Models need to be fit to data first"
        feature_list = list(feature_list)
        assert set(feature_list) <= set(self.__fitted_models.keys()), ("Feature list should be a subset"
                                                                       " of features initially fitted")
        fitted_models = self.get_fitted_models(scale_coef=scale_coef)
        # Intercepts must live in the same (scaled or unscaled) space as the coefficients.
        fitted_intercepts = self.get_fitted_intercepts(scale_coef=scale_coef)
        predictions = {}
        for feature in feature_list:
            coef_features = fitted_models[feature]
            assert set(coef_features.keys()) <= set(X_test.columns), (
                "Data matrix X_test does not have all the feature columns "
                "required for fitting feature {}".format(feature))
            predictions[feature] = sum(coef_value * X_test[coef_feat]
                                       for coef_feat, coef_value in coef_features.items()
                                       ) + fitted_intercepts[feature]

        return pd.DataFrame(predictions, index=X_test.index, columns=feature_list)

    def compare_models(self, true_model_df):
        """
        Method to compare the accuracy of fitted models with true model structure. This method calls the
        utility function compare_models_(self.best_models(), true_model_df) to compare the best models
        after fitting with the true model structure. The true model dataframe should have the same column labels,
        index labels, and shape as the models from self.best_models() .

        @param true_model_df: pd.DataFrame with columns = [LHS of model] index = terms in the RHS of model.
        """
        assert self.is_fit, "Models need to be fit to data first"
        return compare_models_(self.best_models(), true_model_df)
911
+
912
+
913
+
914
+ """
915
+ ------------------------------------------------------------------------------------
916
+ ------------------------------------------------------------------------------------
917
+ """
918
+ class sequentialThLin(MultiOutputMixin, RegressorMixin):
919
+ """
920
+ Model-agnostic implementation of sequential thresholdng to impose l0 sparsity.
921
+ Current support for popular models like linear model with l1 and l2 regularizers, and their combination (ElasticNet). Also has the feature to pass in custom models from the user.
922
+ """
923
+
924
def __init__(
        self,
        model_id="RR",
        custom_model=False,
        custom_model_ob=None,
        custom_model_arg=None,
        alpha=1.0,
        l1_ratio=0.5,
        coef_threshold=0.1,
        fit_intercept=False,
        precompute=False,
        max_iter_thresh=500,
        max_iter_optimizer=1000,
        copy_X=True,
        tol=1e-4,
        warm_start=False,
        positive=False,
        random_state=None,
        selection="cyclic",
):
    """
    Set up the underlying regression model used by sequential thresholding.

    model_id -> prebuilt model key: "lasso", "RR" (ridge), "LR" (linear
                regression) or "EN" (elastic net). Ignored for custom models.
    custom_model -> if True, build the model from custom_model_ob instead.
    custom_model_ob -> callable (e.g. a model class) that is called as
                       custom_model_ob(**custom_model_arg) to create the model.
    custom_model_arg -> dict of keyword arguments for custom_model_ob.
    coef_threshold -> coefficients with absolute value below this are zeroed.
    max_iter_thresh -> maximum number of fit/threshold rounds.
    max_iter_optimizer -> passed as max_iter to the prebuilt model.
    alpha, l1_ratio, fit_intercept, precompute, copy_X, tol, warm_start,
    positive, random_state, selection -> forwarded to the prebuilt model's
        constructor ("EN" receives all of them; "lasso" and "RR" receive all
        except l1_ratio, precompute, warm_start and selection; "LR" only
        receives fit_intercept).
    """
    self.model_id = model_id
    self.custom_model = custom_model
    self.custom_model_arg = custom_model_arg
    if custom_model:
        assert custom_model_ob is not None, \
            "custom_model_ob must be provided when custom_model is True"
        # An empty dict is a valid argument set, so only None is rejected.
        assert custom_model_arg is not None, \
            "custom_model_arg must be provided when custom_model is True"
    self.coef_threshold = coef_threshold
    self.max_iter_thresh = max_iter_thresh
    self.max_iter_optimizer = max_iter_optimizer
    # Plain values: a trailing comma here would silently store 1-tuples.
    self.alpha = alpha
    self.l1_ratio = l1_ratio
    self.fit_intercept = fit_intercept
    self.precompute = precompute
    self.copy_X = copy_X
    self.tol = tol
    self.warm_start = warm_start
    self.positive = positive
    self.random_state = random_state
    self.selection = selection

    # Keyword arguments for the prebuilt models, taken from the constructor
    # arguments so that user-supplied values are actually honoured.
    self.input_arg_dict = {"alpha": alpha,
                           "l1_ratio": l1_ratio,
                           "fit_intercept": fit_intercept,
                           "precompute": precompute,
                           "max_iter": max_iter_optimizer,
                           "copy_X": copy_X,
                           "tol": tol,
                           "warm_start": warm_start,
                           "positive": positive,
                           "random_state": random_state,
                           "selection": selection}

    self.model_id_dict = {"lasso": linear_model.Lasso,
                          "RR": linear_model.Ridge,
                          "LR": linear_model.LinearRegression,
                          "EN": linear_model.ElasticNet}

    no_constrain_model = {"LR"}
    elastic_models = {"EN"}
    # Arguments that are not forwarded to the "lasso" / "RR" constructors.
    non_elastic_excluded = {"l1_ratio", "precompute", "warm_start", "selection"}

    # Two independent model objects are created: one for the thresholding
    # loop in fit() and one that score() refits on the surviving features.
    if self.custom_model:
        self.model = custom_model_ob(**self.custom_model_arg)
        self.model_for_score = custom_model_ob(**self.custom_model_arg)
    else:
        assert model_id in self.model_id_dict, \
            "model_id {} is not supported. Only {} is supported".format(
                model_id, set(self.model_id_dict))
        if self.model_id in no_constrain_model:
            arg_input = {"fit_intercept": bool(fit_intercept)}
        elif self.model_id in elastic_models:
            arg_input = dict(self.input_arg_dict)
        else:
            # Build a filtered copy instead of deleting keys from the stored dict.
            arg_input = {key: value for key, value in self.input_arg_dict.items()
                         if key not in non_elastic_excluded}
        model_class = self.model_id_dict[self.model_id]
        self.model = model_class(**arg_input)
        self.model_for_score = model_class(**arg_input)

    self.is_fit = False

    # Per-iteration history: thresholded coefficients, raw coefficients and intercepts.
    self.coef_history_df = pd.DataFrame()
    self.coef_history_df_pre_thesh = pd.DataFrame()
    self.intercept_history_df = pd.DataFrame()

    self.coef_ = None
    self.feature_names_in_ = None
    self.intercept_ = 0.0
1015
+
1016
def fit(self, X, y=None, solver="auto"):
    """
    Fit by sequential thresholding.

    The underlying model is fitted repeatedly; after every fit the coefficients
    whose absolute value is below ``self.coef_threshold`` are set to zero and
    their columns are removed from the next fit. The loop stops when no
    coefficient is removed, when every coefficient is removed (a warning is
    issued), or after ``self.max_iter_thresh`` rounds.

    X -> pd.DataFrame of candidate features (``X.columns`` is required).
    y -> regression target passed to the underlying model.
    solver -> unused; accepted for API compatibility.

    Sets ``coef_`` (one entry per column of X, zero for removed columns),
    ``intercept_``, ``feature_names_in_`` and the three history DataFrames.
    Returns self. Raises ValueError if ``max_iter_thresh`` is smaller than 1.
    """
    if self.max_iter_thresh < 1:
        # Without at least one round there is no history row to read the result from.
        raise ValueError("max_iter_thresh must be at least 1, got {}".format(self.max_iter_thresh))

    # Only flag the estimator as fitted once the loop below has succeeded.
    self.is_fit = False
    self.coef_history_df = pd.DataFrame(columns=X.columns)
    self.coef_history_df_pre_thesh = pd.DataFrame(columns=X.columns)
    self.intercept_history_df = pd.DataFrame(columns=["1"])

    non_sparse_columns = X.columns
    for ind in range(self.max_iter_thresh):
        X_ind = X[non_sparse_columns]
        self.model.fit(X=X_ind, y=y)
        coef_ind = self.model.coef_
        fitted_names = self.model.feature_names_in_
        # Record the raw coefficients before they are thresholded in place.
        self.coef_history_df_pre_thesh.loc[ind] = dict(zip(fitted_names, coef_ind))

        sparse_index = abs(coef_ind) < self.coef_threshold
        coef_ind[sparse_index] = 0.0
        # Columns that are no longer part of the fit are left as NaN in this row.
        self.coef_history_df.loc[ind] = dict(zip(fitted_names, coef_ind))
        self.intercept_history_df.loc[ind] = {"1": self.model.intercept_}

        non_sparse_columns = non_sparse_columns[~sparse_index]
        if sparse_index.all():  # If all the coef go to zero after thresholding
            warnings.warn("All coefficients fell below threshold {}, please"
                          " lower threshold".format(self.coef_threshold))
            break

        if not sparse_index.any():  # Nothing was removed: the active set is stable
            print("Sequential threshold converged in {} iterations".format(ind))
            break

    # Removed columns are NaN in the last history row; they count as zero.
    final_coefs = self.coef_history_df.iloc[-1].fillna(0.0)
    self.coef_ = final_coefs.values
    self.intercept_ = self.intercept_history_df.iloc[-1]["1"]
    self.feature_names_in_ = np.array(X.columns)
    self.is_fit = True

    return self
1057
+
1058
def score(self, X, y, sample_weight=None):
    """
    Score the sparse model on (X, y).

    The features that survived thresholding in the last fit iteration are
    selected, ``self.model_for_score`` is refitted on those columns of X and
    its ``score`` on the same data is returned.

    X -> pd.DataFrame containing at least the surviving feature columns.
    y -> target values.
    sample_weight -> forwarded to the underlying model's ``score`` when given.

    Returns the underlying model's score, or 0 when no feature survived
    thresholding (i.e. the fitted model is identically zero).
    """
    assert self.is_fit
    last_row = self.coef_history_df.iloc[-1].astype(float)
    # A feature survives when it took part in the last fit (not NaN) and was
    # not zeroed by the threshold. dropna() alone would keep zeroed features.
    survived = last_row.notna() & (last_row.abs() >= self.coef_threshold)
    final_features = last_row.index[survived.to_numpy()]
    if len(final_features) == 0:
        return 0

    X_final = X[final_features]
    self.model_for_score.fit(X=X_final, y=y)
    if sample_weight is None:
        return self.model_for_score.score(X=X_final, y=y)
    return self.model_for_score.score(X=X_final, y=y, sample_weight=sample_weight)