DaeFinder 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Manu Jayadharan
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: DaeFinder
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A Python package to discover Differential Algebraic Equations from data.
|
|
5
|
+
Home-page: https://github.com/mjayadharan/DAE-FINDER_dev
|
|
6
|
+
Author: Manu Jayadharan
|
|
7
|
+
Author-email: manu.jayadharan@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Requires-Python: >=3.7
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE
|
|
15
|
+
Requires-Dist: numpy
|
|
16
|
+
Requires-Dist: scipy
|
|
17
|
+
Requires-Dist: pandas
|
|
18
|
+
Requires-Dist: sympy
|
|
19
|
+
Requires-Dist: scikit-learn
|
|
20
|
+
Requires-Dist: matplotlib
|
|
21
|
+
Requires-Dist: joblib
|
|
22
|
+
Dynamic: author
|
|
23
|
+
Dynamic: author-email
|
|
24
|
+
Dynamic: classifier
|
|
25
|
+
Dynamic: description
|
|
26
|
+
Dynamic: description-content-type
|
|
27
|
+
Dynamic: home-page
|
|
28
|
+
Dynamic: license
|
|
29
|
+
Dynamic: requires-dist
|
|
30
|
+
Dynamic: requires-python
|
|
31
|
+
Dynamic: summary
|
|
32
|
+
|
|
33
|
+
# DaeFinder
|
|
34
|
+
|
|
35
|
+
DaeFinder is a Python package to discover Differential Algebraic Equations (DAEs) from data.
|
|
36
|
+
|
|
37
|
+
## Features
|
|
38
|
+
- Solve and analyze toy enzyme kinetics models.
|
|
39
|
+
- Smooth noisy data and calculate derivatives.
|
|
40
|
+
- Generate polynomial features for regression models.
|
|
41
|
+
- Support for sparse feature coupling.
|
|
42
|
+
|
|
43
|
+
## Installation
|
|
44
|
+
Install the package via pip:
|
|
45
|
+
```bash
|
|
46
|
+
pip install DaeFinder
```
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
daeFinder/__init__.py,sha256=ooXd2okg64x6gwxGFXZb5cJG0iTqSwS6XJg1U5wjewo,26
|
|
2
|
+
daeFinder/dae_finder.py,sha256=qOx-Zt5lg4v4SidbapTGbEjPTU4erMCp3XXzePNN9Ew,47943
|
|
3
|
+
DaeFinder-0.1.0.dist-info/LICENSE,sha256=GqjANvOy8FvtSb2tRaa_ylQAeBGInvXIEVkju5WBKlI,1072
|
|
4
|
+
DaeFinder-0.1.0.dist-info/METADATA,sha256=PVvHmbiv33xQkG82kYWf0yuTZzBVSrN9EwoLHt-59KY,1259
|
|
5
|
+
DaeFinder-0.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
6
|
+
DaeFinder-0.1.0.dist-info/top_level.txt,sha256=1Q7Anr1UEe0tZ_RZ4owQqqTUDsnXqHzNcrL8hrICwGU,10
|
|
7
|
+
DaeFinder-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
daeFinder
|
daeFinder/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .dae_finder import *
|
daeFinder/dae_finder.py
ADDED
|
@@ -0,0 +1,1066 @@
|
|
|
1
|
+
import itertools
|
|
2
|
+
|
|
3
|
+
from sklearn.preprocessing import PolynomialFeatures
|
|
4
|
+
from sklearn.base import BaseEstimator, TransformerMixin
|
|
5
|
+
from sklearn.base import MultiOutputMixin, RegressorMixin
|
|
6
|
+
from sklearn import linear_model
|
|
7
|
+
from sklearn.preprocessing import StandardScaler
|
|
8
|
+
from sklearn.utils.validation import (
|
|
9
|
+
FLOAT_DTYPES,
|
|
10
|
+
_check_feature_names_in,
|
|
11
|
+
check_is_fitted,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
import pandas as pd
|
|
15
|
+
import numpy as np
|
|
16
|
+
import warnings
|
|
17
|
+
import operator
|
|
18
|
+
from copy import deepcopy
|
|
19
|
+
from itertools import permutations
|
|
20
|
+
|
|
21
|
+
from scipy.integrate import odeint
|
|
22
|
+
from scipy import interpolate
|
|
23
|
+
from scipy.sparse import coo_array
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
import sympy
|
|
27
|
+
from sympy import prod, Poly
|
|
28
|
+
|
|
29
|
+
import matplotlib.pyplot as plt
|
|
30
|
+
|
|
31
|
+
from joblib import delayed, Parallel
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
"""Data Generation functions
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def toyEnzRHS(y, t, k_rates):
    """
    Right-hand side of the four-state toy enzyme ODE system.

    :param y: state vector ordered as (S, E, ES, P).
    :param t: time; not used in the rates, present so the function matches the
              callback signature expected by the ODE solver.
    :param k_rates: mapping providing the rate constants under the keys 'k', 'kr' and 'kcat'.
    :return: list [dS/dt, dE/dt, dES/dt, dP/dt].
    """
    substrate, enzyme, bound_complex, _product = y
    k_on, k_off, k_cat = k_rates['k'], k_rates['kr'], k_rates['kcat']

    # Each rate is spelled out on its own line; P itself does not enter any rate.
    d_substrate = k_off * bound_complex - k_on * enzyme * substrate
    d_enzyme = (k_off + k_cat) * bound_complex - k_on * substrate * enzyme
    d_complex = k_on * enzyme * substrate - (k_off + k_cat) * bound_complex
    d_product = k_cat * bound_complex

    return [d_substrate, d_enzyme, d_complex, d_product]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def solveToyEnz(init_cond, k_rates, solvedT, tsID, print_to_file=False):
    """
    Integrate the toy enzyme system over the requested time points.

    :param init_cond: mapping with the initial values under the keys "S", "E", "ES" and "P".
    :param k_rates: rate constants indexed by 'k', 'kr' and 'kcat'; either a dict or an
                    object exposing a ``values`` array attribute (e.g. a pandas Series).
    :param solvedT: sequence of time points handed to ``odeint``.
    :param tsID: string identifier appended to the output file name.
    :param print_to_file: if True, the solution is also written with ``np.savetxt`` to
                          'data/toyEnzData_<paramID>_<tsID>.txt' (the 'data' directory
                          is not created here).
    :return: array returned by ``odeint``; columns follow the order S, E, ES, P.
    """
    y0 = [init_cond["S"], init_cond["E"],
          init_cond["ES"], init_cond["P"]]
    sol = odeint(lambda y, t: toyEnzRHS(y, t, k_rates), y0, solvedT)

    if print_to_file:
        # The identifier is only needed for the file name, so build it lazily.
        rate_values = k_rates.values
        if callable(rate_values):
            # dict-like input: ``values`` is a method, so str() of it would embed the
            # method repr (memory address included) instead of the rate values.
            paramID = ",".join(str(value) for value in rate_values())
        else:
            # array-valued ``values`` (e.g. pandas Series): keep the historical format.
            paramID = "".join(str(rate_values).strip("()").split())
        np.savetxt('data/toyEnzData_' + paramID + '_' + tsID + '.txt', sol)
    return sol
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def toyMM_RHS(y, t, k_rates, IC):
    """
    Right-hand side of the reduced two-state (S, P) model.

    :param y: state vector ordered as (S, P).
    :param t: time; not used in the rates, kept for the ODE-solver callback signature.
    :param k_rates: mapping providing the rate constants under the keys 'k', 'kr' and 'kcat'.
    :param IC: mapping of initial conditions; only the entry "E" is read.
    :return: list [dS/dt, dP/dt]; the two entries are equal in magnitude and opposite in sign.
    """
    substrate, _product = y
    enzyme_total = IC["E"]

    k, kr, kcat = k_rates['k'], k_rates['kr'], k_rates['kcat']

    # Substrate is consumed at exactly the rate at which product is formed.
    conversion_rate = (k * kcat * enzyme_total * substrate) / (kr + kcat + k * substrate)
    return [-conversion_rate, conversion_rate]
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def solveMM(init_cond, k_rates, solvedT, tsID, print_to_scr = False, print_to_file=False):
    """
    Integrate the reduced (S, P) model and reconstruct E and ES from the substrate solution.

    :param init_cond: initial conditions indexed by "S", "P" and "E"; a dict or an object
                      exposing a ``values`` array attribute (e.g. a pandas Series).
    :param k_rates: rate constants indexed by 'k', 'kr' and 'kcat'; same accepted types.
    :param solvedT: sequence of time points handed to ``odeint``.
    :param tsID: string identifier appended to the output file name.
    :param print_to_scr: if True, print the initial conditions and rates before solving.
    :param print_to_file: if True, write the result with ``np.savetxt`` under 'data/'
                          (the directory is not created here).
    :return: array with columns ordered as (S, E, ES, P).
    """
    if print_to_scr:
        print("Solving for Initial Conditions: {} \n and k_rates: {}".format(init_cond, k_rates))
    y0 = [init_cond["S"], init_cond["P"]]
    E_0 = init_cond["E"]
    k, kr, kcat = k_rates['k'], k_rates['kr'], k_rates['kcat']
    sol = odeint(lambda y, t: toyMM_RHS(y, t, k_rates, init_cond), y0, solvedT)

    # ES and E are algebraic functions of the substrate column of the ODE solution.
    ES_sol = k * E_0 * sol[:, 0] / (kr + kcat + k * sol[:, 0])
    E_sol = E_0 - ES_sol

    final_sol = np.column_stack((sol[:, 0], E_sol, ES_sol, sol[:, 1]))

    if print_to_file:
        def _values_text(container):
            # For dict-like input ``values`` is a method; str() of it would put the
            # method repr (memory address included) into the file name.
            raw_values = container.values
            if callable(raw_values):
                return ",".join(str(value) for value in raw_values())
            # array-valued ``values`` (e.g. pandas Series): keep the historical format.
            return str(raw_values)

        np.savetxt('data/MM_Data_' + 'k_' + _values_text(k_rates) + '__'
                   + _values_text(init_cond) + '_' + tsID + '.txt',
                   final_sol)
    return final_sol
|
|
96
|
+
|
|
97
|
+
def plotToyEnz(solT, sol, title = ""):
    """
    Plot the first four columns of a solution array against solT and show the figure.

    :param solT: values for the horizontal axis (labelled 't').
    :param sol: 2-D array; columns 0..3 are drawn with the labels S(t), E(t), ES(t), P(t).
    :param title: optional figure title.
    :return: None
    """
    # (line format, legend label) for solution columns 0, 1, 2, 3 in that order
    curve_specs = (('-ob', 'S(t)'),
                   ('-og', 'E(t)'),
                   ('-or', 'ES(t)'),
                   ('-ok', 'P(t)'))
    for column, (line_format, legend_label) in enumerate(curve_specs):
        plt.plot(solT, sol[:, column], line_format, label=legend_label, ms=3)

    plt.legend(loc='best')
    plt.xlabel('t')
    plt.grid()
    plt.title(title)
    plt.show()
    return
|
|
108
|
+
|
|
109
|
+
def plotToy_MM(solT, sol, title =""):
    """
    Plot the first two columns of a solution array against solT and show the figure.

    :param solT: values for the horizontal axis (labelled 't').
    :param sol: 2-D array; column 0 is drawn as S(t) and column 1 as P(t).
    :param title: optional figure title.
    :return: None
    """
    # (line format, legend label) for solution columns 0 and 1 in that order
    curve_specs = (('-ob', 'S(t)'),
                   ('-ok', 'P(t)'))
    for column, (line_format, legend_label) in enumerate(curve_specs):
        plt.plot(solT, sol[:, column], line_format, label=legend_label, ms=3)

    plt.legend(loc='best')
    plt.xlabel('t')
    plt.title(title)
    plt.grid()
    plt.show()
    return
|
|
120
|
+
|
|
121
|
+
def add_noise_to_df(data_df, noise_perc, make_copy=True,
                    random_seed=None, method="std"):
    """
    Add zero-mean Gaussian noise to every column of a data frame.

    For each feature column the noise standard deviation is
    noise_perc/100 * (standard deviation of that column).

    :param data_df: pandas df with columns representing features.
    :param noise_perc: noise level, in percent of each column's standard deviation.
    :param make_copy: if True (default) the input is left untouched and a noisy copy is
                      returned; if False the noise is added to data_df in place.
    :param random_seed: optional seed passed to ``np.random.seed`` (this reseeds NumPy's
                        global random state). Any value other than None is used, including 0.
    :param method: noise scaling rule; only "std" is supported.
    :return: data frame with the noise added.
    :raises ValueError: if method is not "std".
    """
    if method != "std":
        # Previously an unknown method silently returned noise-free data.
        raise ValueError("Unsupported noise method: {!r}; only 'std' is supported".format(method))

    # ``is not None`` so that a seed of 0 is honoured rather than treated as "no seed".
    if random_seed is not None:
        np.random.seed(random_seed)

    data_df_new = data_df.copy() if make_copy else data_df

    # Standard deviations are taken once, before any column is perturbed.
    std_features = data_df_new.std()
    for feature in data_df_new:
        noise_level = std_features[feature] * noise_perc / 100
        data_df_new[feature] += np.random.normal(loc=0.0, scale=noise_level,
                                                 size=data_df_new[feature].shape)

    return data_df_new
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def get_der_names(feature_list, get_list=False):
    """
    Build the derivative label "d(<feature>) /dt" for every feature name.

    :param feature_list: ['A', 'B', 'C'] or any iterable of strings
    :param get_list: if True return only the labels as a list, in input order;
                     otherwise return a dictionary keyed by the feature names.
    :return: ['d(A) /dt', ...] when get_list is True, else {'A': 'd(A) /dt', ...}.
    """
    labelled_pairs = [(name, "d(" + name + ") /dt") for name in feature_list]
    if get_list:
        return [label for _, label in labelled_pairs]
    return dict(labelled_pairs)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def der_matrix_calculator(data_matrix, delta_t, rename_feat=True):
    """
    Approximate the time derivative of every column with a first-order forward difference.

    Rows are assumed to be evenly spaced by delta_t; row i of the result is
    (row[i+1] - row[i]) / delta_t, so the result has one row less than the input.

    :param data_matrix: pd.DataFrame with features.
    :param delta_t: time difference between subsequent data points; must exceed 1e-10.
    :param rename_feat: if True, columns are renamed to the derivative notation
                        produced by get_der_names.
    :return: pd.DataFrame with len = len(data_matrix)-1 and a fresh 0-based index.
    """
    assert delta_t > 1.e-10, "delta_t cannot be too small or negative"

    # Re-index both slices from zero so the subtraction pairs row i+1 with row i.
    later_rows = data_matrix.iloc[1:].reset_index(drop=True)
    earlier_rows = data_matrix.iloc[:-1].reset_index(drop=True)
    derr_matrix = (later_rows - earlier_rows) / delta_t

    if not rename_feat:
        return derr_matrix
    return derr_matrix.rename(columns=get_der_names(data_matrix.columns))
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def der_label(feature, der=1):
    """
    Return the column label used for the der-th derivative of a feature.

    :param feature: feature name.
    :param der: derivative order; 0 returns the feature name unchanged.
    :return: feature for der == 0, "d(feature) /dt" for der == 1,
             otherwise "d^der(feature) /dt^der".
    """
    if der == 0:
        return feature
    if der == 1:
        return "d({}) /dt".format(feature)
    return "d^{}({}) /dt^{}".format(der, feature, der)
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def smooth_data(data_matrix,
                domain_var="t",
                smooth_method="spline",
                s_param_=None,
                noise_perc=0,
                derr_order=1,
                eval_points=None,
                num_time_points=0,
                silent=True):
    """
    Smooth every feature column with a spline and evaluate the fit and its derivatives.

    :param data_matrix: Data matrix to smoothen. nxp data frame structure is assumed where n is the
                        number of data points and p is the number of features (predictors).
    :param domain_var: Column holding the domain variable with respect to which the data is smoothened.
                       Default is "t" (time).
    :param smooth_method: Numerical method used for smoothening; only "spline" is supported.
    :param s_param_: smoothing parameter passed as ``s`` to ``scipy.interpolate.splrep`` for every
                     feature. When None, a per-feature value
                     num_time_points * (0.01 * noise_perc * std(feature)) ** 2 is used instead.
                     An explicit 0 is honoured.
    :param noise_perc: optional estimate of noise to signal ratio %, used only when s_param_ is None.
    :param derr_order: Number of derivatives, wrt the domain variable, evaluated after smoothening.
    :param eval_points: optional sequence of points at which the smoothened data and derivatives are
                        evaluated. When None or empty, num_time_points evenly spaced points between
                        the first and last value of the domain column are used.
    :param num_time_points: number of evaluation points to generate; 0 means len(data_matrix).
    :param silent: if False, a short message is printed before returning.
    :return: pd.DataFrame whose first column is the evaluation points, followed, for each feature,
             by the smoothed values and the derivatives of order 1..derr_order.
    :raises ValueError: if smooth_method is not "spline".
    """
    assert domain_var in data_matrix, "domain variable not found in the data matrix"
    if smooth_method != "spline":
        # Raising a bare string (as before) is itself a TypeError in Python 3.
        raise ValueError("Smoothening type not supported")

    data_t = data_matrix[domain_var]
    if num_time_points == 0:
        num_time_points = len(data_matrix)
    if eval_points is None or len(eval_points) == 0:
        eval_points = np.linspace(data_t.iloc[0], data_t.iloc[-1], num_time_points)

    feature_matrix = data_matrix.drop(domain_var, axis=1)
    feature_std = feature_matrix.std()

    data_matrix_smooth = pd.DataFrame(eval_points, columns=[domain_var])

    smoothed_columns = []
    column_labels = []
    for feature in feature_matrix:
        if s_param_ is not None:
            s_value = s_param_
        else:
            s_value = num_time_points * (0.01 * noise_perc * feature_std[feature]) ** 2
        # Fit the spline once per feature and reuse it for every derivative order.
        spline_repr = interpolate.splrep(data_t, feature_matrix[feature], s=s_value)
        for der_ind in range(derr_order + 1):
            smoothed_columns.append(interpolate.splev(eval_points, spline_repr, der=der_ind))
            column_labels.append(der_label(feature, der_ind))

    if smoothed_columns:
        smoothened_df = pd.DataFrame(np.column_stack(smoothed_columns), columns=column_labels)
        data_matrix_smooth = pd.concat([data_matrix_smooth, smoothened_df], axis=1)

    if not silent:
        print("Returning the smoothened data")
    return data_matrix_smooth
|
|
252
|
+
|
|
253
|
+
def remove_paranth_from_feat(feature_list):
    """
    Strip square brackets from feature names.

    A name is changed only when it contains both "[" and "]"; in that case every
    occurrence of either character is removed. All other names are returned as they are.

    :param feature_list: ["[E]", "[ES]"]
    :return: new list, e.g. ["E", "ES"]
    """
    def _without_brackets(name):
        if "[" in name and "]" in name:
            return name.replace("[", "").replace("]", "")
        return name

    return [_without_brackets(name) for name in feature_list]
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def poly_to_scipy(exp_list):
    """
    Rewrite monomial strings into Python operator syntax.

    Every space becomes the product operator "*" and every "^" becomes "**".

    :param exp_list: ["A^2", "A B^3"]
    :return: ["A**2", "A*B**3"]
    """
    converted = []
    for monomial in exp_list:
        with_products = monomial.replace(" ", "*")
        converted.append(with_products.replace("^", "**"))
    return converted
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def get_factor_feat(factor_exp, feat_dict):
    """
    Return the feature names whose expression is divisible by factor_exp.

    A feature is kept when the denominator reported by sympy.fraction for
    (feature expression / factor_exp) equals 1.

    factor_exp: sympy expression eg: [ES]**2
    feat_dict : {'[ES]*[S]^2': [ES]*[S]**2}
    """
    divisible_feats = []
    for feat, feat_sym in feat_dict.items():
        _numerator, denominator = sympy.fraction(feat_sym / factor_exp)
        if denominator == 1:
            divisible_feats.append(feat)
    return divisible_feats
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def get_refined_lib(factor_exp, data_matrix_df_, candidate_library_, get_dropped_feat=False):
    """
    Utility function to get the refined library by removing all features in the candidate library
    which have factor_exp as a factor.

    :param factor_exp: sympy expression eg. S*ES, or a list/set/tuple of such expressions.
    :param data_matrix_df_ (pd.DataFrame): data matrix containing all the state variables as column labels
    :param candidate_library_ (pd.DataFrame): candidate library that needs to be refined.
    :param get_dropped_feat: if True, both the dropped features and the refined library are returned,
                             else only the refined library is returned
    :return: refined library (pd.DataFrame), or the tuple (set of dropped feature names,
             refined library) when get_dropped_feat is True.
    """
    # One sympy symbol per state variable. The symbol keeps the full column label (brackets
    # included) while the namespace key is the bracket-free name used inside the expression
    # strings. An explicit namespace replaces the earlier exec()-created local variables,
    # which are not reliable across Python versions, bound a list instead of a symbol when
    # there was a single column, and executed column names as code.
    state_labels = list(data_matrix_df_.columns)
    symbol_namespace = {
        plain_name: sympy.Symbol(label)
        for plain_name, label in zip(remove_paranth_from_feat(state_labels), state_labels)
    }

    # Converting the monomials in the candidate library to sympy expressions
    candidate_labels = list(candidate_library_.columns)
    candidate_exprs = remove_paranth_from_feat(poly_to_scipy(candidate_labels))
    candid_feat_dict = {
        label: sympy.sympify(expr_string, locals=symbol_namespace)
        for label, expr_string in zip(candidate_labels, candidate_exprs)
    }

    if isinstance(factor_exp, (list, set, tuple, frozenset)):
        factors = factor_exp
    else:
        factors = [factor_exp]

    dropped_feats = set()
    for factor_ in factors:
        dropped_feats.update(get_factor_feat(factor_, candid_feat_dict))

    refined_library = candidate_library_.drop(list(dropped_feats), axis=1)
    if get_dropped_feat:
        return (dropped_feats, refined_library)
    return refined_library
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def get_simplified_equation(best_model_df, feature,
                            global_feature_list, coef_threshold,
                            intercept_threshold=0.01,
                            intercept=0, simplified=True):
    """
    Build the sympy form of the fitted relation  feature = sum(coef_i * term_i) + intercept.

    :param best_model_df: pd.DataFrame whose column `feature` holds the fitted coefficients and
                          whose index holds the names of the right-hand-side library terms.
    :param feature: name of the fitted (left-hand-side) feature; must be a column of best_model_df.
    :param global_feature_list: iterable with the names of all state variables appearing in the
                                feature names.
    :param coef_threshold: coefficients with absolute value below this are set to zero.
    :param intercept_threshold: an intercept with absolute value below this is set to zero.
    :param intercept: intercept of the fitted model.
    :param simplified: if True, rhs/lhs is cancelled with sympy and its numerator and
                       denominator are returned as 'rhs' and 'lhs' respectively.
    :return: dict with the keys 'lhs' and 'rhs' holding sympy expressions.
    """
    # One sympy symbol per state variable. The symbol keeps the full label (brackets included)
    # while the namespace key is the bracket-free name used inside the expression strings.
    # An explicit namespace replaces the earlier exec()-created local variables, which are not
    # reliable across Python versions.
    global_feature_list = list(global_feature_list)
    symbol_namespace = {
        plain_name: sympy.Symbol(label)
        for plain_name, label in zip(remove_paranth_from_feat(global_feature_list),
                                     global_feature_list)
    }

    def _to_sympy(feature_name):
        # "[S]^2 [E]" -> "S**2*E" -> sympy expression over the state symbols
        expr_string = remove_paranth_from_feat(poly_to_scipy([feature_name]))[0]
        return sympy.sympify(expr_string, locals=symbol_namespace)

    lhs_expr = _to_sympy(feature)

    # Intercept below the threshold is assigned to zero
    intercept = 0 if abs(intercept) < intercept_threshold else intercept

    model_column = best_model_df[feature]
    # Work on a copy: thresholding through ``.values`` would modify the caller's data frame
    # (and fails outright when pandas hands out a read-only array).
    model_coefs = model_column.to_numpy(copy=True)
    # Coefficients of features in the model below threshold are eliminated
    model_coefs[abs(model_coefs) < coef_threshold] = 0

    rhs_terms = [float(coef) * _to_sympy(term_name)
                 for coef, term_name in zip(model_coefs, model_column.keys())]
    rhs_expr = sympy.Add(*rhs_terms) + sympy.sympify(intercept)

    if not simplified:
        return {'lhs': lhs_expr, 'rhs': rhs_expr}

    numerator, denominator = sympy.fraction(sympy.cancel(rhs_expr / lhs_expr))
    return {'lhs': denominator, 'rhs': numerator}
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
def get_simplified_equation_list(best_model_df, global_feature_list,
                                 coef_threshold, intercept_threshold= 0.01,
                                 intercept_dict={}, simplified=True,
                                 feature_list_=[]):
    """
    Apply get_simplified_equation to several fitted features.

    :param best_model_df: pd.DataFrame with one column of fitted coefficients per feature.
    :param global_feature_list: names of all state variables, forwarded unchanged.
    :param coef_threshold: coefficient threshold, forwarded unchanged.
    :param intercept_threshold: intercept threshold, forwarded unchanged.
    :param intercept_dict: mapping feature -> intercept; features missing from it use 0.
                           It is only read, never modified.
    :param simplified: forwarded unchanged.
    :param feature_list_: optional subset of the columns of best_model_df to process; when
                          empty, every column is processed. It is only read, never modified.
    :return: dict mapping each processed feature to the dict returned by get_simplified_equation.
    """
    if len(feature_list_) == 0:
        feature_list = best_model_df.columns
    else:
        feature_list = deepcopy(feature_list_)
        assert set(feature_list) <= set(best_model_df.columns), \
            ("fit for some features missing from the best_model_df")

    result_dict = {}
    for feature in feature_list:
        result_dict[feature] = get_simplified_equation(
            best_model_df, feature,
            global_feature_list=global_feature_list,
            coef_threshold=coef_threshold,
            intercept_threshold=intercept_threshold,
            intercept=intercept_dict.get(feature, 0),
            simplified=simplified)

    return result_dict
|
|
384
|
+
|
|
385
|
+
def sympy_symb_to_feature_name(sympy_symb, library_feat_names):
    """
    Find the library feature name that corresponds to a sympy monomial.

    The string form of the monomial is rewritten with "^" for powers and split on "*"; the
    factors are then joined with single spaces in every possible order until one ordering is
    found among the library names.

    @param sympy_symb: sympy monomial (anything whose str() looks like "A**2*B").
    @param library_feat_names: container of feature names of the form "A^2 B".
    @return: the first matching feature name, or None when the monomial is the constant "1".
    @raise Exception: when no ordering of the factors is present in library_feat_names.
    """
    symb_str = str(sympy_symb).strip()
    if symb_str == "1":
        return None

    factors = symb_str.replace("**", "^").split("*")
    candidate_names = (" ".join(ordering) for ordering in permutations(factors))
    match = next((name for name in candidate_names if name in library_feat_names), None)
    if match is None:
        raise Exception("No feature corresponding to {} exist in the given library_df".format(sympy_symb))
    return match
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def construct_reduced_fit_list(full_feature_name_list, simplified_eqs,
                               sympy_format=False):
    """
    Collect, for every simplified equation, the monomials appearing on both of its sides.

    @param full_feature_name_list: library feature names used to translate the monomials
                                   when sympy_format is False.
    @param simplified_eqs: dict whose values are dicts with the keys "lhs" and "rhs"
                           holding sympy expressions.
    @param sympy_format: if True the monomials are returned as sympy expressions, otherwise
                         each is mapped through sympy_symb_to_feature_name.
    @return: list with one entry per equation; each entry lists the lhs monomials followed
             by the rhs monomials.
    """
    def _monomials_of(expression):
        # A side that cannot be turned into a polynomial contributes no monomials;
        # the failure is reported on stdout and processing continues.
        try:
            polynomial = Poly(expression)
            return [prod(x ** k for x, k in zip(polynomial.gens, mon))
                    for mon in polynomial.monoms()]
        except Exception as e:
            print("***Warning: exception occured while trying to find the monomials of {}: {}".format(expression, e))
            return []

    relation_list = []
    for simpl_eq in simplified_eqs.values():
        lhs, rhs = simpl_eq["lhs"], simpl_eq["rhs"]
        lhs_list = _monomials_of(lhs)
        rhs_list = _monomials_of(rhs)
        relation_list.append(lhs_list + rhs_list)

    if sympy_format:
        return relation_list

    return [[sympy_symb_to_feature_name(monomial, full_feature_name_list) for monomial in relation]
            for relation in relation_list]
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
def compare_models_(models_df_1, models_df_2, tol=1.e-5):
    """
    Utility function to compare the structure of two models. Note that models_df_1 and models_df_2
    should have the same column labels, index labels, and shape.

    A term counts as present when the absolute value of its coefficient exceeds tol.
    The input data frames are not modified.

    @param models_df_1: pd.DataFrame with columns = [LHS of model] index = terms in the RHS of model.
    @param models_df_2: pd.DataFrame with columns = [LHS of model] index = terms in the RHS of model.
    @param tol: tolerance that will be used for comparing model structure.
    @return: pd.DataFrame with the rows of the inputs plus one extra row labelled
             "# incosistent terms". In the original rows, 0 appears whenever the term structure
             matches between the two models, +1 when a term is present in models_df_1 and absent
             in models_df_2, and -1 when a term is absent in models_df_1 and present in
             models_df_2. The extra row counts the mismatching terms per column.
    """
    assert models_df_1.shape == models_df_2.shape, "both model dataframes should be of the same shape"
    assert all(models_df_1.columns == models_df_2.columns) and all(models_df_1.index == models_df_2.index)

    def _term_structure(models_df):
        # 1 where a term is present, 0 where it is absent. Both masks come from the original
        # magnitudes and are applied to a copy, so the caller's frame is left untouched.
        magnitude = abs(models_df)
        structure = models_df.copy()
        structure[magnitude > tol] = 1
        structure[magnitude <= tol] = 0
        return structure

    model_diff_df = _term_structure(models_df_1) - _term_structure(models_df_2)

    # The row label is kept exactly as published, since callers may index by it.
    model_diff_df.loc["# incosistent terms"] = abs(model_diff_df).sum()

    return model_diff_df
|
|
466
|
+
|
|
467
|
+
"""
|
|
468
|
+
------------------------------------------------------------------------------------
|
|
469
|
+
------------------------------------------------------------------------------------
|
|
470
|
+
"""
|
|
471
|
+
class PolyFeatureMatrix(BaseEstimator, TransformerMixin):
    """
    Generic class to create polynomial library terms. This class is a wrapper around
    sklearn's preprocessing.PolynomialFeatures class with support for pandas data frame.

    degree, interaction_only and include_bias are forwarded to PolynomialFeatures.
    output_df selects whether transform returns a pandas DataFrame (True) or the raw
    array produced by PolynomialFeatures (False).
    """
    def __init__(self, degree=2, interaction_only=False, include_bias=True, output_df=True):
        self.degree = degree
        self.interaction_only = interaction_only
        self.include_bias = include_bias
        self.output_df = output_df
        # Kept for backward compatibility with code that reads ``poly_feature`` before fit.
        self.poly_feature = self._make_poly_feature()

    def _make_poly_feature(self):
        # Build the wrapped transformer from the current parameter values.
        return PolynomialFeatures(degree=self.degree,
                                  interaction_only=self.interaction_only,
                                  include_bias=self.include_bias)

    def fit(self, X, y=None):
        """Fit the wrapped PolynomialFeatures on X and return self; y is ignored."""
        # Rebuild here so parameters changed after construction (e.g. through set_params
        # during a parameter search) are honoured instead of silently ignored.
        self.poly_feature = self._make_poly_feature()
        self.poly_feature.fit(X)
        return self

    def transform(self, X, y=None):
        """
        Expand X into polynomial features.

        Returns a pd.DataFrame whose columns are the names reported by the wrapped
        transformer's get_feature_names_out() when output_df is True, otherwise the
        transformed array itself. y is ignored.
        """
        poly_data_matrix = self.poly_feature.transform(X)
        if not self.output_df:
            return poly_data_matrix
        return pd.DataFrame(poly_data_matrix,
                            columns=self.poly_feature.get_feature_names_out())
|
|
496
|
+
|
|
497
|
+
"""
|
|
498
|
+
------------------------------------------------------------------------------------
|
|
499
|
+
------------------------------------------------------------------------------------
|
|
500
|
+
"""
|
|
501
|
+
class FeatureCouplingTransformer(TransformerMixin, BaseEstimator):
|
|
502
|
+
"""
|
|
503
|
+
Transformer class for generating features (candidate library functions) derived from coupling between features.
|
|
504
|
+
The coupling between features can either be implied from a sparsity matrix (preferred), or can be explicitly
|
|
505
|
+
provided to the constructor.
|
|
506
|
+
|
|
507
|
+
Coupling behavior of the features can be explicitly fed to the constructor using hte coupling_func argument. If no
|
|
508
|
+
coupling_func is provided, a second order interaction of the form feature_1*feature_2 is assumed.
|
|
509
|
+
|
|
510
|
+
Examples
|
|
511
|
+
---------
|
|
512
|
+
Case 1: No coupling_func is provided (so default interaction coupling is assumed)
|
|
513
|
+
|
|
514
|
+
data_matrix_ = pd.DataFrame([[1,2,3], [4,5,6]], columns = ["t", "x", "y"])
|
|
515
|
+
row = np.array([0, 0, 1, 1])
|
|
516
|
+
col = np.array([0, 2, 2, 1])
|
|
517
|
+
data = np.array([4, 5, 7, 5])
|
|
518
|
+
sparsity_matrix = coo_array((data, (row, col)))
|
|
519
|
+
coupling_transf = FeatureCouplingTransformer(sparsity_matrix)
|
|
520
|
+
transformed_features = coupling_transf.fit_transform(data_matrix_)
|
|
521
|
+
print(coupling_transf.get_get_feature_names_out())
|
|
522
|
+
output: array(['t*t', 't*y', 'x*y', 'x*x'], dtype=object)
|
|
523
|
+
|
|
524
|
+
Case 1: Coupling function is provided.
|
|
525
|
+
|
|
526
|
+
data_matrix_ = pd.DataFrame([[1,2,3], [4,5,6]], columns = ["t", "x", "y"])
|
|
527
|
+
row = np.array([0, 0, 1, 1])
|
|
528
|
+
col = np.array([0, 2, 2, 1])
|
|
529
|
+
data = np.array([4, 5, 7, 5])
|
|
530
|
+
sparsity_matrix = coo_array((data, (row, col)))
|
|
531
|
+
def coup_fun(x,y,i,j,k=0):
|
|
532
|
+
return x-y-k
|
|
533
|
+
coupling_transf = FeatureCouplingTransformer(sp_array_2,
|
|
534
|
+
coupling_func= coup_fun,
|
|
535
|
+
coupling_namer= lambda x,y,i,j,k : "{}-{}-{}".format(x,y,k),
|
|
536
|
+
coupling_func_args={"k":2})
|
|
537
|
+
transformed_features = coupling_transf.fit_transform(data_matrix_)
|
|
538
|
+
print(coupling_transf.get_get_feature_names_out())
|
|
539
|
+
array(['t-t-2', 't-y-2', 'x-y-2', 'x-x-2'], dtype=object)
|
|
540
|
+
|
|
541
|
+
"""
|
|
542
|
+
|
|
543
|
+
def __init__(self, sparsity_matrix=None, coupled_indices_list=None,
             coupling_func=None, coupling_namer=None,
             coupling_func_args=None, return_df=False):
    """
    Note that if coupled indices list is not explicitly given to the constructor, a valid sparsity matrix
    from which the coupled indices can be implied should be provided.

    @param sparsity_matrix: Sparsity matrix in the scipy.sparse.coo_array format (preferred over directly
                            providing coupled_indices_list)
    @param coupled_indices_list: List of tuples [(i,j)] which shows coupling between factors with indices i and j
    @param coupling_func: Custom function to define coupling between features. Note that the coupling_func function
                          should have arguments (feature_1_value,feature_2_value, i, j) as the first four arguments.
                          If not given, the interaction term feature_1*feature_2 is used.
    @param coupling_namer: Custom function to name the feature corresponding to each coupling. Note that the
                           coupling_namer function should have arguments (feature_1_value,feature_2_value, i, j)
                           as the first four arguments. If not given, names of the form
                           "feature_1*feature_2" are produced.
    @param coupling_func_args: optional dict of keyword arguments for the coupling_function and coupling_namer
                               functions; None (default) means no extra arguments.
    @param return_df: bool flag to output pandas DataFrame instead of numpy array. False by default
    """

    if not coupled_indices_list:
        assert isinstance(sparsity_matrix, coo_array), \
            "FeatureCouplingTransformer only supports sparsity matrices " \
            "in the scipy.sparse.coo_array format"
    self.sparsity_matrix = sparsity_matrix
    self.coupled_indices_list = coupled_indices_list

    if coupling_func:
        self.coupling_func = coupling_func
    else:
        # If coupling function is not given, it is defined as the interaction term feature_1*feature_2
        self.coupling_func = lambda x, y, i, j: x * y

    if coupling_namer:
        self.coupling_namer = coupling_namer
    else:
        self.coupling_namer = lambda feature_1, feature_2, i, j: "{}*{}".format(feature_1, feature_2)

    # A fresh dict per instance: a literal {} default would be shared by every transformer
    # constructed without this argument.
    self.coupling_func_args = {} if coupling_func_args is None else coupling_func_args
    self.return_df = return_df

    self.n_features_in_ = 0
    self.feature_names_in_ = None
|
|
584
|
+
|
|
585
|
+
def get_feature_names_out(self, input_features=None):
    """
    Build the output feature names, one per coupled index pair.

    @param input_features: Names of the input features.
                - If `input_features is None`, then `feature_names_in_` is used as
                  feature names in. If `feature_names_in_` is not defined, names of the
                  form `["x0", "x1", ..., "x(n_features_in_ - 1)"]` are generated.
                - If `input_features` is an array-like, then `input_features` must
                  match `feature_names_in_` if `feature_names_in_` is defined.
                (Both cases are resolved by the `_check_feature_names_in` helper.)
    @return: feature_names_out : ndarray of str objects
                One name per (i, j) pair in `coupled_indices_list`, produced by
                `coupling_namer(name_i, name_j, i, j, **coupling_func_args)`.
    """
    check_is_fitted(self)
    names_in = _check_feature_names_in(self, input_features)

    coupled_names = []
    for first, second in self.coupled_indices_list:
        # The namer receives both the feature names and their positional indices.
        coupled_names.append(
            self.coupling_namer(names_in[first], names_in[second], first, second,
                                **self.coupling_func_args)
        )

    return np.asarray(coupled_names, dtype=object)
|
|
608
|
+
|
|
609
|
+
def fit(self, X, y=None):
    """
    Record the input dimensionality and feature names, and resolve the coupled index pairs.

    @param X: Data of shape (n_samples, n_features). A pandas DataFrame supplies the feature
              names through its columns. Any other 2-D object exposing `.shape` (e.g. a numpy
              array) is also accepted; in that case `feature_names_in_` is set to None.
    @param y: Ignored. Present for API compatibility.
    @return: self

    If no `coupled_indices_list` was supplied, the (row, col) index pairs stored in
    `sparsity_matrix` are used as the coupled indices.
    """
    self.n_features_in_ = X.shape[1]

    # Only DataFrame-like inputs carry column names; plain arrays have no `.columns`.
    columns = getattr(X, "columns", None)
    if columns is not None and len(columns) > 0:
        self.feature_names_in_ = columns
    else:
        # Avoid keeping names from an earlier fit on differently labelled data.
        self.feature_names_in_ = None

    if not self.coupled_indices_list:  # sparsity matrix gives the coupling indices
        largest_index = max(self.sparsity_matrix.col.max(), self.sparsity_matrix.row.max())
        assert largest_index <= self.n_features_in_ - 1, \
            "sparsity matrix has indices out of bound of the number of features"
        # Extracting the indices that have coupling with each other.
        self.coupled_indices_list = list(zip(self.sparsity_matrix.row, self.sparsity_matrix.col))

    return self
|
|
621
|
+
|
|
622
|
+
def transform(self, X):
    """Transform data to output the coupled features

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The data to transform, row by row.

    Returns
    -------
    XP : ndarray of shape (n_samples, NS), or a pandas DataFrame when `return_df` is set
        The matrix of coupled features, where `NS` is the number of coupled index
        pairs, i.e. NS = len(self.get_feature_names_out()).
    """
    check_is_fitted(self)

    validated = self._validate_data(
        X, order="F", dtype=FLOAT_DTYPES, reset=False, accept_sparse=("csr", "csc")
    )
    # Work on the transpose so that indexing by i / j selects a feature column.
    feature_rows = validated.T

    coupled_rows = []
    for i, j in self.coupled_indices_list:
        coupled_rows.append(
            self.coupling_func(feature_rows[i], feature_rows[j], i, j, **self.coupling_func_args)
        )
    # One row per coupling was stacked; transpose back to (n_samples, NS).
    X_coupled = np.vstack(coupled_rows).T

    if not self.return_df:
        return X_coupled
    return pd.DataFrame(X_coupled, columns=self.get_feature_names_out())
|
|
648
|
+
|
|
649
|
+
"""
|
|
650
|
+
------------------------------------------------------------------------------------
|
|
651
|
+
------------------------------------------------------------------------------------
|
|
652
|
+
"""
|
|
653
|
+
|
|
654
|
+
class AlgModelFinder(BaseEstimator):
    """
    Class that helps with finding algebraic relationships between features (columns)
    of a data matrix. Each selected feature is regressed against a set of other columns.
        - Prebuilt model choices: lasso ("lasso"), ridge ("RR") and plain linear regression ("LR").
        - Can work with custom models that support .fit(), .score() and expose .coef_,
          .intercept_ and .feature_names_in_. Simply pass the custom model object to the
          constructor.
        - Choice to scale columns and scale back the fitted coefficients accordingly.
        - Selection of best 'n' models using different metrics: "r2" (score stored while
          fitting) and "mse" (computed on user supplied test data).
    """

    def __init__(self, model_id="lasso",
                 custom_model=False,
                 custom_model_ob=None,
                 alpha=0.1,
                 fit_intercept=False
                 ):
        """
        @param model_id: key of a prebuilt model: "lasso", "RR" or "LR". Ignored when
                         custom_model is True.
        @param custom_model: if True, custom_model_ob is used as the model.
        @param custom_model_ob: custom model object, required when custom_model is True.
        @param alpha: regularization strength passed to the prebuilt lasso / ridge models.
        @param fit_intercept: passed to the prebuilt models.
        """
        self.model_id_dict = {"lasso": linear_model.Lasso,
                              "RR": linear_model.Ridge,
                              "LR": linear_model.LinearRegression}
        if custom_model:
            # Compare against None: truth-testing an arbitrary estimator may call its __len__.
            assert custom_model_ob is not None, "custom_model_ob is needed when custom_model is True"
        else:
            assert (model_id in self.model_id_dict), \
                "model_id {} is not supported. Only {} is supported".format(model_id,
                                                                           set(self.model_id_dict))

        self.custom_model = custom_model
        self.custom_model_ob = custom_model_ob
        self.model_id = model_id
        self.fit_intercept = fit_intercept
        self.alpha = alpha
        self.is_fit = False
        # {feature: R2_score} obtained from fitting each feature against rest
        self.r2_score_dict = {}
        self.__fitted_models = {}
        self.__fitted_model_intercepts = {}
        self.fitted_models_unscaled = {}
        self.column_scaled = False
        self.column_scales = None

        # Models whose constructor does not take a regularization strength.
        no_contraint_models = {"LR"}
        if custom_model:
            self.model = custom_model_ob
        elif self.model_id in no_contraint_models:
            # Instantiating the model object
            self.model = self.model_id_dict[self.model_id](fit_intercept=self.fit_intercept)
        else:
            self.model = self.model_id_dict[self.model_id](alpha=self.alpha,
                                                           fit_intercept=self.fit_intercept)

    def fit_and_score(self, feature_, X_scaled_, feature_to_library_map_, model=None):
        """
        Fit one feature against its library terms and score the fit on the same data.

        @param feature_: name of the column used as regression target.
        @param X_scaled_: pd.DataFrame holding the target and all library columns.
        @param feature_to_library_map_: dict {feature: columns to regress the feature on}.
        @param model: estimator to fit. Defaults to self.model; a separate copy is passed
                      when several fits run concurrently so they do not share state.
        @return: (coefficients, intercept, score) where coefficients is a dict
                 {library term: coefficient}.
        """
        if model is None:
            model = self.model
        possible_library_terms = feature_to_library_map_[feature_]
        X_features = X_scaled_[possible_library_terms]
        y_target = X_scaled_[feature_]

        model.fit(X=X_features, y=y_target)
        coefficients = dict(zip(model.feature_names_in_, model.coef_))
        intercept = model.intercept_
        score = model.score(X_features, y_target)
        return coefficients, intercept, score

    def fit(self,
            X,
            y=None,
            scale_columns=False,
            center_mean=False,
            features_to_fit=None,
            feature_to_library_map_=None,
            coupling_matrix=None,
            parallelize=False,
            num_cpu=4
            ):
        """
        Fit every requested feature against its candidate library terms.

        @param X: Data matrix as a pandas DataFrame (the column labels are used throughout),
                  where each column represents one feature from the candidate library.
        @param y: Unused. Present for API compatibility.
        @param scale_columns: divide the columns by their std to get unit variance columns.
        @param center_mean: also centre the columns (only used together with scale_columns).
        @param features_to_fit: list of features to fit against the library terms. All the
                  columns of X are fitted when None or empty.
        @param feature_to_library_map_: optional dict {feature: library terms to fit it
                  against}. Features missing from the map are fitted against every other column.
        @param coupling_matrix: Unused.
        @param parallelize: if True, the individual fits are dispatched through joblib.
        @param num_cpu: number of joblib jobs used when parallelize is True.
        @return: self
        """
        if self.fit_intercept:
            assert "1" not in X, ("Constant column should not be part of the data set if fit_intercept "
                                  "is set to True")
        # The input map is copied because entries are added to it below.
        feature_to_library_map = deepcopy(feature_to_library_map_) if feature_to_library_map_ else {}

        # Reset all state from a previous fit, including the scaling bookkeeping; the
        # estimator only counts as fitted once everything below has succeeded.
        self.is_fit = False
        self.__fitted_models = {}
        self.__fitted_model_intercepts = {}
        self.r2_score_dict = {}
        self.column_scaled = False
        self.column_scales = None

        if scale_columns:
            s_scaler = StandardScaler(with_std=scale_columns, with_mean=center_mean)
            X_scaled = pd.DataFrame(s_scaler.fit_transform(X), columns=s_scaler.feature_names_in_)
            # Making sure constant term is not removed after mean centering to zero
            if center_mean and '1' in X_scaled:
                X_scaled['1'] = 1
            self.column_scaled = True
            # Store the factors the scaler actually divided by, so that scaling the
            # coefficients and intercepts back is exactly consistent with the transform.
            self.column_scales = pd.Series(s_scaler.scale_, index=s_scaler.feature_names_in_)
            # To avoid division by zero during the scaling step.
            self.column_scales['1'] = 1
            # NOTE(review): with center_mean=True the column means are not added back when
            # coefficients/intercepts are rescaled - confirm whether this is intended.
        else:
            X_scaled = X

        # Explicit None/empty test: truth-testing a pandas Index or numpy array raises.
        if features_to_fit is None or len(features_to_fit) == 0:
            features_to_fit = X_scaled.columns

        for feature in features_to_fit:
            # If feature to library map is not given, all the members of the universal
            # candidate library will be fit against the feature.
            if feature not in feature_to_library_map:
                possible_library_terms = X_scaled.columns.drop(feature, errors='ignore')
            else:
                possible_library_terms = feature_to_library_map[feature]
                assert set(possible_library_terms) <= set(X_scaled.columns), \
                    ("library terms for feature {} from feature_to_library_map is not found "
                     "in the universal X library".format(feature))
            feature_to_library_map[feature] = possible_library_terms

        if parallelize:
            # The workers share memory, so every task gets its own copy of the estimator;
            # otherwise one task could read coefficients written by another task's fit.
            combined_fit_results_list = Parallel(n_jobs=num_cpu, require='sharedmem')(
                delayed(self.fit_and_score)(feature, X_scaled, feature_to_library_map,
                                            deepcopy(self.model))
                for feature in features_to_fit)
            combined_fit_results = dict(zip(features_to_fit, combined_fit_results_list))
        else:
            combined_fit_results = {feature: self.fit_and_score(feature, X_scaled, feature_to_library_map)
                                    for feature in features_to_fit}

        # Extracting separate dictionaries for coefficients, intercepts, and R² scores
        self.__fitted_models = {feature: result[0] for feature, result in combined_fit_results.items()}
        self.__fitted_model_intercepts = {feature: result[1] for feature, result in combined_fit_results.items()}
        r_2_dict_unsorted = {feature: result[2] for feature, result in combined_fit_results.items()}
        # Stored in ascending order of score.
        self.r2_score_dict = dict(sorted(r_2_dict_unsorted.items(), key=operator.itemgetter(1)))

        self.is_fit = True
        return self

    def best_models(self, num=0, X_test=None, metric="r2",
                    scale_coef=True):
        """
        Return the best fitted models as a DataFrame (one column per model, one row per
        library term, plus a final row holding the metric value).

        @param num: number of models to return. All the fitted models are returned if num < 1.
        @param X_test: pd.DataFrame of test data. Required for metric "mse"; not used for "r2",
                       where the scores stored during fitting are used for selection.
        @param metric: "r2" (higher is better) or "mse" (lower is better).
        @param scale_coef: if True, coefficients are scaled back to reflect the column scaling
                           applied during fitting.
        """
        assert self.is_fit, "Models need to be fit to data first"
        if num < 1:  # Output all possible models
            num = len(self.__fitted_models)
        metric_set = {"r2", "mse"}
        assert metric in metric_set, "metric {} is not supported. Only {} is supported".format(metric, metric_set)

        if metric == "r2":  # Use the already computed r_2 scores for selection
            sorted_r2_dict = dict(sorted(self.r2_score_dict.items(), key=operator.itemgetter(1), reverse=True))
            sorted_metric_series = pd.Series(sorted_r2_dict)
        else:
            assert isinstance(X_test, pd.DataFrame) and len(X_test) > 0, \
                "Test data test needed for calculating mse"
            predicted_df = self.predict_features(X_test=X_test,
                                                 feature_list=list(self.__fitted_models.keys()),
                                                 scale_coef=scale_coef)
            # Compare only the fitted features, so columns without a model can never be
            # selected; a fitted feature missing from X_test yields NaN and sorts last.
            actual_df = X_test.reindex(columns=predicted_df.columns)
            mse_series = ((predicted_df - actual_df) ** 2).mean()
            sorted_metric_series = mse_series.sort_values(na_position='last')

        top_metrics = sorted_metric_series.iloc[:num]
        fitted_models = self.get_fitted_models(scale_coef=scale_coef)
        best_model_dict = {feature: fitted_models[feature] for feature in top_metrics.index}
        best_model_df = pd.DataFrame(best_model_dict)
        metric_label = metric + "- metric"
        best_model_df.loc[metric_label] = dict(top_metrics.items())
        return best_model_df

    def get_fitted_models(self, scale_coef=True):
        """
        Return {feature: {library term: coefficient}} for all the fitted features.

        For a column scaled data matrix (and scale_coef=True), the coefficients for
        lhs = Summation(coef * term) are scaled back as coef * (scale_of_lhs / scale_of_term).
        """
        assert self.is_fit, "Models need to be fit to data first"
        if not (scale_coef and self.column_scaled):
            return self.__fitted_models

        return {
            lib_term: {term: coef * (self.column_scales[lib_term] / self.column_scales[term])
                       for term, coef in model_coefs.items()}
            for lib_term, model_coefs in self.__fitted_models.items()
        }

    def get_fitted_intercepts(self, scale_coef=True):
        """
        Return {feature: intercept} for all the fitted features.

        For a column scaled data matrix (and scale_coef=True), the intercept is scaled back
        as scale_of_lhs * intercept.
        """
        assert self.is_fit, "Models need to be fit to data first"
        if not (scale_coef and self.column_scaled):
            return self.__fitted_model_intercepts

        return {lib_term: intercept_ * self.column_scales[lib_term]
                for lib_term, intercept_ in self.__fitted_model_intercepts.items()}

    def predict_features(self, X_test, feature_list, scale_coef=True):
        """
        Function to predict the value of each feature in feature_list, where each feature is a
        linear function of columns of X_test.

        @param X_test: Data matrix as a pd.DataFrame containing the library columns of every
                       requested feature.
        @param feature_list: list of features to be predicted. eg. ["E", "ES"]
        @param scale_coef: if True, coefficients and intercepts are scaled back to reflect the
                           initial column scaling of data during fitting.
        @return: pd.DataFrame indexed like X_test with one column per feature in feature_list.
        """
        assert self.is_fit, "Models need to be fit to data first"
        features = list(feature_list)
        assert set(features) <= set(self.__fitted_models.keys()), ("Feature list should be a subset"
                                                                   " of features initially fitted")
        fitted_models = self.get_fitted_models(scale_coef=scale_coef)
        # Intercepts must follow the same scaling choice as the coefficients.
        fitted_intercepts = self.get_fitted_intercepts(scale_coef=scale_coef)

        predictions = {}
        for feature in features:
            coef_features = fitted_models[feature]
            assert set(coef_features.keys()) <= set(X_test.columns), (
                "Data matrix X_test doesnot have all the feature columns"
                "required for fitting feature {}".format(feature))
            predictions[feature] = sum(coef_value * X_test[coef_feat]
                                       for coef_feat, coef_value in coef_features.items()
                                       ) + fitted_intercepts[feature]

        return pd.DataFrame(predictions, index=X_test.index, columns=features)

    def compare_models(self, true_model_df):
        """
        Method to compare the accuracy of fitted models with true model structure. This method calls the
        utility function compare_models_(self.best_models(), true_model_df) to compare the best models
        after fitting with the true model structure. The true model dataframe should have the same column labels,
        index labels, and shape as the models from self.best_models() .

        @param true_model_df: pd.DataFrame with columns = [LHS of model] index = terms in the RHS of model.
        """
        assert self.is_fit, "Models need to be fit to data first"
        return compare_models_(self.best_models(), true_model_df)
|
|
911
|
+
|
|
912
|
+
|
|
913
|
+
|
|
914
|
+
"""
|
|
915
|
+
------------------------------------------------------------------------------------
|
|
916
|
+
------------------------------------------------------------------------------------
|
|
917
|
+
"""
|
|
918
|
+
class sequentialThLin(MultiOutputMixin, RegressorMixin):
|
|
919
|
+
"""
|
|
920
|
+
Model-agnostic implementation of sequential thresholdng to impose l0 sparsity.
|
|
921
|
+
Current support for popular models like linear model with l1 and l2 regularizers, and their combination (ElasticNet). Also has the feature to pass in custom models from the user.
|
|
922
|
+
"""
|
|
923
|
+
|
|
924
|
+
def __init__(
        self,
        model_id="RR",
        custom_model=False,
        custom_model_ob=None,
        custom_model_arg=None,
        alpha=1.0,
        l1_ratio=0.5,
        coef_threshold=0.1,
        fit_intercept=False,
        precompute=False,
        max_iter_thresh=500,
        max_iter_optimizer=1000,
        copy_X=True,
        tol=1e-4,
        warm_start=False,
        positive=False,
        random_state=None,
        selection="cyclic",
):
    """
    @param model_id: key of the prebuilt model: "lasso", "RR", "LR" or "EN".
    @param custom_model: if True, the model is built as custom_model_ob(**custom_model_arg).
    @param custom_model_ob: callable (e.g. a model class) returning a custom model.
    @param custom_model_arg: dict of keyword arguments for custom_model_ob (may be empty).
    @param coef_threshold: coefficients with absolute value below this are set to zero.
    @param max_iter_thresh: maximum number of thresholding iterations.
    @param max_iter_optimizer: passed as `max_iter` to the prebuilt models.
    The remaining arguments (alpha, l1_ratio, fit_intercept, precompute, copy_X, tol, warm_start,
    positive, random_state, selection) are forwarded to the prebuilt model where it accepts them.
    """
    self.model_id = model_id
    self.custom_model = custom_model
    self.custom_model_ob = custom_model_ob
    self.custom_model_arg = custom_model_arg
    if custom_model:
        # Compare against None so that an empty kwargs dict is a valid custom_model_arg.
        assert custom_model_ob is not None, "custom_model_ob is needed when custom_model is True"
        assert custom_model_arg is not None, "custom_model_arg is needed when custom_model is True"
    self.coef_threshold = coef_threshold
    self.max_iter_thresh = max_iter_thresh
    self.max_iter_optimizer = max_iter_optimizer
    # Plain values (no trailing commas, which would silently store 1-tuples).
    self.alpha = alpha
    self.l1_ratio = l1_ratio
    self.fit_intercept = fit_intercept
    self.precompute = precompute
    self.copy_X = copy_X
    self.tol = tol
    self.warm_start = warm_start
    self.positive = positive
    self.random_state = random_state
    self.selection = selection

    # Keyword arguments for the prebuilt models, taken from the constructor arguments.
    self.input_arg_dict = {"alpha": alpha,
                           "l1_ratio": l1_ratio,
                           "fit_intercept": fit_intercept,
                           "precompute": precompute,
                           "max_iter": max_iter_optimizer,
                           "copy_X": copy_X,
                           "tol": tol,
                           "warm_start": warm_start,
                           "positive": positive,
                           "random_state": random_state,
                           "selection": selection}

    self.model_id_dict = {"lasso": linear_model.Lasso,
                          "RR": linear_model.Ridge,
                          "LR": linear_model.LinearRegression,
                          "EN": linear_model.ElasticNet}
    assert (model_id in self.model_id_dict)

    no_constrain_model = {"LR"}
    elastic_models = {"EN"}

    # Instantiating model objects. Two independent instances are created: one used for
    # the thresholding iterations and one reserved for scoring.
    if self.custom_model:
        self.model = custom_model_ob(**self.custom_model_arg)
        self.model_for_score = custom_model_ob(**self.custom_model_arg)
    elif self.model_id in no_constrain_model:
        use_intercept = bool(fit_intercept)
        self.model = self.model_id_dict[self.model_id](fit_intercept=use_intercept)
        self.model_for_score = self.model_id_dict[self.model_id](fit_intercept=use_intercept)
    else:
        arg_input = dict(self.input_arg_dict)
        if self.model_id not in elastic_models:
            # l1_ratio only exists for the elastic net.
            del arg_input["l1_ratio"]
        if self.model_id == "RR":
            # Ridge does not take the coordinate-descent specific options.
            del arg_input["precompute"], arg_input["warm_start"], arg_input["selection"]
        # Keep the stored dict in sync with what the model constructor actually accepts.
        self.input_arg_dict = arg_input
        self.model = self.model_id_dict[self.model_id](**arg_input)
        self.model_for_score = self.model_id_dict[self.model_id](**arg_input)

    self.is_fit = False

    self.coef_history_df = pd.DataFrame()
    self.coef_history_df_pre_thesh = pd.DataFrame()
    self.intercept_history_df = pd.DataFrame()

    self.coef_ = None
    self.feature_names_in_ = None
    self.intercept_ = 0.0
|
|
1015
|
+
|
|
1016
|
+
def fit(self, X, y=None, solver="auto"):
    """
    Fit by sequential thresholding: fit the model, zero the small coefficients, drop the
    corresponding columns and refit until no further column is dropped.

    @param X: data matrix exposing `.columns` and column selection (e.g. a pd.DataFrame).
    @param y: regression target passed to the underlying model.
    @param solver: not used by this method.
    @return: self

    The per-iteration coefficients (before and after thresholding) and intercepts are kept
    in coef_history_df_pre_thesh, coef_history_df and intercept_history_df.
    """
    self.is_fit = True
    self.coef_history_df = pd.DataFrame(columns=X.columns)
    self.coef_history_df_pre_thesh = pd.DataFrame(columns=X.columns)
    self.intercept_history_df = pd.DataFrame(columns=["1"])

    active_columns = X.columns
    X_active = X[active_columns]
    for iteration in range(self.max_iter_thresh):
        self.model.fit(X=X_active, y=y)
        coefs = self.model.coef_
        names = self.model.feature_names_in_
        self.coef_history_df_pre_thesh.loc[iteration] = dict(zip(names, self.model.coef_))

        # Thresholding is done in place on the array obtained from the model.
        below_threshold = abs(coefs) < self.coef_threshold
        coefs[below_threshold] = 0.0
        self.coef_history_df.loc[iteration] = dict(zip(names, coefs))
        self.intercept_history_df.loc[iteration] = {"1": self.model.intercept_}

        active_columns = active_columns[~below_threshold]
        if all(below_threshold):  # If all the coef go to zero after thresholding
            warnings.warn("All coefficients fell below threshold {}, please"
                          " lower threshold".format(self.coef_threshold))
            break

        # Converged once thresholding no longer removes any column.
        if set(X_active.columns) == set(active_columns):
            print("Sequential threshold converged in {} iterations".format(iteration))
            break

        X_active = X[active_columns]

    # Columns dropped in earlier iterations are NaN in the last history row.
    last_row = self.coef_history_df.iloc[-1]
    self.coef_ = last_row.fillna(0.0).values
    self.intercept_ = self.intercept_history_df.iloc[-1]["1"]
    self.feature_names_in_ = np.array(X.columns)

    return self
|
|
1057
|
+
|
|
1058
|
+
def score(self, X, y, sample_weight=None):
    """
    Score the sparse model structure found by fit.

    The scoring model is refitted on (X, y) restricted to the features that survived the
    thresholding, and its own score on the same data is returned.

    @param X: data matrix supporting column selection by name (e.g. a pd.DataFrame).
    @param y: regression target.
    @param sample_weight: accepted for API compatibility; not used.
    @return: score of the refitted model, or 0 when no feature survived.
    """
    assert self.is_fit, "Model needs to be fit to data first"
    last_coefs = self.coef_history_df.iloc[-1]
    # A feature survives only if it was still in the model (not NaN) and its coefficient
    # was not set to zero by the final thresholding step.
    survived = last_coefs.notna() & (last_coefs != 0)
    final_features = last_coefs[survived].index
    if len(final_features) == 0:
        return 0

    X_final = X[final_features]
    self.model_for_score.fit(X=X_final, y=y)
    return self.model_for_score.score(X=X_final, y=y)
|