tensorzinb-plusplus 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tensorzinb/__init__.py +0 -0
- tensorzinb/lrtest.py +176 -0
- tensorzinb/tensorzinb.py +611 -0
- tensorzinb/utils.py +62 -0
- tensorzinb_plusplus-0.0.2.dist-info/METADATA +218 -0
- tensorzinb_plusplus-0.0.2.dist-info/RECORD +8 -0
- tensorzinb_plusplus-0.0.2.dist-info/WHEEL +4 -0
- tensorzinb_plusplus-0.0.2.dist-info/licenses/LICENSE +13 -0
tensorzinb/__init__.py
ADDED
|
File without changes
|
tensorzinb/lrtest.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
from patsy import dmatrices
|
|
4
|
+
from scipy import stats
|
|
5
|
+
from .tensorzinb import TensorZINB
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class LRTest:
    """Likelihood-ratio test of the `conditions` features using nested ZINB/NB fits.

    Fits a null model (features without `conditions`) and an alternative model
    (features plus `conditions`) with TensorZINB, then computes per-test
    chi-square p-values from twice the log-likelihood difference.

    Parameters
    ----------
    df_data : DataFrame
        Response counts; one column per test (e.g. per gene).
    df_feature : DataFrame
        Per-sample covariates referenced by the feature name lists.
    conditions : iterable of str
        Feature names whose effect is being tested.
    nb_features : iterable of str
        NB-mean features (condition names are removed from this set).
    nb_features_c, infl_features, infl_features_c : optional iterables of str
        Shared-weight NB features, inflation features, shared inflation features.
    add_intercept : bool
        Whether patsy adds an intercept column to the design matrices.
    nb_only : bool
        Fit plain NB (no zero-inflation); forced True when no inflation
        features are given.
    same_dispersion : bool
        Share one dispersion parameter across all outputs.
    """

    def __init__(
        self,
        df_data,
        df_feature,
        conditions,
        nb_features,
        nb_features_c=None,
        infl_features=None,
        infl_features_c=None,
        add_intercept=True,
        nb_only=False,
        same_dispersion=False,
    ):
        self.df_data = df_data
        self.df_feature = df_feature
        self.conditions = list(conditions)
        # condition features must not also appear in the null-model feature set
        self.nb_features = list(set(nb_features) - set(conditions))
        self.nb_features_c = nb_features_c
        if nb_features_c is None or len(nb_features_c) == 0:
            self._no_nb_c = True
        else:
            self._no_nb_c = False

        if infl_features is not None and len(infl_features) > 0:
            self.infl_features = list(set(infl_features) - set(conditions))
            self._no_infl = False
        else:
            self.infl_features = None
            self._no_infl = True

        self.infl_features_c = infl_features_c
        if infl_features_c is None or len(infl_features_c) == 0:
            self._no_infl_c = True
        else:
            self._no_infl_c = False

        self.nb_only = nb_only
        # with no inflation features at all, a ZINB fit degenerates to NB
        if self._no_infl_c and self._no_infl:
            self.nb_only = True

        self.add_intercept = add_intercept
        self.same_dispersion = same_dispersion

        self._gen_feature_dfs()

        self.res0 = None
        self.res1 = None
        self.df_result = None

    def _get_feature_idx_map(self, Xs):
        """Map column positions of the null design (Xs[0]) into the full design
        (Xs[1]) by feature name; returns [idx_in_full, idx_in_null]."""
        df_maps = []
        for X in Xs:
            df_f = pd.DataFrame(X.columns, columns=["feature"])
            df_f["idx"] = range(len(df_f))
            df_maps.append(df_f)
        dft = pd.merge(df_maps[1], df_maps[0], on="feature")
        idx_map = [dft.idx_x.values, dft.idx_y.values]
        return idx_map

    def _gen_feature_df(self, df_feature, features, add_intercept=True):
        """Build a patsy design-matrix DataFrame for `features`.

        The LHS "1" is a dummy outcome; only the RHS (predictors) is used.
        """
        if add_intercept:
            formula = "{} ~ {}".format("1", " + ".join(features))
        else:
            formula = "{} ~ {} - 1".format("1", " + ".join(features))

        _, predictors = dmatrices(formula, df_feature, return_type="dataframe")
        return predictors

    def _gen_feature_dfs(self):
        """Materialize the null/alternative design matrices and index maps for
        both the NB-mean and inflation components."""
        # Xs[0]: null design (no conditions); Xs[1]: alternative design
        fs = [self.nb_features, self.nb_features + self.conditions]
        Xs = []
        dfs = []
        for feature in fs:
            df = self._gen_feature_df(
                self.df_feature, feature, add_intercept=self.add_intercept
            )
            dfs.append(df)
            Xs.append(df.values)
        self.Xs = Xs
        self.X_idx_map = self._get_feature_idx_map(dfs)

        if self._no_nb_c:
            self.X_c = None
        else:
            # shared-weight features never get an intercept of their own
            self.X_c = self._gen_feature_df(
                self.df_feature, self.nb_features_c, add_intercept=False
            ).values

        if not self.nb_only and not self._no_infl:
            fs = [self.infl_features, self.infl_features + self.conditions]
            X_infls = []
            dfs = []
            for feature in fs:
                df = self._gen_feature_df(
                    self.df_feature, feature, add_intercept=self.add_intercept
                )
                dfs.append(df)
                X_infls.append(df.values)
            self.X_infls = X_infls
            self.X_infl_idx_map = self._get_feature_idx_map(dfs)
        else:
            self.X_infls = [None, None]
            self.X_infl_idx_map = None

        if self._no_infl_c:
            self.X_infl_c = None
        else:
            self.X_infl_c = self._gen_feature_df(
                self.df_feature, self.infl_features_c, add_intercept=False
            ).values

    def run(self, learning_rate=0.008, epochs=5000):
        """Fit the null and alternative models and return a per-test result
        DataFrame (llf/aic/df for both fits, their differences, and the
        chi-square LR p-value).

        The alternative fit is warm-started from the null fit's weights,
        remapped into the larger design via the feature index maps.
        """
        zinb0 = TensorZINB(
            self.df_data.values,
            self.Xs[0],
            exog_c=self.X_c,
            exog_infl=self.X_infls[0],
            exog_infl_c=self.X_infl_c,
            same_dispersion=self.same_dispersion,
            nb_only=self.nb_only,
        )

        res0 = zinb0.fit(learning_rate=learning_rate, epochs=epochs)
        self.res0 = res0

        zinb1 = TensorZINB(
            self.df_data.values,
            self.Xs[1],
            exog_c=self.X_c,
            exog_infl=self.X_infls[1],
            exog_infl_c=self.X_infl_c,
            same_dispersion=self.same_dispersion,
            nb_only=self.nb_only,
        )

        # NOTE(review): `weights` aliases res0["weights"]; the remapped
        # x_mu/x_pi below overwrite self.res0's stored weights in place.
        weights = res0["weights"]
        if "x_mu" in weights:
            # embed null-model coefficients into the alternative design;
            # condition columns start at zero
            x_mu = np.zeros((zinb1.k_exog, zinb1.num_out))
            x_mu[self.X_idx_map[0], :] = weights["x_mu"][self.X_idx_map[1], :]
            weights["x_mu"] = x_mu

        if "x_pi" in weights:
            x_pi = np.zeros((zinb1.k_exog_infl, zinb1.num_out))
            x_pi[self.X_infl_idx_map[0], :] = weights["x_pi"][self.X_infl_idx_map[1], :]
            weights["x_pi"] = x_pi

        res1 = zinb1.fit(init_weights=weights, learning_rate=learning_rate, epochs=epochs)
        self.res1 = res1

        dfr = pd.DataFrame(self.df_data.columns, columns=["test"])
        dfr["llf0"] = res0["llfs"]
        dfr["aic0"] = res0["aics"]
        dfr["df0"] = res0["df"]
        # total fit time amortized evenly over the tests
        dfr["cpu_time0"] = res0["cpu_time"] / len(dfr)
        dfr["llf1"] = res1["llfs"]
        dfr["aic1"] = res1["aics"]
        dfr["df1"] = res1["df"]
        dfr["cpu_time1"] = res1["cpu_time"] / len(dfr)

        dfr["llfd"] = dfr["llf1"] - dfr["llf0"]
        dfr["aicd"] = dfr["aic1"] - dfr["aic0"]
        dfd = dfr["df1"] - dfr["df0"]
        # classic LR test: 2*(llf1 - llf0) ~ chi2(df1 - df0) under the null
        dfr["pvalue"] = 1 - stats.chi2.cdf(2 * dfr["llfd"], dfd)

        self.df_result = dfr

        return dfr
|
tensorzinb/tensorzinb.py
ADDED
|
@@ -0,0 +1,611 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
import time
|
|
3
|
+
|
|
4
|
+
import contextlib
|
|
5
|
+
import os
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
|
|
9
|
+
os.environ["TF_USE_LEGACY_KERAS"] = "1"
|
|
10
|
+
|
|
11
|
+
import tensorflow as tf
|
|
12
|
+
|
|
13
|
+
tf.get_logger().setLevel("ERROR")
|
|
14
|
+
tf.autograph.set_verbosity(0)
|
|
15
|
+
|
|
16
|
+
import tf_keras
|
|
17
|
+
from tf_keras.models import Model
|
|
18
|
+
from tf_keras.layers import Lambda, Input, Dense, RepeatVector, Reshape, Add
|
|
19
|
+
from tf_keras.callbacks import EarlyStopping, ReduceLROnPlateau
|
|
20
|
+
from tf_keras.optimizers.legacy import RMSprop
|
|
21
|
+
from tf_keras import backend as K
|
|
22
|
+
from scipy.special import gammaln
|
|
23
|
+
import statsmodels.api as sm
|
|
24
|
+
|
|
25
|
+
# Reset Keras Session
def reset_keras():
    """Clear the global Keras graph/session state so repeated fits in one
    process do not accumulate stale layers."""
    K.clear_session()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ZINBLogLik:
    """Negative log-likelihood of a (zero-inflated) negative binomial model,
    usable as a Keras loss.

    Parameters
    ----------
    pi : tensor or None
        Zero-inflation logits; ignored when `nb_only` is True.
    log_theta : tensor
        Log of the NB dispersion parameter(s).
    nb_only : bool
        If True, compute a plain NB likelihood (no zero-inflation mixture).
    scope : str
        TF name scope for the loss ops.
    zero_threshold : float
        Observations below this value are treated as zeros.

    After the loss has been built into a graph, `self.llf` holds the per-output
    mean negative log-likelihood (up to the constant lgamma(y+1) term) and
    `self.y` the cast targets — both are used by `TensorZINB.fit` to read the
    fitted log-likelihood back out via `K.function`.
    """

    def __init__(
        self, pi=None, log_theta=None, nb_only=False, scope="zinb/", zero_threshold=1e-8
    ):
        self.pi = pi
        self.zero_threshold = zero_threshold
        self.scope = scope
        self.log_theta = log_theta
        self.nb_only = nb_only
        self.y = None
        self.llf = None

    def loss(self, y_true, y_pred):
        """Return the scalar training loss (sum over outputs of the mean
        negative log-likelihood; the data-only lgamma(y+1) constant is omitted
        and added back later on the numpy side)."""
        with tf.name_scope(self.scope):
            y = tf.cast(y_true, tf.float32)
            # mu is already in log
            mu = tf.cast(y_pred, tf.float32)
            log_theta = self.log_theta
            theta = tf.math.exp(log_theta)

            # NB log-pmf pieces (excluding -lgamma(y+1)):
            t1 = tf.math.lgamma(y + theta)
            t2 = -tf.math.lgamma(theta)
            t3 = theta * log_theta
            t4 = y * mu
            # ty = log(theta + exp(mu)), computed stably via logsumexp
            ty = tf.reduce_logsumexp(tf.stack([log_theta, mu], axis=0), axis=0)
            t5 = -(theta + y) * ty

            if self.nb_only:
                result = -(t1 + t2 + t3 + t4 + t5)
            else:
                # log_q0 = log sigmoid(pi) ; log_q1 = log(1 - sigmoid(pi))
                log_q0 = -tf.nn.softplus(-self.pi)
                # log_q1 = -tf.nn.softplus(self.pi) = -tf.nn.softplus(-self.pi) - self.pi
                log_q1 = log_q0 - self.pi

                nb_case = -(t1 + t2 + t3 + t4 + t5 + log_q1)

                # zero observations mix the structural-zero mass with NB P(0)
                p1 = theta * (log_theta - ty) + log_q1
                zero_case = -tf.reduce_logsumexp(tf.stack([log_q0, p1], axis=0), axis=0)
                result = tf.where(tf.less(y, self.zero_threshold), zero_case, nb_case)
            # stash per-output mean NLL and targets for later llf retrieval
            self.llf = tf.reduce_mean(result, axis=0)
            self.y = y
            result = tf.reduce_sum(self.llf)

        return result
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class PredictionCallback(tf_keras.callbacks.Callback):
    """Keras callback that records a snapshot of all model weights, flattened
    into a single 1-D vector, at the end of every epoch.

    After training, `self.weights` is a list with one vector per epoch
    (consumed by `TensorZINB.fit` when `return_history=True`).
    """

    # Fix: the original used the mutable default `logs={}`, a Python
    # anti-pattern; Keras callbacks conventionally take `logs=None`.
    def on_train_begin(self, logs=None):
        # start a fresh history for each training run
        self.weights = []

    def on_epoch_end(self, epoch, logs=None):
        # flatten every weight array and concatenate into one vector
        ws = np.concatenate(
            [np.asarray(w).flatten() for w in self.model.get_weights()]
        )
        self.weights.append(ws)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class ReduceLROnPlateauSkip(ReduceLROnPlateau):
    """`ReduceLROnPlateau` variant that ignores the first `num_epoch_skip`
    epochs, so early large loss swings do not trigger a premature LR drop.

    All other parameters are forwarded unchanged to `ReduceLROnPlateau`.
    """

    def __init__(
        self,
        monitor="val_loss",
        factor=0.1,
        patience=10,
        verbose=0,
        mode="auto",
        min_delta=1e-4,
        cooldown=0,
        min_lr=0,
        num_epoch_skip=3,
        **kwargs,
    ):
        # number of initial epochs during which plateau detection is disabled
        self.num_epoch_skip = num_epoch_skip

        super(ReduceLROnPlateauSkip, self).__init__(
            monitor=monitor,
            factor=factor,
            patience=patience,
            verbose=verbose,
            mode=mode,
            min_delta=min_delta,
            cooldown=cooldown,
            min_lr=min_lr,
            **kwargs,
        )

    def on_epoch_end(self, epoch, logs=None):
        # skip plateau bookkeeping entirely for the first few epochs
        if epoch < self.num_epoch_skip:
            return
        super(ReduceLROnPlateauSkip, self).on_epoch_end(epoch, logs)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class TensorZINB:
    """Zero-inflated negative binomial (ZINB) / NB regression solved with
    TensorFlow, fitting many outputs (e.g. genes) in one batched model.

    Parameters
    ----------
    endog : ndarray (num_sample,) or (num_sample, num_out)
        Response counts; a 1-D array is reshaped to one output column.
    exog : ndarray
        NB-mean design matrix (per-output weights).
    exog_c : ndarray, optional
        NB-mean design matrix with a single weight shared across outputs.
    exog_infl : ndarray, optional
        Zero-inflation (logit) design matrix, per-output weights.
    exog_infl_c : ndarray, optional
        Shared-weight zero-inflation design matrix.
    same_dispersion : bool
        If True, a single dispersion parameter is shared across outputs.
    nb_only : bool
        Fit plain NB; forced True when no inflation design is supplied.

    Attributes of note: `df_model` (total model degrees of freedom across all
    outputs), `df_model_each` (df per output), and `k_disperson`
    (NOTE(review): typo for "dispersion", kept because it is a public
    attribute other code reads).
    """

    def __init__(
        self,
        endog,
        exog,
        exog_c=None,
        exog_infl=None,
        exog_infl_c=None,
        same_dispersion=False,
        nb_only=False,
        **kwargs,
    ):
        self.endog = endog
        if len(endog.shape) == 1:
            self.num_sample = len(endog)
            self.num_out = 1
            # normalize to 2-D: one column per output
            self.endog = endog.reshape((-1, 1))
        else:
            self.num_sample, self.num_out = np.shape(endog)

        # df_model: per-output parameters; df_model_c: shared parameters
        df_model = 0
        df_model_c = 0

        self.exog = exog
        # rank (not column count) so collinear designs are not over-counted
        df_model = np.linalg.matrix_rank(exog)
        if len(exog.shape) == 1:
            self.k_exog = 1
            self.exog = exog.reshape((-1, 1))
        else:
            self.k_exog = exog.shape[1]

        self.exog_c = exog_c
        if exog_c is None:
            self.k_exog_c = 0
            self._no_exog_c = True
            # zero-column placeholder keeps the Keras input list uniform
            self.exog_c = np.zeros((self.num_sample, self.k_exog_c), dtype=np.float64)
        else:
            self.k_exog_c = exog_c.shape[1]
            self._no_exog_c = False
        if self.k_exog_c > 0:
            df_model_c = df_model_c + np.linalg.matrix_rank(exog_c)

        self.nb_only = nb_only
        if exog_infl is None and exog_infl_c is None:
            self.nb_only = True

        self.exog_infl = exog_infl
        if exog_infl is None or self.nb_only:
            self.k_exog_infl = 0
            self._no_exog_infl = True
            self.exog_infl = np.ones(
                (self.num_sample, self.k_exog_infl), dtype=np.float64
            )
        else:
            self.k_exog_infl = exog_infl.shape[1]
            self._no_exog_infl = False
        if self.k_exog_infl > 0:
            df_model = df_model + np.linalg.matrix_rank(exog_infl)

        self.exog_infl_c = exog_infl_c
        if exog_infl_c is None or self.nb_only:
            self.k_exog_infl_c = 0
            self._no_exog_infl_c = True
            self.exog_infl_c = np.ones(
                (self.num_sample, self.k_exog_infl_c), dtype=np.float64
            )
        else:
            self.k_exog_infl_c = exog_infl_c.shape[1]
            self._no_exog_infl_c = False
        if self.k_exog_infl_c > 0:
            df_model_c = df_model_c + np.linalg.matrix_rank(exog_infl_c)

        # per-output df, then total df across all outputs (+ shared params once)
        df_model_each = df_model + df_model_c
        df_model = df_model * self.num_out + df_model_c

        self.same_dispersion = same_dispersion
        if same_dispersion:
            self.k_disperson = 1
            df_model = df_model + 1
        else:
            self.k_disperson = self.num_out
            df_model = df_model + self.num_out
        # each output carries one dispersion df in its own AIC
        df_model_each = df_model_each + 1
        self.df_model = df_model
        self.df_model_each = df_model_each

        self.loglike_method = "nb2"

    def fit(
        self,
        init_weights={},
        init_method="poi",
        device_type="CPU",
        device_name=None,
        return_history=False,
        epochs=5000,
        learning_rate=0.01,
        num_epoch_skip=3,
        is_early_stop=True,
        is_reduce_lr=True,
        min_delta_early_stop=0.05,
        patience_early_stop=50,
        factor_reduce_lr=0.8,
        patience_reduce_lr=10,
        min_lr=0.001,
        reset_keras_session=False,
        **kwargs,
    ):
        """Build the Keras ZINB/NB model, train it, and return a result dict.

        Parameters (selected)
        ---------------------
        init_weights : dict
            Optional warm-start weights keyed by layer name
            ("x_mu", "z_mu", "x_pi", "z_pi", "theta"). When empty,
            `init_method` ("poi" or "nb") generates them.
        device_type / device_name : str
            TF device selection; defaults to the first device of `device_type`.
        return_history : bool
            Also return the per-epoch loss and weight trajectories.

        Returns
        -------
        dict with llf/aic totals and per-output arrays, df counts, the fitted
        weights by name, wall-clock time, sample count and epochs run.

        NOTE(review): `init_weights={}` is a mutable default; it is only read
        here, never mutated, so it is harmless but worth cleaning up.
        """
        if device_name is None:
            devices = tf.config.list_logical_devices(device_type)
            device_name = devices[0].name

        num_sample = self.num_sample
        num_out = self.num_out
        num_feat = self.k_exog
        num_feat_infl = self.k_exog_infl
        num_feat_c = self.k_exog_c
        num_feat_infl_c = self.k_exog_infl_c
        num_dispersion = self.k_disperson

        # initiate weights
        if len(init_weights) == 0:
            if init_method == "poi":
                init_weights = self._poisson_init()
            elif init_method == "nb":
                init_weights = self._nb_init()

        # use distinct names to retrieve weights from layers
        weight_keys = ["x_mu", "z_mu", "x_pi", "z_pi", "theta"]

        if reset_keras_session:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=DeprecationWarning)
                reset_keras()
                K.clear_session()

        with tf.device(device_name):
            # graph mode is required for the K.function llf retrieval below
            tf.compat.v1.disable_eager_execution()

            inputs = Input(shape=(num_feat,))
            inputs_infl = Input(shape=(num_feat_infl,))
            inputs_c = Input(shape=(num_feat_c,))
            inputs_infl_c = Input(shape=(num_feat_infl_c,))
            # constant-1 input; the "theta" Dense layer turns it into free
            # per-output (or shared) dispersion parameters
            inputs_theta = Input(shape=(1,))

            # log-mean: per-output weights, plus optional shared-weight term
            if 'x_mu' in init_weights:
                x = Dense(num_out, use_bias=False, name='x_mu', weights=[init_weights['x_mu']])(inputs)
            else:
                x = Dense(num_out, use_bias=False, name='x_mu')(inputs)
            if num_feat_c > 0:
                if 'z_mu' in init_weights:
                    x_c = Dense(1, use_bias=False, name='z_mu', weights=[init_weights['z_mu']])(inputs_c)
                else:
                    x_c = Dense(1, use_bias=False, name='z_mu')(inputs_c)
                predictions = Add()([x, x_c])
            else:
                predictions = x

            # zero-inflation logits, analogous structure
            if self.nb_only:
                pi = None
            else:
                if 'x_pi' in init_weights:
                    x_infl = Dense(num_out, use_bias=False, name='x_pi', weights=[init_weights['x_pi']])(inputs_infl)
                else:
                    x_infl = Dense(num_out, use_bias=False, name='x_pi')(inputs_infl)

                if num_feat_infl_c > 0:
                    if 'z_pi' in init_weights:
                        x_infl_c = Dense(1, use_bias=False, name='z_pi', weights=[init_weights['z_pi']])(inputs_infl_c)
                    else:
                        x_infl_c = Dense(1, use_bias=False, name='z_pi')(inputs_infl_c)
                    pi = Add()([x_infl, x_infl_c])
                else:
                    pi = x_infl

            if 'theta' in init_weights:
                theta0 = Dense(num_dispersion, use_bias=False, name='theta', weights=[init_weights['theta']])(inputs_theta)
            else:
                theta0 = Dense(num_dispersion, use_bias=False, name='theta')(inputs_theta)
            if self.same_dispersion:
                # broadcast the single log-dispersion to every output column
                theta = Reshape((num_out,))(RepeatVector(num_out)(theta0))
            else:
                theta = theta0

            # positional args: pi, then log_theta
            zinb = ZINBLogLik(pi, theta, nb_only=self.nb_only)

            # Lambda keeps pi/theta connected to the graph while the model's
            # nominal output stays the log-mean predictions
            if self.nb_only:
                output = Lambda(lambda x: x[0])([predictions, theta])
            else:
                output = Lambda(lambda x: x[0])([predictions, pi, theta])

            model = Model(
                inputs=[inputs, inputs_c, inputs_infl, inputs_infl_c, inputs_theta],
                outputs=[output],
            )

            opt = RMSprop(learning_rate=learning_rate)
            model.compile(loss=zinb.loss, optimizer=opt)

            callbacks = []
            if is_early_stop:
                early_stop = EarlyStopping(
                    monitor="loss",
                    # loss is a mean, so scale the llf tolerance by sample count
                    min_delta=min_delta_early_stop / num_sample,
                    patience=patience_early_stop,
                    mode="min",
                )
                callbacks.append(early_stop)

            if is_reduce_lr:
                reduce_lr = ReduceLROnPlateauSkip(
                    monitor="loss",
                    factor=factor_reduce_lr,
                    patience=patience_reduce_lr,
                    min_lr=min_lr,
                    num_epoch_skip=num_epoch_skip,
                )
                callbacks.append(reduce_lr)

            if return_history:
                # this get all weights over epoch
                get_weights = PredictionCallback()
                callbacks.append(get_weights)

            # TODO: FIX this. code randomly crashes on apple silicon M1/M2 with
            # error `Incompatible shapes`. code usually runs fine after second try.
            # similar to this issue https://developer.apple.com/forums/thread/701985
            # NOTE(review): if all 10 attempts fail, `losses`/`cpu_time` are
            # unbound and the code below raises NameError.
            for i in range(10):
                try:
                    start_time = time.time()
                    losses = model.fit(
                        [
                            self.exog,
                            self.exog_c,
                            self.exog_infl,
                            self.exog_infl_c,
                            np.ones((num_sample, 1)),
                        ],
                        [self.endog],
                        callbacks=callbacks,
                        # full-batch gradient steps
                        batch_size=num_sample,
                        epochs=epochs,
                        verbose=0,
                    )
                    cpu_time = time.time() - start_time
                    break
                except Exception as e:
                    print(model.summary())
                    print(e)
                    continue

            # retrieve LL
            get_llfs = K.function(
                [inputs, inputs_c, inputs_infl, inputs_infl_c, inputs_theta, zinb.y],
                [zinb.llf],
            )
            llft = get_llfs(
                [
                    self.exog,
                    self.exog_c,
                    self.exog_infl,
                    self.exog_infl_c,
                    np.ones((num_sample, 1)),
                    self.endog,
                ]
            )[0]

            # undo the mean and add back the lgamma(y+1) constant dropped in
            # the TF loss, giving true per-output log-likelihoods
            llfs = -(llft * num_sample + np.sum(gammaln(self.endog + 1), axis=0))
            aics = -2 * (llfs - self.df_model_each)

            llf = np.sum(llfs)
            aic = -2 * (llf - self.df_model)

            # get weights
            weights = model.get_weights()

            names_t = [
                (weight.name).split("/")[0]
                for layer in model.layers
                for weight in layer.weights
            ]
            # tf layer weight has subscript in names
            weight_names = []
            for name in names_t:
                matched = name
                for nt in weight_keys:
                    if nt in name:
                        matched = nt
                        break
                weight_names.append(matched)

            weights_dict = dict(zip(weight_names, weights))

            res = {
                "llf_total": llf,
                "llfs": llfs,
                "aic_total": aic,
                "aics": aics,
                "df_model_total": self.df_model,
                "df": self.df_model_each,
                "weights": weights_dict,
                "cpu_time": cpu_time,
                "num_sample": num_sample,
                "epochs": len(losses.history["loss"]),
            }

            if return_history:
                res["loss_history"] = losses.history["loss"]
                res["weights_history"] = np.array(get_weights.weights)

            return res

    # https://github.com/statsmodels/statsmodels/blob/main/statsmodels/discrete/discrete_model.py#L3691
    def _estimate_dispersion(self, mu, resid, df_resid=None, loglike_method="nb2"):
        """Moment estimate of the NB dispersion alpha from Poisson residuals
        (statsmodels' NegativeBinomial approach)."""
        if df_resid is None:
            df_resid = resid.shape[0]
        if loglike_method == "nb2":
            a = ((resid**2 / mu - 1) / mu).sum() / df_resid
        else:  # self.loglike_method == 'nb1':
            a = (resid**2 / mu - 1).sum() / df_resid
        return a

    def _compute_pi_init(self, nz_prob, p_nonzero, infl_prob_max=0.99):
        """Initial inflation logit from the observed vs NB-implied nonzero
        probability; the inferred inflation weight is capped at
        `infl_prob_max` to keep the logit finite."""
        ww = 1 - min(nz_prob / p_nonzero, infl_prob_max)
        return -np.log(1 / ww - 1)

    def _poisson_init_each(
        self,
        Y,
        estimate_infl=True,
        eps=1e-10,
        maxiter=100,
        theta_lb=0.05,
        intercept_var_th=1e-3,
        infl_prob_max=0.99,
    ):
        """Initial weights for one output column via a Poisson fit.

        Falls back to an intercept-only estimate when the Poisson fit fails;
        returns {} when no usable initialization can be built.
        """
        find_poi_sol = True
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            try:
                poi_mod = sm.Poisson(Y, self.exog).fit(
                    maxiter=maxiter, disp=False, warn_convergence=False
                )
                if np.isnan(poi_mod.params).any():
                    find_poi_sol = False
                else:
                    mu = poi_mod.predict()
                    a = self._estimate_dispersion(
                        mu, poi_mod.resid, df_resid=poi_mod.df_resid
                    )
                    # cap alpha below => cap theta above 1/theta_lb
                    theta = 1 / max(a, theta_lb)
                    x_mu = np.reshape(poi_mod.params, (self.k_exog, 1))
            except:
                # NOTE(review): bare except is deliberate best-effort here;
                # any fit failure falls through to the intercept-only path
                find_poi_sol = False

        if not find_poi_sol:
            vs = np.std(self.exog, axis=0)
            # find intercept index
            min_idx = np.argmin(vs)
            if vs[min_idx] < intercept_var_th:
                x_mu = np.zeros((self.k_exog, 1))
                mu = np.mean(Y)
                x_mu[min_idx] = np.log(mu) / np.mean(self.exog[:, min_idx])
                resid = Y - mu
                a = self._estimate_dispersion(mu, resid)
                if np.isnan(a) or np.isinf(a):
                    a = theta_lb
                theta = 1 / max(a, theta_lb)
            else:
                # no intercept column to anchor a fallback estimate
                x_mu = None
                return {}

        weights = {"x_mu": x_mu, "theta": np.array([np.log(theta)]).reshape((-1, 1))}

        if not self._no_exog_infl and estimate_infl:
            pred = np.maximum(mu, 10 * eps)
            # NB-implied probability of a nonzero count
            p_nonzero = 1 - np.mean(np.power(theta / (theta + pred + eps), theta))

            # find intercept index
            vs = np.std(self.exog_infl, axis=0)
            min_idx = np.argmin(vs)
            if vs[min_idx] < intercept_var_th:
                nz_prob = np.mean(Y > 0)
                x_pi = np.zeros((self.k_exog_infl, 1))
                fv = np.mean(self.exog_infl[:, min_idx])
                w_pi = self._compute_pi_init(
                    nz_prob, p_nonzero, infl_prob_max=infl_prob_max
                )
                x_pi[min_idx] = w_pi / fv
                weights["x_pi"] = x_pi

        return weights

    def _poisson_init(
        self,
        eps=1e-10,
        maxiter=100,
        theta_lb=0.05,
        intercept_var_th=1e-3,
        infl_prob_max=0.99,
    ):
        """Assemble per-output Poisson initializations into full weight
        matrices; returns {} if any output lacks an x_mu estimate."""
        x_mu = []
        x_pi = []
        theta = []
        return_x_pi = True
        return_theta = True
        for i in range(self.num_out):
            w = self._poisson_init_each(
                self.endog[:, i],
                eps=eps,
                maxiter=maxiter,
                theta_lb=theta_lb,
                intercept_var_th=intercept_var_th,
                infl_prob_max=infl_prob_max,
            )

            if len(w) == 0:
                return {}

            if "x_mu" in w:
                x_mu.append(w["x_mu"])
            else:
                return {}

            if "x_pi" in w:
                x_pi.append(w["x_pi"])
            else:
                # one missing column invalidates the whole x_pi init
                return_x_pi = False

            if "theta" in w:
                theta.append(w["theta"])
            else:
                return_theta = False
        weights = {"x_mu": np.concatenate(x_mu, axis=1)}
        if return_x_pi:
            weights["x_pi"] = np.concatenate(x_pi, axis=1)
        if return_theta:
            t = np.concatenate(theta, axis=1)
            if self.same_dispersion:
                weights["theta"] = np.array(np.mean(t)).reshape((-1, 1))
            else:
                weights["theta"] = t
        return weights

    def _nb_init(self, infl_prob_max=0.99, intercept_var_th=1e-3):
        """Initial weights from a plain NB fit, then derive inflation logits
        from the fitted means and dispersions."""
        nb_mod = TensorZINB(
            self.endog,
            self.exog,
            exog_c=self.exog_c,
            same_dispersion=self.same_dispersion,
            nb_only=True,
        )
        nb_res = nb_mod.fit(init_method="poi")
        weights = nb_res["weights"]

        if self._no_exog_infl:
            return weights
        # find intercept index
        vs = np.std(self.exog_infl, axis=0)
        min_idx = np.argmin(vs)
        # do not compute logit weight if there is no intercept
        if vs[min_idx] < intercept_var_th:
            x_pi = np.zeros((self.k_exog_infl, self.num_out))
            fv = np.mean(self.exog_infl[:, min_idx])
            if self.same_dispersion:
                theta = np.exp(
                    np.array(list(weights["theta"].flatten()) * self.num_out)
                )
            else:
                theta = np.exp(weights["theta"].flatten())

            mu_c = 0
            if self.k_exog_c > 0 and "z_mu" in weights:
                # NOTE(review): this is (num_sample, 1) while the per-output
                # dot below is (num_sample,); the addition broadcasts to
                # (num_sample, num_sample) — looks unintended, confirm shapes.
                mu_c = np.dot(self.exog_c, weights["z_mu"])

            for i in range(self.num_out):
                mu = np.dot(self.exog, weights["x_mu"][:, i]) + mu_c
                mu = np.exp(mu)
                p_nonzero = 1 - np.mean(np.power(theta[i] / (theta[i] + mu), theta[i]))
                nz_prob = np.mean(self.endog[:, i] > 0)
                w_pi = self._compute_pi_init(
                    nz_prob, p_nonzero, infl_prob_max=infl_prob_max
                )
                x_pi[min_idx, i] = w_pi / fv

            weights["x_pi"] = x_pi

        if self.k_exog_infl_c > 0:
            weights["z_pi"] = np.zeros((self.k_exog_infl_c, 1))

        return weights
|
tensorzinb/utils.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from sklearn.preprocessing import StandardScaler
|
|
3
|
+
from numpy.linalg import matrix_rank
|
|
4
|
+
|
|
5
|
+
def LI_vecs(dim, M):
    """Return row indices of a maximal linearly independent subset of the
    first `dim` rows of M.

    Row 0 is always taken as the starting basis; each further row is kept
    only if it raises the rank of the rows kept so far. matrix_rank accepts
    non-square inputs, so the candidate stack can be tested directly.
    """
    basis = [M[0]]
    kept = [0]
    for i in range(dim):
        candidate = basis + [M[i]]
        # rank increase <=> M[i] is independent of every kept row
        if matrix_rank(candidate) > len(basis):
            basis.append(M[i])
            kept.append(i)
    return kept
|
|
17
|
+
|
|
18
|
+
def find_independent_columns(exog):
    """Return the labels of a maximal linearly independent set of columns of
    the DataFrame `exog`.

    When the design is already full column rank, all labels are returned;
    otherwise `LI_vecs` selects an independent subset (transposing so that
    columns become rows).
    """
    n_cols = np.shape(exog)[1]
    if np.linalg.matrix_rank(exog) == n_cols:
        return np.array(exog.columns)
    keep = LI_vecs(n_cols, exog.T.values)
    return np.array(exog.columns[keep])
|
|
23
|
+
|
|
24
|
+
def normalize_features(df_feature, features_to_norm):
    """Z-score (zero mean, unit variance) each listed column of `df_feature`
    in place and return the (mutated) DataFrame."""
    scaler = StandardScaler()
    for col in features_to_norm:
        column = df_feature[col].values.reshape(-1, 1)
        df_feature[col] = scaler.fit_transform(column).flatten()
    return df_feature
|
|
29
|
+
|
|
30
|
+
# https://stackoverflow.com/a/21739593
def correct_pvalues_for_multiple_testing(pvalues, correction_type = "Benjamini-Hochberg"):
    """
    Adjust p-values for multiple testing ("Bonferroni", "Bonferroni-Holm",
    or the default "Benjamini-Hochberg" FDR procedure).

    consistent with R - print correct_pvalues_for_multiple_testing([0.0, 0.01, 0.029, 0.03, 0.031, 0.05, 0.069, 0.07, 0.071, 0.09, 0.1])
    """
    pvalues = np.array(pvalues)
    n = pvalues.shape[0]
    adjusted = np.zeros(n)

    if correction_type == "Bonferroni":
        adjusted = n * pvalues
    elif correction_type == "Bonferroni-Holm":
        # step-down: smallest p gets factor n, next n-1, ...
        ordered = sorted((p, i) for i, p in enumerate(pvalues))
        for rank, (p, i) in enumerate(ordered):
            adjusted[i] = (n - rank) * p
    elif correction_type == "Benjamini-Hochberg":
        # descending order; scale by n/rank, then enforce monotonicity by a
        # running minimum from the largest p-value downward
        ordered = sorted(((p, i) for i, p in enumerate(pvalues)), reverse=True)
        scaled = [(n / (n - j)) * p for j, (p, _) in enumerate(ordered)]
        for j in range(0, int(n) - 1):
            if scaled[j] < scaled[j + 1]:
                scaled[j + 1] = scaled[j]
        for j, (_, i) in enumerate(ordered):
            adjusted[i] = scaled[j]
    return adjusted
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tensorzinb-plusplus
|
|
3
|
+
Version: 0.0.2
|
|
4
|
+
Summary: Zero Inflated Negative Binomial regression for scRNA-seq (maintained fork of tensorzinb by Tao Cui & Tingting Wang)
|
|
5
|
+
Project-URL: Homepage, https://github.com/saarantras/tensorzinb-plusplus
|
|
6
|
+
Project-URL: Original Repository, https://github.com/wanglab-georgetown/tensorzinb
|
|
7
|
+
Author: Tingting Wang
|
|
8
|
+
Author-email: Tao Cui <taocui.caltech@gmail.com>
|
|
9
|
+
Maintainer-email: Mackenzie Noon <me@mackenzienoon.com>
|
|
10
|
+
License: Apache-2.0
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: differential expression,scRNA-seq,tensorflow,zero-inflated negative binomial
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
23
|
+
Requires-Python: <3.13,>=3.9
|
|
24
|
+
Requires-Dist: numpy>=1.23.5
|
|
25
|
+
Requires-Dist: pandas>=1.5.2
|
|
26
|
+
Requires-Dist: patsy>=0.5.3
|
|
27
|
+
Requires-Dist: scikit-learn>=1.2.0
|
|
28
|
+
Requires-Dist: scipy>=1.9.3
|
|
29
|
+
Requires-Dist: statsmodels>=0.13.5
|
|
30
|
+
Requires-Dist: tensorflow>=2.16
|
|
31
|
+
Requires-Dist: tf-keras>=2.16
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
## Zero-inflated Negative Binomial Model using TensorFlow
|
|
35
|
+
|
|
36
|
+
TensorZINB is a Python module that uses TensorFlow to effectively solve negative binomial (NB) and zero-inflated negative binomial (ZINB) models. One of its key strengths is its ability to accurately calculate the NB/ZINB log likelihood. Additionally, it can be used for differentially expressed gene (DEG) analysis in the context of single-cell RNA sequencing (scRNA-seq). This package distinguishes itself by ensuring numerical stability, enabling the processing of datasets in batches, and delivering superior computing speeds compared to other existing NB/ZINB solvers. To guarantee the reliability of its analysis results, TensorZINB has undergone rigorous testing against various statistical packages. TensorZINB supports the execution of various features on both the negative binomial and zero-inflated (logit) components. Furthermore, it allows for the use of common features with the same weights across multiple subjects within a batch.
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
The negative binomial distribution is
|
|
40
|
+
$$NB(y;\mu,\theta)=\frac{\Gamma(y+\theta)}{\Gamma(\theta)\Gamma(y+1)}\left( \frac{\theta}{\theta+\mu}\right)^\theta\left(\frac{\mu}{\theta+\mu}\right)^y$$
|
|
41
|
+
where $\mu$ is the mean and $\theta$ is the dispersion parameter. For zero-inflated models, the counts are modelled as a mixture of the Bernoulli distribution and count distribution, i.e.,
|
|
42
|
+
|
|
43
|
+
$$
|
|
44
|
+
Pr(Y=0)=\pi+(1-\pi)NB(0),\\
|
|
45
|
+
Pr(Y=y)=(1-\pi)NB(y),y>0.
|
|
46
|
+
$$
|
|
47
|
+
|
|
48
|
+
We use the following model parameterization
|
|
49
|
+
|
|
50
|
+
$$
|
|
51
|
+
\log \mu_g =X_{\mu}\beta_{g,\mu}+Z_{\mu}\alpha_{\mu},
|
|
52
|
+
\mathrm{logit}\,\pi_g =X_{\pi}\beta_{g,\pi}+Z_{\pi}\alpha_{\pi}, \log \theta_g = \beta_{g,\theta},
|
|
53
|
+
$$
|
|
54
|
+
|
|
55
|
+
where $\mu_g$ is the mean of subject $g$, $X_{\mu}$, $Z_{\mu}$, $X_{\pi}$ and $Z_{\pi}$ are feature matrices, $\beta_{g,\mu}$ and $\beta_{g,\pi}$ are coefficients for each subject $g$, $\alpha_{\mu}$ and $\alpha_{\pi}$ are common coefficients shared across all subjects.
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
## Installation
|
|
59
|
+
|
|
60
|
+
After downloading this repo, `cd` to the directory of downloaded repo and run:
|
|
61
|
+
|
|
62
|
+
`python setup.py install`
|
|
63
|
+
|
|
64
|
+
or
|
|
65
|
+
|
|
66
|
+
`pip install .`
|
|
67
|
+
|
|
68
|
+
For Apple silicon (M1, M2 and etc), it is recommended to install TensorFlow by following the command in Troubleshooting Section below.
|
|
69
|
+
|
|
70
|
+
## Model Estimation
|
|
71
|
+
|
|
72
|
+
`TensorZINB` solves the negative binomial (NB) and zero-inflated negative binomial (ZINB) models with given read counts.
|
|
73
|
+
|
|
74
|
+
### Model initialization
|
|
75
|
+
|
|
76
|
+
``` r
|
|
77
|
+
TensorZINB(
|
|
78
|
+
endog, # counts data: number of samples x number of subjects
|
|
79
|
+
exog, # observed variables for the negative binomial part
|
|
80
|
+
exog_c=None, # common observed variables across all subjects for the nb part
|
|
81
|
+
exog_infl=None, # observed variables for the logit part
|
|
82
|
+
exog_infl_c=None, # common observed variables across all subjects for the logit part
|
|
83
|
+
same_dispersion=False, # whether all subjects use the same dispersion
|
|
84
|
+
nb_only=False, # whether negative binomial only without logit or zero-inflation part
|
|
85
|
+
)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### Model fit
|
|
89
|
+
|
|
90
|
+
``` r
|
|
91
|
+
TensorZINB.fit(
|
|
92
|
+
init_weights={}, # initial model weights. If empty, init_method is used to find init weights
|
|
93
|
+
init_method="poi", # initialization method: `poi` for Poisson and `nb` for negative binomial
|
|
94
|
+
device_type="CPU", # device_type: `CPU` or `GPU`
|
|
95
|
+
device_name=None, # None or one from `tf.config.list_logical_devices()`
|
|
96
|
+
return_history=False, # whether return loss and weights history during training
|
|
97
|
+
epochs=5000, # maximum number of epochs to run
|
|
98
|
+
learning_rate=0.008, # start learning rate
|
|
99
|
+
num_epoch_skip=3, # number of epochs to skip learning rate reduction
|
|
100
|
+
is_early_stop=True, # whether use early stop
|
|
101
|
+
min_delta_early_stop=0.05,# minimum change in loss to qualify as an improvement
|
|
102
|
+
patience_early_stop=50, # number of epochs with no improvement after which training will be stopped
|
|
103
|
+
factor_reduce_lr=0.8, # factor by which the learning rate will be reduced
|
|
104
|
+
patience_reduce_lr=10, # number of epochs with no improvement after which learning rate will be reduced
|
|
105
|
+
min_lr=0.001, # lower bound on the learning rate
|
|
106
|
+
reset_keras_session=False,# reset keras session at the beginning
|
|
107
|
+
)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Model results
|
|
111
|
+
|
|
112
|
+
``` r
|
|
113
|
+
{
|
|
114
|
+
"llf_total": # sum of log likelihood across all subjects
|
|
115
|
+
"llfs": # an array contains log likelihood for each subject
|
|
116
|
+
"aic_total": # sum of AIC across all subjects
|
|
117
|
+
"aics": # an array contains AIC for each subject
|
|
118
|
+
"df_model_total": # total degree of freedom of all subjects
|
|
119
|
+
"df_model": # degree of freedom for each subject
|
|
120
|
+
"weights": # model weights
|
|
121
|
+
"cpu_time": # total computing time for all subjects
|
|
122
|
+
"num_sample": # number of samples
|
|
123
|
+
"epochs": # number of epochs run
|
|
124
|
+
"loss_history": # loss history over epochs if return_history=True
|
|
125
|
+
"weights_history": # weights history over epochs if return_history=True
|
|
126
|
+
}
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## DEG Analysis
|
|
130
|
+
|
|
131
|
+
`LRTest` provides utility for scRNA-seq DEG analysis. It runs the likelihood ratio test (LRT) by computing the log likelihood difference with and without conditions being added to the model.
|
|
132
|
+
|
|
133
|
+
To construct a `LRTest` object, we use
|
|
134
|
+
``` r
|
|
135
|
+
LRTest(
|
|
136
|
+
df_data, # count data frame. columns: subjects (genes), rows: samples
|
|
137
|
+
df_feature, # feature data frame. columns: features, rows: samples
|
|
138
|
+
conditions, # list of features to test DEG, e.g., diagnosis
|
|
139
|
+
nb_features, # list of features for the negative binomial model
|
|
140
|
+
nb_features_c=None, # list of common features for the negative binomial model
|
|
141
|
+
infl_features=None, # list of features for the zero inflated (logit) model
|
|
142
|
+
infl_features_c=None, # list of common features for the zero inflated (logit) model
|
|
143
|
+
add_intercept=True, # whether add intercept. False if df_feature already contains intercept
|
|
144
|
+
nb_only=False, # whether only do negative binomial without zero inflation
|
|
145
|
+
same_dispersion=False, # whether all subjects use the same dispersion
|
|
146
|
+
)
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
We then call `LRTest.run` to run the likelihood ratio test
|
|
150
|
+
``` r
|
|
151
|
+
LRTest.run(
|
|
152
|
+
learning_rate=0.008, # learning rate
|
|
153
|
+
epochs=5000, # number of epochs run
|
|
154
|
+
)
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
The `LRTest.run` returns a result dataframe `dfr` with columns:
|
|
158
|
+
``` r
|
|
159
|
+
[
|
|
160
|
+
"ll0": # log likelihood without conditions
|
|
161
|
+
"aic0": # AIC without conditions
|
|
162
|
+
"df0": # degree of freedom without conditions
|
|
163
|
+
"cpu_time0": # computing time for each subject without conditions
|
|
164
|
+
"ll1": # log likelihood without conditions
|
|
165
|
+
"aic1": # AIC with conditions
|
|
166
|
+
"df1": # degree of freedom with conditions
|
|
167
|
+
"cpu_time1": # computing time for each subject with conditions
|
|
168
|
+
"lld": # ll1 - ll0
|
|
169
|
+
"aicd": # aic1 - aic0
|
|
170
|
+
"pvalue": # p-value: 1 - stats.chi2.cdf(2 * lld, df1 - df0)
|
|
171
|
+
]
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
`tensorzinb.utils` provides utility functions:
|
|
176
|
+
|
|
177
|
+
- `normalize_features`: normalize scRNA-seq features by removing the mean and scaling to unit variance.
|
|
178
|
+
- `correct_pvalues_for_multiple_testing`: correct pvalues for multiple testing in Python, which is the same as `p.adjust` in `R`.
|
|
179
|
+
|
|
180
|
+
We can further correct pvalues for multiple testing by calling `correct_pvalues_for_multiple_testing(dfr['pvalue'])`.
|
|
181
|
+
|
|
182
|
+
## Example
|
|
183
|
+
|
|
184
|
+
An example code to show how to use `TensorZINB` and `LRTest` to perform DEG analysis can be found at [`examples/deg_example.ipynb`](examples/deg_example.ipynb). The example runs DEG analysis on a sample dataset with 17 clusters and 20 genes in each cluster.
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
## Tests
|
|
188
|
+
|
|
189
|
+
In `tests/tensorzinb.ipynb`, we show several tests:
|
|
190
|
+
|
|
191
|
+
- validate the Poisson weights initialization.
|
|
192
|
+
- compare with `statsmodels` for negative binomial model only without zero-inflation to make sure the results match.
|
|
193
|
+
- show `statsmodels` is not numerically stable for zero-inflated negative binomial. `statsmodels` can only return results when initialized with TensorZINB results. TensorZINB results match the true parameters used to generate the samples.
|
|
194
|
+
|
|
195
|
+
More tests can be found in https://github.com/wanglab-georgetown/countmodels/blob/main/tests/zinb_test.ipynb
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
## Troubleshooting
|
|
199
|
+
|
|
200
|
+
### Run on Apple silicon
|
|
201
|
+
To run tensorflow on Apple silicon (M1, M2, etc), install TensorFlow using the following:
|
|
202
|
+
|
|
203
|
+
`conda install -c apple tensorflow-deps`
|
|
204
|
+
|
|
205
|
+
`python -m pip install tensorflow-macos==2.9.2`
|
|
206
|
+
|
|
207
|
+
`python -m pip install tensorflow-metal==0.5.1`
|
|
208
|
+
|
|
209
|
+
### Feature normalization
|
|
210
|
+
|
|
211
|
+
If the solver cannot return correct results, please ensure features in $X$ are normalized by using `StandardScaler()`. Please refer to the example in [`examples/deg_example.ipynb`](examples/deg_example.ipynb).
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
## Reference
|
|
215
|
+
Cui, T., Wang, T. [A Comprehensive Assessment of Hurdle and Zero-inflated Models for Single Cell RNA-sequencing Analysis](https://doi.org/10.1093/bib/bbad272), Briefings in Bioinformatics, July 2023. https://doi.org/10.1093/bib/bbad272
|
|
216
|
+
|
|
217
|
+
## Support and Contribution
|
|
218
|
+
If you encounter any bugs while using the code, please don't hesitate to create an issue on the GitHub repository.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
tensorzinb/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
tensorzinb/lrtest.py,sha256=eAwPdr5cKdJzVED5IGdHe9ZTjewhzqu_56Q6HSqGrt4,5680
|
|
3
|
+
tensorzinb/tensorzinb.py,sha256=mR995dyAegF51pIcDf_WtLsxAVTUn6A6ycMTTT9q6cQ,20535
|
|
4
|
+
tensorzinb/utils.py,sha256=whRWe4TpGqsQ3R6p8eHjvquEBDm0IKX00WMisduN1sg,3429
|
|
5
|
+
tensorzinb_plusplus-0.0.2.dist-info/METADATA,sha256=AMuET10jEXK_oELgMvRLFRrHN0tv3joAKQN2NgArprI,10850
|
|
6
|
+
tensorzinb_plusplus-0.0.2.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
7
|
+
tensorzinb_plusplus-0.0.2.dist-info/licenses/LICENSE,sha256=vdzedejlUZ0EzkJtQR3a_QIOkx5EHPrn04D_T5k-uDk,566
|
|
8
|
+
tensorzinb_plusplus-0.0.2.dist-info/RECORD,,
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Copyright 2020-2023 Tao Cui, Tingting Wang
|
|
2
|
+
|
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
you may not use this file except in compliance with the License.
|
|
5
|
+
You may obtain a copy of the License at
|
|
6
|
+
|
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
|
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
See the License for the specific language governing permissions and
|
|
13
|
+
limitations under the License.
|