tensorzinb-plusplus 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tensorzinb/__init__.py ADDED
File without changes
tensorzinb/lrtest.py ADDED
@@ -0,0 +1,176 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from patsy import dmatrices
4
+ from scipy import stats
5
+ from .tensorzinb import TensorZINB
6
+
7
+
8
class LRTest:
    """Likelihood-ratio test (LRT) for differential-expression analysis.

    For each subject (column of ``df_data``) two (ZI)NB models are fit:
    a null model without ``conditions`` and an alternative model with
    ``conditions`` added, and a chi-squared LRT p-value is computed from
    the per-subject log-likelihood difference.
    """

    def __init__(
        self,
        df_data,
        df_feature,
        conditions,
        nb_features,
        nb_features_c=None,
        infl_features=None,
        infl_features_c=None,
        add_intercept=True,
        nb_only=False,
        same_dispersion=False,
    ):
        """
        Parameters
        ----------
        df_data : pandas.DataFrame
            Count data; columns are subjects (genes), rows are samples.
        df_feature : pandas.DataFrame
            Feature data; columns are features, rows are samples.
        conditions : list
            Features under test (added only to the alternative model).
        nb_features : list
            Features for the negative binomial component.
        nb_features_c : list, optional
            Common NB features shared across subjects.
        infl_features : list, optional
            Features for the zero-inflation (logit) component.
        infl_features_c : list, optional
            Common inflation features shared across subjects.
        add_intercept : bool
            Whether to add an intercept (False if df_feature has one).
        nb_only : bool
            Fit plain NB without the inflation component.
        same_dispersion : bool
            Use a single dispersion parameter for all subjects.
        """
        self.df_data = df_data
        self.df_feature = df_feature
        self.conditions = list(conditions)
        # exclude conditions from the base features; they are added back
        # only in the alternative model (note: set() drops feature order)
        self.nb_features = list(set(nb_features) - set(conditions))
        self.nb_features_c = nb_features_c
        self._no_nb_c = nb_features_c is None or len(nb_features_c) == 0

        if infl_features is not None and len(infl_features) > 0:
            self.infl_features = list(set(infl_features) - set(conditions))
            self._no_infl = False
        else:
            self.infl_features = None
            self._no_infl = True

        self.infl_features_c = infl_features_c
        self._no_infl_c = infl_features_c is None or len(infl_features_c) == 0

        self.nb_only = nb_only
        # without any inflation features the model degenerates to plain NB
        if self._no_infl_c and self._no_infl:
            self.nb_only = True

        self.add_intercept = add_intercept
        self.same_dispersion = same_dispersion

        self._gen_feature_dfs()

        self.res0 = None
        self.res1 = None
        self.df_result = None

    def _get_feature_idx_map(self, Xs):
        """Map shared feature columns between the two design matrices.

        Returns ``[idx_in_Xs[1], idx_in_Xs[0]]`` for features present in
        both, so null-model weights can seed the alternative model.
        """
        df_maps = []
        for X in Xs:
            df_f = pd.DataFrame(X.columns, columns=["feature"])
            df_f["idx"] = range(len(df_f))
            df_maps.append(df_f)
        dft = pd.merge(df_maps[1], df_maps[0], on="feature")
        idx_map = [dft.idx_x.values, dft.idx_y.values]
        return idx_map

    def _gen_feature_df(self, df_feature, features, add_intercept=True):
        """Build a patsy design-matrix DataFrame for ``features``."""
        # guard against an empty feature list, which would otherwise
        # produce the invalid formula "1 ~ "
        rhs = " + ".join(features) if features else "1"
        if add_intercept:
            formula = "1 ~ {}".format(rhs)
        else:
            formula = "1 ~ {} - 1".format(rhs)

        _, predictors = dmatrices(formula, df_feature, return_type="dataframe")
        return predictors

    def _gen_feature_dfs(self):
        """Precompute design matrices for null and alternative models."""
        fs = [self.nb_features, self.nb_features + self.conditions]
        Xs = []
        dfs = []
        for feature in fs:
            df = self._gen_feature_df(
                self.df_feature, feature, add_intercept=self.add_intercept
            )
            dfs.append(df)
            Xs.append(df.values)
        self.Xs = Xs
        self.X_idx_map = self._get_feature_idx_map(dfs)

        if self._no_nb_c:
            self.X_c = None
        else:
            self.X_c = self._gen_feature_df(
                self.df_feature, self.nb_features_c, add_intercept=False
            ).values

        if not self.nb_only and not self._no_infl:
            fs = [self.infl_features, self.infl_features + self.conditions]
            X_infls = []
            dfs = []
            for feature in fs:
                df = self._gen_feature_df(
                    self.df_feature, feature, add_intercept=self.add_intercept
                )
                dfs.append(df)
                X_infls.append(df.values)
            self.X_infls = X_infls
            self.X_infl_idx_map = self._get_feature_idx_map(dfs)
        else:
            self.X_infls = [None, None]
            self.X_infl_idx_map = None

        if self._no_infl_c:
            self.X_infl_c = None
        else:
            self.X_infl_c = self._gen_feature_df(
                self.df_feature, self.infl_features_c, add_intercept=False
            ).values

    def run(self, learning_rate=0.008, epochs=5000):
        """Fit both models and return the per-subject LRT results.

        Returns a DataFrame with columns: test, llf0/llf1 (log likelihoods),
        aic0/aic1, df0/df1, cpu_time0/cpu_time1, llfd, aicd, pvalue.
        """
        zinb0 = TensorZINB(
            self.df_data.values,
            self.Xs[0],
            exog_c=self.X_c,
            exog_infl=self.X_infls[0],
            exog_infl_c=self.X_infl_c,
            same_dispersion=self.same_dispersion,
            nb_only=self.nb_only,
        )

        res0 = zinb0.fit(learning_rate=learning_rate, epochs=epochs)
        self.res0 = res0

        zinb1 = TensorZINB(
            self.df_data.values,
            self.Xs[1],
            exog_c=self.X_c,
            exog_infl=self.X_infls[1],
            exog_infl_c=self.X_infl_c,
            same_dispersion=self.same_dispersion,
            nb_only=self.nb_only,
        )

        # Warm-start the alternative model from the null fit. Copy the dict
        # first: the original code mutated res0["weights"] in place, which
        # silently corrupted the weights stored in self.res0.
        weights = dict(res0["weights"])
        if "x_mu" in weights:
            x_mu = np.zeros((zinb1.k_exog, zinb1.num_out))
            x_mu[self.X_idx_map[0], :] = weights["x_mu"][self.X_idx_map[1], :]
            weights["x_mu"] = x_mu

        if "x_pi" in weights:
            x_pi = np.zeros((zinb1.k_exog_infl, zinb1.num_out))
            x_pi[self.X_infl_idx_map[0], :] = weights["x_pi"][self.X_infl_idx_map[1], :]
            weights["x_pi"] = x_pi

        res1 = zinb1.fit(init_weights=weights, learning_rate=learning_rate, epochs=epochs)
        self.res1 = res1

        dfr = pd.DataFrame(self.df_data.columns, columns=["test"])
        dfr["llf0"] = res0["llfs"]
        dfr["aic0"] = res0["aics"]
        dfr["df0"] = res0["df"]
        dfr["cpu_time0"] = res0["cpu_time"] / len(dfr)
        dfr["llf1"] = res1["llfs"]
        dfr["aic1"] = res1["aics"]
        dfr["df1"] = res1["df"]
        dfr["cpu_time1"] = res1["cpu_time"] / len(dfr)

        dfr["llfd"] = dfr["llf1"] - dfr["llf0"]
        dfr["aicd"] = dfr["aic1"] - dfr["aic0"]
        dfd = dfr["df1"] - dfr["df0"]
        # chi2.sf is numerically more accurate than 1 - cdf for small p
        dfr["pvalue"] = stats.chi2.sf(2 * dfr["llfd"], dfd)

        self.df_result = dfr

        return dfr
@@ -0,0 +1,611 @@
1
import warnings
import time

import contextlib
import os
import numpy as np

# Silence TensorFlow's C++ logging (3 = errors only). These environment
# variables must be set BEFORE `import tensorflow` to take effect.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
# Force the legacy Keras 2 ("tf_keras") implementation under TF >= 2.16.
os.environ["TF_USE_LEGACY_KERAS"] = "1"

import tensorflow as tf

# Quiet Python-side TF and autograph logging as well.
tf.get_logger().setLevel("ERROR")
tf.autograph.set_verbosity(0)

import tf_keras
from tf_keras.models import Model
from tf_keras.layers import Lambda, Input, Dense, RepeatVector, Reshape, Add
from tf_keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tf_keras.optimizers.legacy import RMSprop
from tf_keras import backend as K
from scipy.special import gammaln
import statsmodels.api as sm
25
# Reset Keras Session
def reset_keras():
    """Clear the global Keras backend session, releasing graph state."""
    K.clear_session()
28
+
29
+
30
class ZINBLogLik:
    """Negative log-likelihood loss for (zero-inflated) negative binomial.

    ``loss`` is used as a Keras loss function: ``y_pred`` carries log(mu),
    while the inflation logits ``pi`` and log-dispersion ``log_theta`` are
    symbolic tensors wired in at construction time. The per-output mean NLL
    is cached in ``self.llf`` so it can be read back with ``K.function``
    after training (see TensorZINB.fit).
    """

    def __init__(
        self, pi=None, log_theta=None, nb_only=False, scope="zinb/", zero_threshold=1e-8
    ):
        # pi: zero-inflation logits tensor (None when nb_only)
        # log_theta: log-dispersion tensor
        # zero_threshold: counts below this value are treated as zeros
        self.pi = pi
        self.zero_threshold = zero_threshold
        self.scope = scope
        self.log_theta = log_theta
        self.nb_only = nb_only
        # populated on first loss() call; read back by the caller
        self.y = None
        self.llf = None

    def loss(self, y_true, y_pred):
        """Return the summed per-output mean negative log-likelihood.

        NOTE: omits the constant -lgamma(y+1) term; TensorZINB.fit adds it
        back via scipy's gammaln when reporting log likelihoods.
        """
        with tf.name_scope(self.scope):
            y = tf.cast(y_true, tf.float32)
            # mu is already in log
            mu = tf.cast(y_pred, tf.float32)
            log_theta = self.log_theta
            theta = tf.math.exp(log_theta)

            # t1..t5 accumulate log NB(y; mu, theta) term by term
            t1 = tf.math.lgamma(y + theta)
            t2 = -tf.math.lgamma(theta)
            t3 = theta * log_theta
            t4 = y * mu
            # ty = log(theta + exp(mu)) computed stably via logsumexp
            ty = tf.reduce_logsumexp(tf.stack([log_theta, mu], axis=0), axis=0)
            t5 = -(theta + y) * ty

            if self.nb_only:
                result = -(t1 + t2 + t3 + t4 + t5)
            else:
                # log_q0 = log P(not inflated) = log sigmoid(-pi)
                log_q0 = -tf.nn.softplus(-self.pi)
                # log_q1 = -tf.nn.softplus(self.pi) = -tf.nn.softplus(-self.pi) - self.pi
                log_q1 = log_q0 - self.pi

                nb_case = -(t1 + t2 + t3 + t4 + t5 + log_q1)

                # zero observations mix the inflation mass with NB(0)
                p1 = theta * (log_theta - ty) + log_q1
                zero_case = -tf.reduce_logsumexp(tf.stack([log_q0, p1], axis=0), axis=0)
                result = tf.where(tf.less(y, self.zero_threshold), zero_case, nb_case)
            # cache per-output mean NLL and the label tensor for later retrieval
            self.llf = tf.reduce_mean(result, axis=0)
            self.y = y
            result = tf.reduce_sum(self.llf)

        return result
74
+
75
+
76
class PredictionCallback(tf_keras.callbacks.Callback):
    """Keras callback that records all model weights, flattened into a
    single vector, at the end of every epoch."""

    def on_train_begin(self, logs={}):
        # one flattened weight vector per completed epoch
        self.weights = []

    def on_epoch_end(self, epoch, logs={}):
        flat_parts = [np.array(w.flatten()) for w in self.model.get_weights()]
        snapshot = np.concatenate(flat_parts).flatten()
        self.weights.append(snapshot)
85
+
86
+
87
class ReduceLROnPlateauSkip(ReduceLROnPlateau):
    """ReduceLROnPlateau variant that ignores the first ``num_epoch_skip``
    epochs before the plateau logic is allowed to kick in."""

    def __init__(
        self,
        monitor="val_loss",
        factor=0.1,
        patience=10,
        verbose=0,
        mode="auto",
        min_delta=1e-4,
        cooldown=0,
        min_lr=0,
        num_epoch_skip=3,
        **kwargs,
    ):
        # number of initial epochs during which the LR is never reduced
        self.num_epoch_skip = num_epoch_skip

        super().__init__(
            monitor=monitor,
            factor=factor,
            patience=patience,
            verbose=verbose,
            mode=mode,
            min_delta=min_delta,
            cooldown=cooldown,
            min_lr=min_lr,
            **kwargs,
        )

    def on_epoch_end(self, epoch, logs=None):
        # delegate only once the skip window has passed
        if epoch >= self.num_epoch_skip:
            super().on_epoch_end(epoch, logs)
119
+
120
+
121
class TensorZINB:
    """Negative binomial (NB) / zero-inflated NB (ZINB) solver on TensorFlow.

    Fits all subjects (columns of ``endog``) as one batched Keras model:
    per-subject weights for the NB mean (``x_mu``) and inflation logits
    (``x_pi``), optional common weights shared across subjects (``z_mu``,
    ``z_pi``), and per-subject or shared log-dispersion (``theta``).
    """

    def __init__(
        self,
        endog,
        exog,
        exog_c=None,
        exog_infl=None,
        exog_infl_c=None,
        same_dispersion=False,
        nb_only=False,
        **kwargs,
    ):
        # endog: counts, shape (num_sample,) or (num_sample, num_out)
        self.endog = endog
        if len(endog.shape) == 1:
            self.num_sample = len(endog)
            self.num_out = 1
            self.endog = endog.reshape((-1, 1))
        else:
            self.num_sample, self.num_out = np.shape(endog)

        # df_model: per-subject dof; df_model_c: dof of common weights
        df_model = 0
        df_model_c = 0

        self.exog = exog
        # rank, not column count, so collinear designs are not over-counted
        df_model = np.linalg.matrix_rank(exog)
        if len(exog.shape) == 1:
            self.k_exog = 1
            self.exog = exog.reshape((-1, 1))
        else:
            self.k_exog = exog.shape[1]

        self.exog_c = exog_c
        if exog_c is None:
            self.k_exog_c = 0
            self._no_exog_c = True
            # zero-width placeholder keeps the Keras input signature stable
            self.exog_c = np.zeros((self.num_sample, self.k_exog_c), dtype=np.float64)
        else:
            self.k_exog_c = exog_c.shape[1]
            self._no_exog_c = False
            if self.k_exog_c > 0:
                df_model_c = df_model_c + np.linalg.matrix_rank(exog_c)

        self.nb_only = nb_only
        # no inflation features at all implies a plain NB model
        if exog_infl is None and exog_infl_c is None:
            self.nb_only = True

        self.exog_infl = exog_infl
        if exog_infl is None or self.nb_only:
            self.k_exog_infl = 0
            self._no_exog_infl = True
            self.exog_infl = np.ones(
                (self.num_sample, self.k_exog_infl), dtype=np.float64
            )
        else:
            self.k_exog_infl = exog_infl.shape[1]
            self._no_exog_infl = False
            if self.k_exog_infl > 0:
                df_model = df_model + np.linalg.matrix_rank(exog_infl)

        self.exog_infl_c = exog_infl_c
        if exog_infl_c is None or self.nb_only:
            self.k_exog_infl_c = 0
            self._no_exog_infl_c = True
            self.exog_infl_c = np.ones(
                (self.num_sample, self.k_exog_infl_c), dtype=np.float64
            )
        else:
            self.k_exog_infl_c = exog_infl_c.shape[1]
            self._no_exog_infl_c = False
            if self.k_exog_infl_c > 0:
                df_model_c = df_model_c + np.linalg.matrix_rank(exog_infl_c)

        # per-subject dof plus shared dof counted once overall
        df_model_each = df_model + df_model_c
        df_model = df_model * self.num_out + df_model_c

        self.same_dispersion = same_dispersion
        if same_dispersion:
            # NOTE: attribute name "k_disperson" is a typo kept for
            # backward compatibility with existing callers.
            self.k_disperson = 1
            df_model = df_model + 1
        else:
            self.k_disperson = self.num_out
            df_model = df_model + self.num_out
        # each subject carries one dispersion parameter in its own dof
        df_model_each = df_model_each + 1
        self.df_model = df_model
        self.df_model_each = df_model_each

        self.loglike_method = "nb2"

    def fit(
        self,
        init_weights={},
        init_method="poi",
        device_type="CPU",
        device_name=None,
        return_history=False,
        epochs=5000,
        learning_rate=0.01,
        num_epoch_skip=3,
        is_early_stop=True,
        is_reduce_lr=True,
        min_delta_early_stop=0.05,
        patience_early_stop=50,
        factor_reduce_lr=0.8,
        patience_reduce_lr=10,
        min_lr=0.001,
        reset_keras_session=False,
        **kwargs,
    ):
        """Build and train the (ZI)NB Keras model; return a results dict.

        Returns keys: llf_total, llfs, aic_total, aics, df_model_total, df,
        weights, cpu_time, num_sample, epochs (+ loss_history /
        weights_history when return_history=True).
        """
        if device_name is None:
            devices = tf.config.list_logical_devices(device_type)
            device_name = devices[0].name

        num_sample = self.num_sample
        num_out = self.num_out
        num_feat = self.k_exog
        num_feat_infl = self.k_exog_infl
        num_feat_c = self.k_exog_c
        num_feat_infl_c = self.k_exog_infl_c
        num_dispersion = self.k_disperson

        # initiate weights
        if len(init_weights) == 0:
            if init_method == "poi":
                init_weights = self._poisson_init()
            elif init_method == "nb":
                init_weights = self._nb_init()

        # use distinct names to retrieve weights from layers
        weight_keys = ["x_mu", "z_mu", "x_pi", "z_pi", "theta"]

        if reset_keras_session:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=DeprecationWarning)
                reset_keras()
                K.clear_session()

        with tf.device(device_name):
            # graph mode is required for the K.function llf retrieval below
            tf.compat.v1.disable_eager_execution()

            inputs = Input(shape=(num_feat,))
            inputs_infl = Input(shape=(num_feat_infl,))
            inputs_c = Input(shape=(num_feat_c,))
            inputs_infl_c = Input(shape=(num_feat_infl_c,))
            # constant-1 input whose Dense weight IS the log-dispersion
            inputs_theta = Input(shape=(1,))

            # log(mu) head: per-subject weights, optional shared component
            if 'x_mu' in init_weights:
                x = Dense(num_out, use_bias=False, name='x_mu', weights=[init_weights['x_mu']])(inputs)
            else:
                x = Dense(num_out, use_bias=False, name='x_mu')(inputs)
            if num_feat_c > 0:
                if 'z_mu' in init_weights:
                    x_c = Dense(1, use_bias=False, name='z_mu', weights=[init_weights['z_mu']])(inputs_c)
                else:
                    x_c = Dense(1, use_bias=False, name='z_mu')(inputs_c)
                predictions = Add()([x, x_c])
            else:
                predictions = x

            # inflation-logit head (skipped entirely for plain NB)
            if self.nb_only:
                pi = None
            else:
                if 'x_pi' in init_weights:
                    x_infl = Dense(num_out, use_bias=False, name='x_pi', weights=[init_weights['x_pi']])(inputs_infl)
                else:
                    x_infl = Dense(num_out, use_bias=False, name='x_pi')(inputs_infl)

                if num_feat_infl_c > 0:
                    if 'z_pi' in init_weights:
                        x_infl_c = Dense(1, use_bias=False, name='z_pi', weights=[init_weights['z_pi']])(inputs_infl_c)
                    else:
                        x_infl_c = Dense(1, use_bias=False, name='z_pi')(inputs_infl_c)
                    pi = Add()([x_infl, x_infl_c])
                else:
                    pi = x_infl

            # log-dispersion head; broadcast to all subjects if shared
            if 'theta' in init_weights:
                theta0 = Dense(num_dispersion, use_bias=False, name='theta', weights=[init_weights['theta']])(inputs_theta)
            else:
                theta0 = Dense(num_dispersion, use_bias=False, name='theta')(inputs_theta)
            if self.same_dispersion:
                theta = Reshape((num_out,))(RepeatVector(num_out)(theta0))
            else:
                theta = theta0

            zinb = ZINBLogLik(pi, theta, nb_only=self.nb_only)

            # the Lambda only forwards log(mu); pi/theta ride along so they
            # stay part of the graph and receive gradients via the loss
            if self.nb_only:
                output = Lambda(lambda x: x[0])([predictions, theta])
            else:
                output = Lambda(lambda x: x[0])([predictions, pi, theta])

            model = Model(
                inputs=[inputs, inputs_c, inputs_infl, inputs_infl_c, inputs_theta],
                outputs=[output],
            )

            opt = RMSprop(learning_rate=learning_rate)
            model.compile(loss=zinb.loss, optimizer=opt)

            callbacks = []
            if is_early_stop:
                early_stop = EarlyStopping(
                    monitor="loss",
                    # scale tolerance by batch size since loss is a mean
                    min_delta=min_delta_early_stop / num_sample,
                    patience=patience_early_stop,
                    mode="min",
                )
                callbacks.append(early_stop)

            if is_reduce_lr:
                reduce_lr = ReduceLROnPlateauSkip(
                    monitor="loss",
                    factor=factor_reduce_lr,
                    patience=patience_reduce_lr,
                    min_lr=min_lr,
                    num_epoch_skip=num_epoch_skip,
                )
                callbacks.append(reduce_lr)

            if return_history:
                # this get all weights over epoch
                get_weights = PredictionCallback()
                callbacks.append(get_weights)

            # TODO: FIX this. code randomly crashes on apple silicon M1/M2 with
            # error `Incompatible shapes`. code usually runs fine after second try.
            # similar to this issue https://developer.apple.com/forums/thread/701985
            # NOTE(review): if all 10 attempts fail, `losses`/`cpu_time` stay
            # undefined and a NameError follows below — consider re-raising.
            for i in range(10):
                try:
                    start_time = time.time()
                    # full-batch training: one gradient step per epoch
                    losses = model.fit(
                        [
                            self.exog,
                            self.exog_c,
                            self.exog_infl,
                            self.exog_infl_c,
                            np.ones((num_sample, 1)),
                        ],
                        [self.endog],
                        callbacks=callbacks,
                        batch_size=num_sample,
                        epochs=epochs,
                        verbose=0,
                    )
                    cpu_time = time.time() - start_time
                    break
                except Exception as e:
                    print(model.summary())
                    print(e)
                    continue

            # retrieve LL
            get_llfs = K.function(
                [inputs, inputs_c, inputs_infl, inputs_infl_c, inputs_theta, zinb.y],
                [zinb.llf],
            )
            llft = get_llfs(
                [
                    self.exog,
                    self.exog_c,
                    self.exog_infl,
                    self.exog_infl_c,
                    np.ones((num_sample, 1)),
                    self.endog,
                ]
            )[0]

            # undo the per-sample mean and add back the -lgamma(y+1) constant
            # omitted by ZINBLogLik.loss
            llfs = -(llft * num_sample + np.sum(gammaln(self.endog + 1), axis=0))
            aics = -2 * (llfs - self.df_model_each)

            llf = np.sum(llfs)
            aic = -2 * (llf - self.df_model)

            # get weights
            weights = model.get_weights()

            names_t = [
                (weight.name).split("/")[0]
                for layer in model.layers
                for weight in layer.weights
            ]
            # tf layer weight has subscript in names
            weight_names = []
            for name in names_t:
                matched = name
                for nt in weight_keys:
                    if nt in name:
                        matched = nt
                        break
                weight_names.append(matched)

            weights_dict = dict(zip(weight_names, weights))

            res = {
                "llf_total": llf,
                "llfs": llfs,
                "aic_total": aic,
                "aics": aics,
                "df_model_total": self.df_model,
                "df": self.df_model_each,
                "weights": weights_dict,
                "cpu_time": cpu_time,
                "num_sample": num_sample,
                "epochs": len(losses.history["loss"]),
            }

            if return_history:
                res["loss_history"] = losses.history["loss"]
                res["weights_history"] = np.array(get_weights.weights)

            return res

    # https://github.com/statsmodels/statsmodels/blob/main/statsmodels/discrete/discrete_model.py#L3691
    def _estimate_dispersion(self, mu, resid, df_resid=None, loglike_method="nb2"):
        """Moment estimate of the NB dispersion alpha from Poisson residuals."""
        if df_resid is None:
            df_resid = resid.shape[0]
        if loglike_method == "nb2":
            a = ((resid**2 / mu - 1) / mu).sum() / df_resid
        else:  # self.loglike_method == 'nb1':
            a = (resid**2 / mu - 1).sum() / df_resid
        return a

    def _compute_pi_init(self, nz_prob, p_nonzero, infl_prob_max=0.99):
        """Initial inflation logit from observed vs NB-implied nonzero rates.

        Returns logit(ww) where ww is the excess-zero probability, capped by
        infl_prob_max to keep the logit finite.
        """
        ww = 1 - min(nz_prob / p_nonzero, infl_prob_max)
        return -np.log(1 / ww - 1)

    def _poisson_init_each(
        self,
        Y,
        estimate_infl=True,
        eps=1e-10,
        maxiter=100,
        theta_lb=0.05,
        intercept_var_th=1e-3,
        infl_prob_max=0.99,
    ):
        """Initialize weights for one subject from a Poisson fit.

        Returns a dict with "x_mu", "theta" and optionally "x_pi", or an
        empty dict when no usable initialization could be found.
        """
        find_poi_sol = True
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            try:
                poi_mod = sm.Poisson(Y, self.exog).fit(
                    maxiter=maxiter, disp=False, warn_convergence=False
                )
                if np.isnan(poi_mod.params).any():
                    find_poi_sol = False
                else:
                    mu = poi_mod.predict()
                    a = self._estimate_dispersion(
                        mu, poi_mod.resid, df_resid=poi_mod.df_resid
                    )
                    # lower-bound alpha so theta = 1/alpha stays finite
                    theta = 1 / max(a, theta_lb)
                    x_mu = np.reshape(poi_mod.params, (self.k_exog, 1))
            # NOTE(review): bare except silently hides any statsmodels error
            except:
                find_poi_sol = False

        if not find_poi_sol:
            # fallback: intercept-only fit using the sample mean
            vs = np.std(self.exog, axis=0)
            # find intercept index
            min_idx = np.argmin(vs)
            if vs[min_idx] < intercept_var_th:
                x_mu = np.zeros((self.k_exog, 1))
                mu = np.mean(Y)
                x_mu[min_idx] = np.log(mu) / np.mean(self.exog[:, min_idx])
                resid = Y - mu
                a = self._estimate_dispersion(mu, resid)
                if np.isnan(a) or np.isinf(a):
                    a = theta_lb
                theta = 1 / max(a, theta_lb)
            else:
                # no intercept column to fall back on: give up
                x_mu = None
                return {}

        weights = {"x_mu": x_mu, "theta": np.array([np.log(theta)]).reshape((-1, 1))}

        if not self._no_exog_infl and estimate_infl:
            pred = np.maximum(mu, 10 * eps)
            # NB-implied probability of a nonzero count
            p_nonzero = 1 - np.mean(np.power(theta / (theta + pred + eps), theta))

            # find intercept index
            vs = np.std(self.exog_infl, axis=0)
            min_idx = np.argmin(vs)
            if vs[min_idx] < intercept_var_th:
                nz_prob = np.mean(Y > 0)
                x_pi = np.zeros((self.k_exog_infl, 1))
                fv = np.mean(self.exog_infl[:, min_idx])
                w_pi = self._compute_pi_init(
                    nz_prob, p_nonzero, infl_prob_max=infl_prob_max
                )
                x_pi[min_idx] = w_pi / fv
                weights["x_pi"] = x_pi

        return weights

    def _poisson_init(
        self,
        eps=1e-10,
        maxiter=100,
        theta_lb=0.05,
        intercept_var_th=1e-3,
        infl_prob_max=0.99,
    ):
        """Poisson-based initialization for all subjects.

        Stacks the per-subject initializations column-wise; returns {} if
        any subject fails to initialize its "x_mu".
        """
        x_mu = []
        x_pi = []
        theta = []
        return_x_pi = True
        return_theta = True
        for i in range(self.num_out):
            w = self._poisson_init_each(
                self.endog[:, i],
                eps=eps,
                maxiter=maxiter,
                theta_lb=theta_lb,
                intercept_var_th=intercept_var_th,
                infl_prob_max=infl_prob_max,
            )

            if len(w) == 0:
                return {}

            if "x_mu" in w:
                x_mu.append(w["x_mu"])
            else:
                return {}

            if "x_pi" in w:
                x_pi.append(w["x_pi"])
            else:
                # only return x_pi/theta if every subject produced one
                return_x_pi = False

            if "theta" in w:
                theta.append(w["theta"])
            else:
                return_theta = False
        weights = {"x_mu": np.concatenate(x_mu, axis=1)}
        if return_x_pi:
            weights["x_pi"] = np.concatenate(x_pi, axis=1)
        if return_theta:
            t = np.concatenate(theta, axis=1)
            if self.same_dispersion:
                weights["theta"] = np.array(np.mean(t)).reshape((-1, 1))
            else:
                weights["theta"] = t
        return weights

    def _nb_init(self, infl_prob_max=0.99, intercept_var_th=1e-3):
        """Initialize ZINB weights by first fitting a plain NB model, then
        deriving inflation logits from the excess-zero rates."""
        nb_mod = TensorZINB(
            self.endog,
            self.exog,
            exog_c=self.exog_c,
            same_dispersion=self.same_dispersion,
            nb_only=True,
        )
        nb_res = nb_mod.fit(init_method="poi")
        weights = nb_res["weights"]

        if self._no_exog_infl:
            return weights
        # find intercept index
        vs = np.std(self.exog_infl, axis=0)
        min_idx = np.argmin(vs)
        # do not compute logit weight if there is no intercept
        if vs[min_idx] < intercept_var_th:
            x_pi = np.zeros((self.k_exog_infl, self.num_out))
            fv = np.mean(self.exog_infl[:, min_idx])
            if self.same_dispersion:
                theta = np.exp(
                    np.array(list(weights["theta"].flatten()) * self.num_out)
                )
            else:
                theta = np.exp(weights["theta"].flatten())

            mu_c = 0
            if self.k_exog_c > 0 and "z_mu" in weights:
                mu_c = np.dot(self.exog_c, weights["z_mu"])

            for i in range(self.num_out):
                mu = np.dot(self.exog, weights["x_mu"][:, i]) + mu_c
                mu = np.exp(mu)
                p_nonzero = 1 - np.mean(np.power(theta[i] / (theta[i] + mu), theta[i]))
                nz_prob = np.mean(self.endog[:, i] > 0)
                w_pi = self._compute_pi_init(
                    nz_prob, p_nonzero, infl_prob_max=infl_prob_max
                )
                x_pi[min_idx, i] = w_pi / fv

            weights["x_pi"] = x_pi

        if self.k_exog_infl_c > 0:
            weights["z_pi"] = np.zeros((self.k_exog_infl_c, 1))

        return weights
tensorzinb/utils.py ADDED
@@ -0,0 +1,62 @@
1
+ import numpy as np
2
+ from sklearn.preprocessing import StandardScaler
3
+ from numpy.linalg import matrix_rank
4
+
5
def LI_vecs(dim, M):
    """Return indices of a maximal linearly independent set of rows of M.

    Scans the first `dim` rows in order and keeps row i only if it raises
    the rank of the rows kept so far (greedy basis selection).

    Fix: the original unconditionally seeded the result with index 0, so a
    zero (or otherwise dependent) first row was wrongly reported as
    independent; it also rebuilt a copy of the kept rows on every iteration.
    """
    basis = []
    idxs = []
    for i in range(dim):
        # matrix_rank does not need a square matrix; test whether M[i]
        # is linearly independent of all rows kept so far
        if matrix_rank(basis + [M[i]]) > len(basis):
            basis.append(M[i])
            idxs.append(i)
    return idxs
17
+
18
def find_independent_columns(exog):
    """Return the labels of a maximal linearly independent column subset.

    If the design matrix already has full column rank, all column labels
    are returned; otherwise LI_vecs greedily selects an independent subset.
    """
    n_cols = np.shape(exog)[1]
    if np.linalg.matrix_rank(exog) == n_cols:
        return np.array(exog.columns)
    keep = LI_vecs(n_cols, exog.T.values)
    return np.array(exog.columns[keep])
23
+
24
def normalize_features(df_feature, features_to_norm):
    """Z-score the listed columns of df_feature (mutates it in place).

    Each column is independently standardized to zero mean and unit
    variance with sklearn's StandardScaler; the same frame is returned.
    """
    scaler = StandardScaler()
    for col in features_to_norm:
        column_2d = df_feature[col].values.reshape(-1, 1)
        df_feature[col] = scaler.fit_transform(column_2d).flatten()
    return df_feature
29
+
30
+ # https://stackoverflow.com/a/21739593
31
# https://stackoverflow.com/a/21739593
def correct_pvalues_for_multiple_testing(pvalues, correction_type="Benjamini-Hochberg"):
    """Adjust p-values for multiple testing.

    Supported correction_type values: "Bonferroni", "Bonferroni-Holm",
    "Benjamini-Hochberg" (default). Consistent with R, e.g.
    correct_pvalues_for_multiple_testing([0.0, 0.01, 0.029, 0.03, 0.031,
    0.05, 0.069, 0.07, 0.071, 0.09, 0.1]).

    Returns a numpy array of adjusted p-values in the original order.
    (Fix: dropped the function-level `from numpy import array, empty` —
    `empty` was unused and the module already imports numpy as np.)
    """
    pvalues = np.asarray(pvalues, dtype=float)
    n = pvalues.shape[0]
    new_pvalues = np.zeros(n)
    if correction_type == "Bonferroni":
        new_pvalues = n * pvalues
    elif correction_type == "Bonferroni-Holm":
        # step-down: the k-th smallest p-value is scaled by (n - k)
        values = sorted((pvalue, i) for i, pvalue in enumerate(pvalues))
        for rank, (pvalue, i) in enumerate(values):
            new_pvalues[i] = (n - rank) * pvalue
    elif correction_type == "Benjamini-Hochberg":
        # step-up: scale by n/rank, then enforce monotonicity going from
        # the largest p-value downwards
        values = sorted(((pvalue, i) for i, pvalue in enumerate(pvalues)), reverse=True)
        new_values = [(n / (n - i)) * pvalue for i, (pvalue, _) in enumerate(values)]
        for i in range(n - 1):
            if new_values[i] < new_values[i + 1]:
                new_values[i + 1] = new_values[i]
        for i, (pvalue, index) in enumerate(values):
            new_pvalues[index] = new_values[i]
    return new_pvalues
@@ -0,0 +1,218 @@
1
+ Metadata-Version: 2.4
2
+ Name: tensorzinb-plusplus
3
+ Version: 0.0.2
4
+ Summary: Zero Inflated Negative Binomial regression for scRNA-seq (maintained fork of tensorzinb by Tao Cui & Tingting Wang)
5
+ Project-URL: Homepage, https://github.com/saarantras/tensorzinb-plusplus
6
+ Project-URL: Original Repository, https://github.com/wanglab-georgetown/tensorzinb
7
+ Author: Tingting Wang
8
+ Author-email: Tao Cui <taocui.caltech@gmail.com>
9
+ Maintainer-email: Mackenzie Noon <me@mackenzienoon.com>
10
+ License: Apache-2.0
11
+ License-File: LICENSE
12
+ Keywords: differential expression,scRNA-seq,tensorflow,zero-inflated negative binomial
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: Apache Software License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
23
+ Requires-Python: <3.13,>=3.9
24
+ Requires-Dist: numpy>=1.23.5
25
+ Requires-Dist: pandas>=1.5.2
26
+ Requires-Dist: patsy>=0.5.3
27
+ Requires-Dist: scikit-learn>=1.2.0
28
+ Requires-Dist: scipy>=1.9.3
29
+ Requires-Dist: statsmodels>=0.13.5
30
+ Requires-Dist: tensorflow>=2.16
31
+ Requires-Dist: tf-keras>=2.16
32
+ Description-Content-Type: text/markdown
33
+
34
+ ## Zero-inflated Negative Binomial Model using TensorFlow
35
+
36
+ TensorZINB is a Python module that uses TensorFlow to effectively solve negative binomial (NB) and zero-inflated negative binomial (ZINB) models. One of its key strengths is its ability to accurately calculate the NB/ZINB log likelihood. Additionally, it can be used for differentially expressed gene (DEG) analysis in the context of single-cell RNA sequencing (scRNA-seq). This package distinguishes itself by ensuring numerical stability, enabling the processing of datasets in batches, and delivering superior computing speeds compared to other existing NB/ZINB solvers. To guarantee the reliability of its analysis results, TensorZINB has undergone rigorous testing against various statistical packages. TensorZINB supports the execution of various features on both the negative binomial and zero-inflated (logit) components. Furthermore, it allows for the use of common features with the same weights across multiple subjects within a batch.
37
+
38
+
39
+ The negative binomial distribution is
40
+ $$NB(y;\mu,\theta)=\frac{\Gamma(y+\theta)}{\Gamma(\theta)\Gamma(y+1)}\left( \frac{\theta}{\theta+\mu}\right)^\theta\left(\frac{\mu}{\theta+\mu}\right)^y$$
41
+ where $\mu$ is the mean and $\theta$ is the dispersion parameter. For zero-inflated models, the counts are modelled as a mixture of the Bernoulli distribution and count distribution, i.e.,
42
+
43
+ $$
44
+ Pr(Y=0)=\pi+(1-\pi)NB(0),\\
45
+ Pr(Y=y)=(1-\pi)NB(y),y>0.
46
+ $$
47
+
48
+ We use the following model parameterization
49
+
50
+ $$
51
+ \log \mu_g =X_{\mu}\beta_{g,\mu}+Z_{\mu}\alpha_{\mu},
52
+ logit \pi_g =X_{\pi}\beta_{g,\pi}+Z_{\pi}\alpha_{\pi}, \log \theta_g = \beta_{g,\theta},
53
+ $$
54
+
55
+ where $\mu_g$ is the mean of subject $g$, $X_{\mu}$, $Z_{\mu}$, $X_{\pi}$ and $Z_{\pi}$ are feature matrices, $\beta_{g,\mu}$ and $\beta_{g,\pi}$ are coefficients for each subject $g$, $\alpha_{\mu}$ and $\alpha_{\pi}$ are common coefficients shared across all subjects.
56
+
57
+
58
+ ## Installation
59
+
60
+ After downloading this repo, `cd` to the directory of downloaded repo and run:
61
+
62
+ `python setup.py install`
63
+
64
+ or
65
+
66
+ `pip install .`
67
+
68
+ For Apple silicon (M1, M2 and etc), it is recommended to install TensorFlow by following the command in Troubleshooting Section below.
69
+
70
+ ## Model Estimation
71
+
72
+ `TensorZINB` solves the negative binomial (NB) and zero-inflated negative binomial (ZINB) models with given read counts.
73
+
74
+ ### Model initialization
75
+
76
+ ``` r
77
+ TensorZINB(
78
+ endog, # counts data: number of samples x number of subjects
79
+ exog, # observed variables for the negative binomial part
80
+ exog_c=None, # common observed variables across all subjects for the nb part
81
+ exog_infl=None, # observed variables for the logit part
82
+ exog_infl_c=None, # common observed variables across all subjects for the logit part
83
+ same_dispersion=False, # whether all subjects use the same dispersion
84
+ nb_only=False, # whether negative binomial only without logit or zero-inflation part
85
+ )
86
+ ```
87
+
88
+ ### Model fit
89
+
90
+ ``` r
91
+ TensorZINB.fit(
92
+ init_weights={}, # initial model weights. If empty, init_method is used to find init weights
93
+ init_method="poi", # initialization method: `poi` for Poisson and `nb` for negative binomial
94
+ device_type="CPU", # device_type: `CPU` or `GPU`
95
+ device_name=None, # None or one from `tf.config.list_logical_devices()`
96
+ return_history=False, # whether return loss and weights history during training
97
+ epochs=5000, # maximum number of epochs to run
98
+ learning_rate=0.01, # start learning rate
99
+ num_epoch_skip=3, # number of epochs to skip learning rate reduction
100
+ is_early_stop=True, # whether use early stop
101
+ min_delta_early_stop=0.05,# minimum change in loss to qualify as an improvement
102
+ patience_early_stop=50, # number of epochs with no improvement after which training will be stopped
103
+ factor_reduce_lr=0.8, # factor by which the learning rate will be reduced
104
+ patience_reduce_lr=10, # number of epochs with no improvement after which learning rate will be reduced
105
+ min_lr=0.001, # lower bound on the learning rate
106
+ reset_keras_session=False,# reset keras session at the beginning
107
+ )
108
+ ```
109
+
110
+ ### Model results
111
+
112
+ ``` r
113
+ {
114
+ "llf_total": # sum of log likelihood across all subjects
115
+ "llfs": # an array contains log likelihood for each subject
116
+ "aic_total": # sum of AIC across all subjects
117
+ "aics": # an array contains AIC for each subject
118
+ "df_model_total": # total degree of freedom of all subjects
119
+ "df": # degree of freedom for each subject
120
+ "weights": # model weights
121
+ "cpu_time": # total computing time for all subjects
122
+ "num_sample": # number of samples
123
+ "epochs": # number of epochs run
124
+ "loss_history": # loss history over epochs if return_history=True
125
+ "weights_history": # weights history over epochs if return_history=True
126
+ }
127
+ ```
128
+
129
+ ## DEG Analysis
130
+
131
+ `LRTest` provides a utility for scRNA-seq DEG analysis. It runs the likelihood ratio test (LRT) by computing the log likelihood difference with and without conditions being added to the model.
132
+
133
+ To construct a `LRTest` object, we use
134
+ ``` python
135
+ LRTest(
136
+ df_data, # count data frame. columns: subjects (genes), rows: samples
137
+ df_feature, # feature data frame. columns: features, rows: samples
138
+ conditions, # list of features to test DEG, e.g., diagnosis
139
+ nb_features, # list of features for the negative binomial model
140
+ nb_features_c=None, # list of common features for the negative binomial model
141
+ infl_features=None, # list of features for the zero inflated (logit) model
142
+ infl_features_c=None, # list of common features for the zero inflated (logit) model
143
+ add_intercept=True, # whether add intercept. False if df_feature already contains intercept
144
+ nb_only=False, # whether to use only negative binomial without zero inflation
145
+ same_dispersion=False, # whether all subjects use the same dispersion
146
+ )
147
+ ```
148
+
149
+ We then call `LRTest.run` to run the likelihood ratio test
150
+ ``` python
151
+ LRTest.run(
152
+ learning_rate=0.008, # learning rate
153
+ epochs=5000, # number of epochs run
154
+ )
155
+ ```
156
+
157
+ The `LRTest.run` returns a result dataframe `dfr` with columns:
158
+ ``` python
159
+ [
160
+ "ll0": # log likelihood without conditions
161
+ "aic0": # AIC without conditions
162
+ "df0": # degree of freedom without conditions
163
+ "cpu_time0": # computing time for each subject without conditions
164
+ "ll1": # log likelihood without conditions
165
+ "aic1": # AIC with conditions
166
+ "df1": # degree of freedom with conditions
167
+ "cpu_time1": # computing time for each subject with conditions
168
+ "lld": # ll1 - ll0
169
+ "aicd": # aic1 - aic0
170
+ "pvalue": # p-value: 1 - stats.chi2.cdf(2 * lld, df1 - df0)
171
+ ]
172
+ ```
173
+
174
+
175
+ `tensorzinb.utils` provides utility functions:
176
+
177
+ - `normalize_features`: normalize scRNA-seq features by removing the mean and scaling to unit variance.
178
+ - `correct_pvalues_for_multiple_testing`: correct pvalues for multiple testing in Python, which is the same as `p.adjust` in `R`.
179
+
180
+ We can further correct pvalues for multiple testing by calling `correct_pvalues_for_multiple_testing(dfr['pvalue'])`.
181
+
182
+ ## Example
183
+
184
+ An example code to show how to use `TensorZINB` and `LRTest` to perform DEG analysis can be found at [`examples/deg_example.ipynb`](examples/deg_example.ipynb). The example runs DEG analysis on a sample dataset with 17 clusters and 20 genes in each cluster.
185
+
186
+
187
+ ## Tests
188
+
189
+ In `tests/tensorzinb.ipynb`, we show several tests:
190
+
191
+ - validate the Poisson weights initialization.
192
+ - compare with `statsmodels` for negative binomial model only without zero-inflation to make sure the results match.
193
+ - show `statsmodels` is not numerically stable for zero-inflated negative binomial. `statsmodels` can only return results when initialized with TensorZINB results. TensorZINB results match the true parameters used to generate the samples.
194
+
195
+ More tests can be found in https://github.com/wanglab-georgetown/countmodels/blob/main/tests/zinb_test.ipynb
196
+
197
+
198
+ ## Troubleshooting
199
+
200
+ ### Run on Apple silicon
201
+ To run tensorflow on Apple silicon (M1, M2, etc), install TensorFlow using the following:
202
+
203
+ `conda install -c apple tensorflow-deps`
204
+
205
+ `python -m pip install tensorflow-macos==2.9.2`
206
+
207
+ `python -m pip install tensorflow-metal==0.5.1`
208
+
209
+ ### Feature normalization
210
+
211
+ If the solver cannot return correct results, please ensure features in $X$ are normalized by using `StandardScaler()`. Please refer to the example in [`examples/deg_example.ipynb`](examples/deg_example.ipynb).
212
+
213
+
214
+ ## Reference
215
+ Cui, T., Wang, T. [A Comprehensive Assessment of Hurdle and Zero-inflated Models for Single Cell RNA-sequencing Analysis](https://doi.org/10.1093/bib/bbad272), Briefings in Bioinformatics, July 2023. https://doi.org/10.1093/bib/bbad272
216
+
217
+ ## Support and Contribution
218
+ If you encounter any bugs while using the code, please don't hesitate to create an issue on the project's GitHub repository.
@@ -0,0 +1,8 @@
1
+ tensorzinb/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ tensorzinb/lrtest.py,sha256=eAwPdr5cKdJzVED5IGdHe9ZTjewhzqu_56Q6HSqGrt4,5680
3
+ tensorzinb/tensorzinb.py,sha256=mR995dyAegF51pIcDf_WtLsxAVTUn6A6ycMTTT9q6cQ,20535
4
+ tensorzinb/utils.py,sha256=whRWe4TpGqsQ3R6p8eHjvquEBDm0IKX00WMisduN1sg,3429
5
+ tensorzinb_plusplus-0.0.2.dist-info/METADATA,sha256=AMuET10jEXK_oELgMvRLFRrHN0tv3joAKQN2NgArprI,10850
6
+ tensorzinb_plusplus-0.0.2.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
7
+ tensorzinb_plusplus-0.0.2.dist-info/licenses/LICENSE,sha256=vdzedejlUZ0EzkJtQR3a_QIOkx5EHPrn04D_T5k-uDk,566
8
+ tensorzinb_plusplus-0.0.2.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,13 @@
1
+ Copyright 2020-2023 Tao Cui, Tingting Wang
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.