tensorzinb-plusplus 0.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tensorzinb_plusplus-0.0.2/.github/workflows/publish.yml +54 -0
- tensorzinb_plusplus-0.0.2/.gitignore +47 -0
- tensorzinb_plusplus-0.0.2/LICENSE +13 -0
- tensorzinb_plusplus-0.0.2/PKG-INFO +218 -0
- tensorzinb_plusplus-0.0.2/README.md +185 -0
- tensorzinb_plusplus-0.0.2/examples/deg_example.ipynb +521 -0
- tensorzinb_plusplus-0.0.2/examples/meta.zip +0 -0
- tensorzinb_plusplus-0.0.2/examples/model_sel_count.zip +0 -0
- tensorzinb_plusplus-0.0.2/examples/model_sel_genes.csv +341 -0
- tensorzinb_plusplus-0.0.2/pyproject.toml +54 -0
- tensorzinb_plusplus-0.0.2/tensorzinb/__init__.py +0 -0
- tensorzinb_plusplus-0.0.2/tensorzinb/lrtest.py +176 -0
- tensorzinb_plusplus-0.0.2/tensorzinb/tensorzinb.py +611 -0
- tensorzinb_plusplus-0.0.2/tensorzinb/utils.py +62 -0
- tensorzinb_plusplus-0.0.2/tests/lrtest.ipynb +359 -0
- tensorzinb_plusplus-0.0.2/tests/tensorzinb.ipynb +1246 -0
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
# Trigger: only when a tag matching v* is pushed (e.g. v0.0.2, v1.0.0)
|
|
4
|
+
# This means normal commits to main never trigger a release — you control
|
|
5
|
+
# releases explicitly by pushing a tag.
|
|
6
|
+
on:
|
|
7
|
+
push:
|
|
8
|
+
tags:
|
|
9
|
+
- "v*"
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
build:
|
|
13
|
+
name: Build distribution
|
|
14
|
+
runs-on: ubuntu-latest
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
|
|
18
|
+
- uses: actions/setup-python@v5
|
|
19
|
+
with:
|
|
20
|
+
python-version: "3.11"
|
|
21
|
+
|
|
22
|
+
- name: Build wheel and sdist
|
|
23
|
+
run: |
|
|
24
|
+
pip install build
|
|
25
|
+
python -m build
|
|
26
|
+
|
|
27
|
+
# Upload the dist/ files as a workflow artifact so the publish job can
|
|
28
|
+
# download them. The two jobs run in separate VMs, so artifacts are how
|
|
29
|
+
# files travel between them.
|
|
30
|
+
- uses: actions/upload-artifact@v4
|
|
31
|
+
with:
|
|
32
|
+
name: dist
|
|
33
|
+
path: dist/
|
|
34
|
+
|
|
35
|
+
publish:
|
|
36
|
+
name: Publish to PyPI
|
|
37
|
+
needs: build # only runs if build succeeds
|
|
38
|
+
runs-on: ubuntu-latest
|
|
39
|
+
|
|
40
|
+
# These two settings enable Trusted Publishing (OIDC). PyPI verifies that
|
|
41
|
+
# this workflow is running from your repo before accepting the upload —
|
|
42
|
+
# no API token needed.
|
|
43
|
+
environment: pypi
|
|
44
|
+
permissions:
|
|
45
|
+
id-token: write # required for OIDC trusted publishing
|
|
46
|
+
|
|
47
|
+
steps:
|
|
48
|
+
- uses: actions/download-artifact@v4
|
|
49
|
+
with:
|
|
50
|
+
name: dist
|
|
51
|
+
path: dist/
|
|
52
|
+
|
|
53
|
+
- name: Publish to PyPI
|
|
54
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
|
|
2
|
+
# Compiled Java class files
|
|
3
|
+
*.class
|
|
4
|
+
|
|
5
|
+
# Compiled Python bytecode
|
|
6
|
+
*.py[cod]
|
|
7
|
+
|
|
8
|
+
# Log files
|
|
9
|
+
*.log
|
|
10
|
+
|
|
11
|
+
# Package files
|
|
12
|
+
*.jar
|
|
13
|
+
|
|
14
|
+
# Maven
|
|
15
|
+
target/
|
|
16
|
+
dist/
|
|
17
|
+
|
|
18
|
+
# JetBrains IDE
|
|
19
|
+
.idea/
|
|
20
|
+
|
|
21
|
+
# Unit test reports
|
|
22
|
+
TEST*.xml
|
|
23
|
+
|
|
24
|
+
# Generated by MacOS
|
|
25
|
+
.DS_Store
|
|
26
|
+
|
|
27
|
+
# Generated by Windows
|
|
28
|
+
Thumbs.db
|
|
29
|
+
|
|
30
|
+
# Applications
|
|
31
|
+
*.app
|
|
32
|
+
*.exe
|
|
33
|
+
*.war
|
|
34
|
+
|
|
35
|
+
# Large media files
|
|
36
|
+
*.mp4
|
|
37
|
+
*.tiff
|
|
38
|
+
*.avi
|
|
39
|
+
*.flv
|
|
40
|
+
*.mov
|
|
41
|
+
*.wmv
|
|
42
|
+
|
|
43
|
+
*checkpoint.ipynb
|
|
44
|
+
|
|
45
|
+
# Python packaging
|
|
46
|
+
*.egg-info/
|
|
47
|
+
__pycache__/
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Copyright 2020-2023 Tao Cui, Tingting Wang
|
|
2
|
+
|
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
you may not use this file except in compliance with the License.
|
|
5
|
+
You may obtain a copy of the License at
|
|
6
|
+
|
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
|
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
See the License for the specific language governing permissions and
|
|
13
|
+
limitations under the License.
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tensorzinb-plusplus
|
|
3
|
+
Version: 0.0.2
|
|
4
|
+
Summary: Zero Inflated Negative Binomial regression for scRNA-seq (maintained fork of tensorzinb by Tao Cui & Tingting Wang)
|
|
5
|
+
Project-URL: Homepage, https://github.com/saarantras/tensorzinb-plusplus
|
|
6
|
+
Project-URL: Original Repository, https://github.com/wanglab-georgetown/tensorzinb
|
|
7
|
+
Author: Tingting Wang
|
|
8
|
+
Author-email: Tao Cui <taocui.caltech@gmail.com>
|
|
9
|
+
Maintainer-email: Mackenzie Noon <me@mackenzienoon.com>
|
|
10
|
+
License: Apache-2.0
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: differential expression,scRNA-seq,tensorflow,zero-inflated negative binomial
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
23
|
+
Requires-Python: <3.13,>=3.9
|
|
24
|
+
Requires-Dist: numpy>=1.23.5
|
|
25
|
+
Requires-Dist: pandas>=1.5.2
|
|
26
|
+
Requires-Dist: patsy>=0.5.3
|
|
27
|
+
Requires-Dist: scikit-learn>=1.2.0
|
|
28
|
+
Requires-Dist: scipy>=1.9.3
|
|
29
|
+
Requires-Dist: statsmodels>=0.13.5
|
|
30
|
+
Requires-Dist: tensorflow>=2.16
|
|
31
|
+
Requires-Dist: tf-keras>=2.16
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
## Zero-inflated Negative Binomial Model using TensorFlow
|
|
35
|
+
|
|
36
|
+
TensorZINB is a Python module that uses TensorFlow to effectively solve negative binomial (NB) and zero-inflated negative binomial (ZINB) models. One of its key strengths is its ability to accurately calculate the NB/ZINB log likelihood. Additionally, it can be used for differentially expressed gene (DEG) analysis in the context of single-cell RNA sequencing (scRNA-seq). This package distinguishes itself by ensuring numerical stability, enabling the processing of datasets in batches, and delivering superior computing speeds compared to other existing NB/ZINB solvers. To guarantee the reliability of its analysis results, TensorZINB has undergone rigorous testing against various statistical packages. TensorZINB supports the execution of various features on both the negative binomial and zero-inflated (logit) components. Furthermore, it allows for the use of common features with the same weights across multiple subjects within a batch.
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
The negative binomial distribution is
|
|
40
|
+
$$NB(y;\mu,\theta)=\frac{\Gamma(y+\theta)}{\Gamma(\theta)\Gamma(y+1)}\left( \frac{\theta}{\theta+\mu}\right)^\theta\left(\frac{\mu}{\theta+\mu}\right)^y$$
|
|
41
|
+
where $\mu$ is the mean and $\theta$ is the dispersion parameter. For zero-inflated models, the counts are modelled as a mixture of the Bernoulli distribution and count distribution, i.e.,
|
|
42
|
+
|
|
43
|
+
$$
|
|
44
|
+
Pr(Y=0)=\pi+(1-\pi)NB(0),\\
|
|
45
|
+
Pr(Y=y)=(1-\pi)NB(y),y>0.
|
|
46
|
+
$$
|
|
47
|
+
|
|
48
|
+
We use the following model parameterization
|
|
49
|
+
|
|
50
|
+
$$
|
|
51
|
+
\log \mu_g =X_{\mu}\beta_{g,\mu}+Z_{\mu}\alpha_{\mu},
|
|
52
|
+
\mathrm{logit}\,\pi_g =X_{\pi}\beta_{g,\pi}+Z_{\pi}\alpha_{\pi}, \quad \log \theta_g = \beta_{g,\theta},
|
|
53
|
+
$$
|
|
54
|
+
|
|
55
|
+
where $\mu_g$ is the mean of subject $g$, $X_{\mu}$, $Z_{\mu}$, $X_{\pi}$ and $Z_{\pi}$ are feature matrices, $\beta_{g,\mu}$ and $\beta_{g,\pi}$ are coefficients for each subject $g$, $\alpha_{\mu}$ and $\alpha_{\pi}$ are common coefficients shared across all subjects.
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
## Installation
|
|
59
|
+
|
|
60
|
+
After downloading this repo, `cd` to the directory of downloaded repo and run:
|
|
61
|
+
|
|
62
|
+
`python setup.py install`
|
|
63
|
+
|
|
64
|
+
or
|
|
65
|
+
|
|
66
|
+
`pip install .`
|
|
67
|
+
|
|
68
|
+
For Apple silicon (M1, M2 and etc), it is recommended to install TensorFlow by following the command in Troubleshooting Section below.
|
|
69
|
+
|
|
70
|
+
## Model Estimation
|
|
71
|
+
|
|
72
|
+
`TensorZINB` solves the negative binomial (NB) and zero-inflated negative binomial (ZINB) models with given read counts.
|
|
73
|
+
|
|
74
|
+
### Model initialization
|
|
75
|
+
|
|
76
|
+
``` r
|
|
77
|
+
TensorZINB(
|
|
78
|
+
endog, # counts data: number of samples x number of subjects
|
|
79
|
+
exog, # observed variables for the negative binomial part
|
|
80
|
+
exog_c=None, # common observed variables across all subjects for the nb part
|
|
81
|
+
exog_infl=None, # observed variables for the logit part
|
|
82
|
+
exog_infl_c=None, # common observed variables across all subjects for the logit part
|
|
83
|
+
same_dispersion=False, # whether all subjects use the same dispersion
|
|
84
|
+
nb_only=False, # whether negative binomial only without logit or zero-inflation part
|
|
85
|
+
)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### Model fit
|
|
89
|
+
|
|
90
|
+
``` r
|
|
91
|
+
TensorZINB.fit(
|
|
92
|
+
init_weights={}, # initial model weights. If empty, init_method is used to find init weights
|
|
93
|
+
init_method="poi", # initialization method: `poi` for Poisson and `nb` for negative binomial
|
|
94
|
+
device_type="CPU", # device_type: `CPU` or `GPU`
|
|
95
|
+
device_name=None, # None or one from `tf.config.list_logical_devices()`
|
|
96
|
+
return_history=False, # whether return loss and weights history during training
|
|
97
|
+
epochs=5000, # maximum number of epochs to run
|
|
98
|
+
learning_rate=0.008, # start learning rate
|
|
99
|
+
num_epoch_skip=3, # number of epochs to skip learning rate reduction
|
|
100
|
+
is_early_stop=True, # whether use early stop
|
|
101
|
+
min_delta_early_stop=0.05,# minimum change in loss to qualify as an improvement
|
|
102
|
+
patience_early_stop=50, # number of epochs with no improvement after which training will be stopped
|
|
103
|
+
factor_reduce_lr=0.8, # factor by which the learning rate will be reduced
|
|
104
|
+
patience_reduce_lr=10, # number of epochs with no improvement after which learning rate will be reduced
|
|
105
|
+
min_lr=0.001, # lower bound on the learning rate
|
|
106
|
+
reset_keras_session=False,# reset keras session at the beginning
|
|
107
|
+
)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Model results
|
|
111
|
+
|
|
112
|
+
``` r
|
|
113
|
+
{
|
|
114
|
+
"llf_total": # sum of log likelihood across all subjects
|
|
115
|
+
"llfs": # an array contains log likelihood for each subject
|
|
116
|
+
"aic_total": # sum of AIC across all subjects
|
|
117
|
+
"aics": # an array contains AIC for each subject
|
|
118
|
+
"df_model_total": # total degree of freedom of all subjects
|
|
119
|
+
"df_model": # degree of freedom for each subject
|
|
120
|
+
"weights": # model weights
|
|
121
|
+
"cpu_time": # total computing time for all subjects
|
|
122
|
+
"num_sample": # number of samples
|
|
123
|
+
"epochs": # number of epochs run
|
|
124
|
+
"loss_history": # loss history over epochs if return_history=True
|
|
125
|
+
"weights_history": # weights history over epochs if return_history=True
|
|
126
|
+
}
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## DEG Analysis
|
|
130
|
+
|
|
131
|
+
`LRTest` provides utility for scRNA-seq DEG analysis. It runs the likelihood ratio test (LRT) by computing the log likelihood difference with and without conditions being added to the model.
|
|
132
|
+
|
|
133
|
+
To construct a `LRTest` object, we use
|
|
134
|
+
``` r
|
|
135
|
+
LRTest(
|
|
136
|
+
df_data, # count data frame. columns: subjects (genes), rows: samples
|
|
137
|
+
df_feature, # feature data frame. columns: features, rows: samples
|
|
138
|
+
conditions, # list of features to test DEG, e.g., diagnosis
|
|
139
|
+
nb_features, # list of features for the negative binomial model
|
|
140
|
+
nb_features_c=None, # list of common features for the negative binomial model
|
|
141
|
+
infl_features=None, # list of features for the zero inflated (logit) model
|
|
142
|
+
infl_features_c=None, # list of common features for the zero inflated (logit) model
|
|
143
|
+
add_intercept=True, # whether add intercept. False if df_feature already contains intercept
|
|
144
|
+
nb_only=False, # whether only do negative binomial without zero inflation
|
|
145
|
+
same_dispersion=False, # whether all subjects use the same dispersion
|
|
146
|
+
)
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
We then call `LRTest.run` to run the likelihood ratio test
|
|
150
|
+
``` r
|
|
151
|
+
LRTest.run(
|
|
152
|
+
learning_rate=0.008, # learning rate
|
|
153
|
+
epochs=5000, # number of epochs run
|
|
154
|
+
)
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
The `LRTest.run` returns a result dataframe `dfr` with columns:
|
|
158
|
+
``` r
|
|
159
|
+
[
|
|
160
|
+
"ll0": # log likelihood without conditions
|
|
161
|
+
"aic0": # AIC without conditions
|
|
162
|
+
"df0": # degree of freedom without conditions
|
|
163
|
+
"cpu_time0": # computing time for each subject without conditions
|
|
164
|
+
"ll1": # log likelihood without conditions
|
|
165
|
+
"aic1": # AIC with conditions
|
|
166
|
+
"df1": # degree of freedom with conditions
|
|
167
|
+
"cpu_time1": # computing time for each subject with conditions
|
|
168
|
+
"lld": # ll1 - ll0
|
|
169
|
+
"aicd": # aic1 - aic0
|
|
170
|
+
"pvalue": # p-value: 1 - stats.chi2.cdf(2 * lld, df1 - df0)
|
|
171
|
+
]
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
`tensorzinb.utils` provides utility functions:
|
|
176
|
+
|
|
177
|
+
- `normalize_features`: normalize scRNA-seq features by removing the mean and scaling to unit variance.
|
|
178
|
+
- `correct_pvalues_for_multiple_testing`: correct pvalues for multiple testing in Python, which is the same as `p.adjust` in `R`.
|
|
179
|
+
|
|
180
|
+
We can further correct pvalues for multiple testing by calling `correct_pvalues_for_multiple_testing(dfr['pvalue'])`.
|
|
181
|
+
|
|
182
|
+
## Example
|
|
183
|
+
|
|
184
|
+
An example code to show how to use `TensorZINB` and `LRTest` to perform DEG analysis can be found at [`examples/deg_example.ipynb`](examples/deg_example.ipynb). The example runs DEG analysis on a sample dataset with 17 clusters and 20 genes in each cluster.
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
## Tests
|
|
188
|
+
|
|
189
|
+
In `tests/tensorzinb.ipynb`, we show several tests:
|
|
190
|
+
|
|
191
|
+
- validate the Poisson weights initialization.
|
|
192
|
+
- compare with `statsmodels` for negative binomial model only without zero-inflation to make sure the results match.
|
|
193
|
+
- show `statsmodels` is not numerically stable for zero-inflated negative binomial. `statsmodels` can only return results when initialized with TensorZINB results. TensorZINB results match the true parameters used to generate the samples.
|
|
194
|
+
|
|
195
|
+
More tests can be found in https://github.com/wanglab-georgetown/countmodels/blob/main/tests/zinb_test.ipynb
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
## Troubleshooting
|
|
199
|
+
|
|
200
|
+
### Run on Apple silicon
|
|
201
|
+
To run tensorflow on Apple silicon (M1, M2, etc), install TensorFlow using the following:
|
|
202
|
+
|
|
203
|
+
`conda install -c apple tensorflow-deps`
|
|
204
|
+
|
|
205
|
+
`python -m pip install tensorflow-macos==2.9.2`
|
|
206
|
+
|
|
207
|
+
`python -m pip install tensorflow-metal==0.5.1`
|
|
208
|
+
|
|
209
|
+
### Feature normalization
|
|
210
|
+
|
|
211
|
+
If the solver cannot return correct results, please ensure features in $X$ are normalized by using `StandardScaler()`. Please refer to the example in [`examples/deg_example.ipynb`](examples/deg_example.ipynb).
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
## Reference
|
|
215
|
+
Cui, T., Wang, T. [A Comprehensive Assessment of Hurdle and Zero-inflated Models for Single Cell RNA-sequencing Analysis](https://doi.org/10.1093/bib/bbad272), Briefings in Bioinformatics, July 2023. https://doi.org/10.1093/bib/bbad272
|
|
216
|
+
|
|
217
|
+
## Support and Contribution
|
|
218
|
+
If you encounter any bugs while using the code, please don't hesitate to open an issue on this repository's GitHub issue tracker.
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
## Zero-inflated Negative Binomial Model using TensorFlow
|
|
2
|
+
|
|
3
|
+
TensorZINB is a Python module that uses TensorFlow to effectively solve negative binomial (NB) and zero-inflated negative binomial (ZINB) models. One of its key strengths is its ability to accurately calculate the NB/ZINB log likelihood. Additionally, it can be used for differentially expressed gene (DEG) analysis in the context of single-cell RNA sequencing (scRNA-seq). This package distinguishes itself by ensuring numerical stability, enabling the processing of datasets in batches, and delivering superior computing speeds compared to other existing NB/ZINB solvers. To guarantee the reliability of its analysis results, TensorZINB has undergone rigorous testing against various statistical packages. TensorZINB supports the execution of various features on both the negative binomial and zero-inflated (logit) components. Furthermore, it allows for the use of common features with the same weights across multiple subjects within a batch.
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
The negative binomial distribution is
|
|
7
|
+
$$NB(y;\mu,\theta)=\frac{\Gamma(y+\theta)}{\Gamma(\theta)\Gamma(y+1)}\left( \frac{\theta}{\theta+\mu}\right)^\theta\left(\frac{\mu}{\theta+\mu}\right)^y$$
|
|
8
|
+
where $\mu$ is the mean and $\theta$ is the dispersion parameter. For zero-inflated models, the counts are modelled as a mixture of the Bernoulli distribution and count distribution, i.e.,
|
|
9
|
+
|
|
10
|
+
$$
|
|
11
|
+
Pr(Y=0)=\pi+(1-\pi)NB(0),\\
|
|
12
|
+
Pr(Y=y)=(1-\pi)NB(y),y>0.
|
|
13
|
+
$$
|
|
14
|
+
|
|
15
|
+
We use the following model parameterization
|
|
16
|
+
|
|
17
|
+
$$
|
|
18
|
+
\log \mu_g =X_{\mu}\beta_{g,\mu}+Z_{\mu}\alpha_{\mu},
|
|
19
|
+
\mathrm{logit}\,\pi_g =X_{\pi}\beta_{g,\pi}+Z_{\pi}\alpha_{\pi}, \quad \log \theta_g = \beta_{g,\theta},
|
|
20
|
+
$$
|
|
21
|
+
|
|
22
|
+
where $\mu_g$ is the mean of subject $g$, $X_{\mu}$, $Z_{\mu}$, $X_{\pi}$ and $Z_{\pi}$ are feature matrices, $\beta_{g,\mu}$ and $\beta_{g,\pi}$ are coefficients for each subject $g$, $\alpha_{\mu}$ and $\alpha_{\pi}$ are common coefficients shared across all subjects.
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
After downloading this repo, `cd` to the directory of downloaded repo and run:
|
|
28
|
+
|
|
29
|
+
`python setup.py install`
|
|
30
|
+
|
|
31
|
+
or
|
|
32
|
+
|
|
33
|
+
`pip install .`
|
|
34
|
+
|
|
35
|
+
For Apple silicon (M1, M2 and etc), it is recommended to install TensorFlow by following the command in Troubleshooting Section below.
|
|
36
|
+
|
|
37
|
+
## Model Estimation
|
|
38
|
+
|
|
39
|
+
`TensorZINB` solves the negative binomial (NB) and zero-inflated negative binomial (ZINB) models with given read counts.
|
|
40
|
+
|
|
41
|
+
### Model initialization
|
|
42
|
+
|
|
43
|
+
``` r
|
|
44
|
+
TensorZINB(
|
|
45
|
+
endog, # counts data: number of samples x number of subjects
|
|
46
|
+
exog, # observed variables for the negative binomial part
|
|
47
|
+
exog_c=None, # common observed variables across all subjects for the nb part
|
|
48
|
+
exog_infl=None, # observed variables for the logit part
|
|
49
|
+
exog_infl_c=None, # common observed variables across all subjects for the logit part
|
|
50
|
+
same_dispersion=False, # whether all subjects use the same dispersion
|
|
51
|
+
nb_only=False, # whether negative binomial only without logit or zero-inflation part
|
|
52
|
+
)
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Model fit
|
|
56
|
+
|
|
57
|
+
``` r
|
|
58
|
+
TensorZINB.fit(
|
|
59
|
+
init_weights={}, # initial model weights. If empty, init_method is used to find init weights
|
|
60
|
+
init_method="poi", # initialization method: `poi` for Poisson and `nb` for negative binomial
|
|
61
|
+
device_type="CPU", # device_type: `CPU` or `GPU`
|
|
62
|
+
device_name=None, # None or one from `tf.config.list_logical_devices()`
|
|
63
|
+
return_history=False, # whether return loss and weights history during training
|
|
64
|
+
epochs=5000, # maximum number of epochs to run
|
|
65
|
+
learning_rate=0.008, # start learning rate
|
|
66
|
+
num_epoch_skip=3, # number of epochs to skip learning rate reduction
|
|
67
|
+
is_early_stop=True, # whether use early stop
|
|
68
|
+
min_delta_early_stop=0.05,# minimum change in loss to qualify as an improvement
|
|
69
|
+
patience_early_stop=50, # number of epochs with no improvement after which training will be stopped
|
|
70
|
+
factor_reduce_lr=0.8, # factor by which the learning rate will be reduced
|
|
71
|
+
patience_reduce_lr=10, # number of epochs with no improvement after which learning rate will be reduced
|
|
72
|
+
min_lr=0.001, # lower bound on the learning rate
|
|
73
|
+
reset_keras_session=False,# reset keras session at the beginning
|
|
74
|
+
)
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Model results
|
|
78
|
+
|
|
79
|
+
``` r
|
|
80
|
+
{
|
|
81
|
+
"llf_total": # sum of log likelihood across all subjects
|
|
82
|
+
"llfs": # an array contains log likelihood for each subject
|
|
83
|
+
"aic_total": # sum of AIC across all subjects
|
|
84
|
+
"aics": # an array contains AIC for each subject
|
|
85
|
+
"df_model_total": # total degree of freedom of all subjects
|
|
86
|
+
"df_model": # degree of freedom for each subject
|
|
87
|
+
"weights": # model weights
|
|
88
|
+
"cpu_time": # total computing time for all subjects
|
|
89
|
+
"num_sample": # number of samples
|
|
90
|
+
"epochs": # number of epochs run
|
|
91
|
+
"loss_history": # loss history over epochs if return_history=True
|
|
92
|
+
"weights_history": # weights history over epochs if return_history=True
|
|
93
|
+
}
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## DEG Analysis
|
|
97
|
+
|
|
98
|
+
`LRTest` provides utility for scRNA-seq DEG analysis. It runs the likelihood ratio test (LRT) by computing the log likelihood difference with and without conditions being added to the model.
|
|
99
|
+
|
|
100
|
+
To construct a `LRTest` object, we use
|
|
101
|
+
``` r
|
|
102
|
+
LRTest(
|
|
103
|
+
df_data, # count data frame. columns: subjects (genes), rows: samples
|
|
104
|
+
df_feature, # feature data frame. columns: features, rows: samples
|
|
105
|
+
conditions, # list of features to test DEG, e.g., diagnosis
|
|
106
|
+
nb_features, # list of features for the negative binomial model
|
|
107
|
+
nb_features_c=None, # list of common features for the negative binomial model
|
|
108
|
+
infl_features=None, # list of features for the zero inflated (logit) model
|
|
109
|
+
infl_features_c=None, # list of common features for the zero inflated (logit) model
|
|
110
|
+
add_intercept=True, # whether add intercept. False if df_feature already contains intercept
|
|
111
|
+
nb_only=False, # whether only do negative binomial without zero inflation
|
|
112
|
+
same_dispersion=False, # whether all subjects use the same dispersion
|
|
113
|
+
)
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
We then call `LRTest.run` to run the likelihood ratio test
|
|
117
|
+
``` r
|
|
118
|
+
LRTest.run(
|
|
119
|
+
learning_rate=0.008, # learning rate
|
|
120
|
+
epochs=5000, # number of epochs run
|
|
121
|
+
)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
The `LRTest.run` returns a result dataframe `dfr` with columns:
|
|
125
|
+
``` r
|
|
126
|
+
[
|
|
127
|
+
"ll0": # log likelihood without conditions
|
|
128
|
+
"aic0": # AIC without conditions
|
|
129
|
+
"df0": # degree of freedom without conditions
|
|
130
|
+
"cpu_time0": # computing time for each subject without conditions
|
|
131
|
+
"ll1": # log likelihood without conditions
|
|
132
|
+
"aic1": # AIC with conditions
|
|
133
|
+
"df1": # degree of freedom with conditions
|
|
134
|
+
"cpu_time1": # computing time for each subject with conditions
|
|
135
|
+
"lld": # ll1 - ll0
|
|
136
|
+
"aicd": # aic1 - aic0
|
|
137
|
+
"pvalue": # p-value: 1 - stats.chi2.cdf(2 * lld, df1 - df0)
|
|
138
|
+
]
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
`tensorzinb.utils` provides utility functions:
|
|
143
|
+
|
|
144
|
+
- `normalize_features`: normalize scRNA-seq features by removing the mean and scaling to unit variance.
|
|
145
|
+
- `correct_pvalues_for_multiple_testing`: correct pvalues for multiple testing in Python, which is the same as `p.adjust` in `R`.
|
|
146
|
+
|
|
147
|
+
We can further correct pvalues for multiple testing by calling `correct_pvalues_for_multiple_testing(dfr['pvalue'])`.
|
|
148
|
+
|
|
149
|
+
## Example
|
|
150
|
+
|
|
151
|
+
An example code to show how to use `TensorZINB` and `LRTest` to perform DEG analysis can be found at [`examples/deg_example.ipynb`](examples/deg_example.ipynb). The example runs DEG analysis on a sample dataset with 17 clusters and 20 genes in each cluster.
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
## Tests
|
|
155
|
+
|
|
156
|
+
In `tests/tensorzinb.ipynb`, we show several tests:
|
|
157
|
+
|
|
158
|
+
- validate the Poisson weights initialization.
|
|
159
|
+
- compare with `statsmodels` for negative binomial model only without zero-inflation to make sure the results match.
|
|
160
|
+
- show `statsmodels` is not numerically stable for zero-inflated negative binomial. `statsmodels` can only return results when initialized with TensorZINB results. TensorZINB results match the true parameters used to generate the samples.
|
|
161
|
+
|
|
162
|
+
More tests can be found in https://github.com/wanglab-georgetown/countmodels/blob/main/tests/zinb_test.ipynb
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
## Troubleshooting
|
|
166
|
+
|
|
167
|
+
### Run on Apple silicon
|
|
168
|
+
To run tensorflow on Apple silicon (M1, M2, etc), install TensorFlow using the following:
|
|
169
|
+
|
|
170
|
+
`conda install -c apple tensorflow-deps`
|
|
171
|
+
|
|
172
|
+
`python -m pip install tensorflow-macos==2.9.2`
|
|
173
|
+
|
|
174
|
+
`python -m pip install tensorflow-metal==0.5.1`
|
|
175
|
+
|
|
176
|
+
### Feature normalization
|
|
177
|
+
|
|
178
|
+
If the solver cannot return correct results, please ensure features in $X$ are normalized by using `StandardScaler()`. Please refer to the example in [`examples/deg_example.ipynb`](examples/deg_example.ipynb).
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
## Reference
|
|
182
|
+
Cui, T., Wang, T. [A Comprehensive Assessment of Hurdle and Zero-inflated Models for Single Cell RNA-sequencing Analysis](https://doi.org/10.1093/bib/bbad272), Briefings in Bioinformatics, July 2023. https://doi.org/10.1093/bib/bbad272
|
|
183
|
+
|
|
184
|
+
## Support and Contribution
|
|
185
|
+
If you encounter any bugs while using the code, please don't hesitate to open an issue on this repository's GitHub issue tracker.
|