bdext 0.1.65__py3-none-any.whl → 0.1.67__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- README.md +165 -103
- bdeissct_dl/__init__.py +1 -3
- bdeissct_dl/bdeissct_model.py +11 -65
- bdeissct_dl/dl_model.py +7 -119
- bdeissct_dl/estimator.py +8 -108
- bdeissct_dl/model_serializer.py +4 -34
- bdeissct_dl/scaler_fitting.py +3 -6
- bdeissct_dl/sumstat_checker.py +2 -2
- bdeissct_dl/training.py +9 -30
- bdeissct_dl/tree_encoder.py +13 -32
- bdext-0.1.67.dist-info/METADATA +240 -0
- bdext-0.1.67.dist-info/RECORD +17 -0
- {bdext-0.1.65.dist-info → bdext-0.1.67.dist-info}/entry_points.txt +0 -2
- bdeissct_dl/estimator_ct.py +0 -63
- bdeissct_dl/main_covid.py +0 -76
- bdeissct_dl/model_finder.py +0 -47
- bdeissct_dl/pinball_loss.py +0 -48
- bdeissct_dl/train_ct.py +0 -125
- bdext-0.1.65.dist-info/METADATA +0 -178
- bdext-0.1.65.dist-info/RECORD +0 -22
- {bdext-0.1.65.dist-info → bdext-0.1.67.dist-info}/LICENSE +0 -0
- {bdext-0.1.65.dist-info → bdext-0.1.67.dist-info}/WHEEL +0 -0
- {bdext-0.1.65.dist-info → bdext-0.1.67.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: bdext
|
|
3
|
+
Version: 0.1.67
|
|
4
|
+
Summary: Estimation of BDEISS-CT parameters from phylogenetic trees.
|
|
5
|
+
Home-page: https://github.com/modpath/bdeissct
|
|
6
|
+
Author: Anna Zhukova
|
|
7
|
+
Author-email: anna.zhukova@pasteur.fr
|
|
8
|
+
License: UNKNOWN
|
|
9
|
+
Keywords: phylogenetics,birth-death model,incubation,super-spreading,contact tracing
|
|
10
|
+
Platform: UNKNOWN
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
15
|
+
Classifier: Topic :: Software Development
|
|
16
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
Requires-Dist: tensorflow==2.19.0
|
|
19
|
+
Requires-Dist: six
|
|
20
|
+
Requires-Dist: ete3
|
|
21
|
+
Requires-Dist: numpy==2.0.2
|
|
22
|
+
Requires-Dist: scipy==1.14.1
|
|
23
|
+
Requires-Dist: biopython
|
|
24
|
+
Requires-Dist: scikit-learn==1.5.2
|
|
25
|
+
Requires-Dist: pandas==2.2.3
|
|
26
|
+
Requires-Dist: treesumstats==0.7
|
|
27
|
+
|
|
28
|
+
# bdext
|
|
29
|
+
|
|
30
|
+
The bdext package provides scripts to train and assess
|
|
31
|
+
Deep-Learning-enabled estimators of BD(EI)(SS)(CT) model parameters from phylogenetic trees.
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
[//]: # ([](https://doi.org/10.1093/sysbio/syad059))
|
|
36
|
+
[//]: # ([](https://github.com/evolbioinfo/bdext/releases))
|
|
37
|
+
[bdext on PyPI](https://pypi.org/project/bdext/)
|
|
38
|
+
[](https://pypi.org/project/bdext)
|
|
39
|
+
[bdext Docker image tags](https://hub.docker.com/r/evolbioinfo/bdext/tags)
|
|
40
|
+
|
|
41
|
+
## BDEISS-CT model
|
|
42
|
+
|
|
43
|
+
The Birth-Death (BD) Exposed-Infectious (EI) with SuperSpreading (SS) and Contact-Tracing (CT) model (BDEISS-CT)
|
|
44
|
+
can be described with the following 8 parameters:
|
|
45
|
+
|
|
46
|
+
* average reproduction number R;
|
|
47
|
+
* average total infection duration d;
|
|
48
|
+
* incubation period d<sub>inc</sub>;
|
|
49
|
+
* sampling probability ρ;
|
|
50
|
+
* fraction of superspreaders f<sub>S</sub>;
|
|
51
|
+
* super-spreading transmission increase X<sub>S</sub>;
|
|
52
|
+
* contact tracing probability υ;
|
|
53
|
+
* contact-traced removal speed-up X<sub>C</sub>.
|
|
54
|
+
|
|
55
|
+
Setting d<sub>inc</sub>=0 removes incubation (EI), setting f<sub>S</sub>=0 removes superspreading (SS), while setting υ=0 removes contact-tracing (CT).
|
|
56
|
+
|
|
57
|
+
For identifiability, we require the sampling probability ρ to be given by the user.
|
|
58
|
+
The other parameters are estimated from a time-scaled phylogenetic tree.
|
|
59
|
+
|
|
60
|
+
[//]: # (## BDEISS-CT parameter estimator)
|
|
61
|
+
|
|
62
|
+
[//]: # ()
|
|
63
|
+
[//]: # (The bdeissct_dl package provides deep-learning-based BDEISS-CT model parameter estimator )
|
|
64
|
+
|
|
65
|
+
[//]: # (from a user-supplied time-scaled phylogenetic tree. )
|
|
66
|
+
|
|
67
|
+
[//]: # (User must also provide a value for one of the three BD model parameters (λ, ψ, or ρ). )
|
|
68
|
+
|
|
69
|
+
[//]: # (We recommend providing the sampling probability ρ, )
|
|
70
|
+
|
|
71
|
+
[//]: # (which could be estimated as the number of tree tips divided by the number of declared cases for the same time period.)
|
|
72
|
+
|
|
73
|
+
[//]: # ()
|
|
74
|
+
[//]: # ()
|
|
75
|
+
[//]: # (## Input data)
|
|
76
|
+
|
|
77
|
+
[//]: # (One needs to supply a time-scaled phylogenetic tree in newick format. )
|
|
78
|
+
|
|
79
|
+
[//]: # (In the examples below we will use an HIV tree reconstructed from 200 sequences, )
|
|
80
|
+
|
|
81
|
+
[//]: # (published in [[Rasmussen _et al._ PLoS Comput. Biol. 2017]](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1005448), )
|
|
82
|
+
|
|
83
|
+
[//]: # (which you can find at [PairTree GitHub](https://github.com/davidrasm/PairTree) )
|
|
84
|
+
|
|
85
|
+
[//]: # (and in [hiv_zurich/Zurich.nwk](hiv_zurich/Zurich.nwk). )
|
|
86
|
+
|
|
87
|
+
[//]: # ()
|
|
88
|
+
[//]: # (## Installation)
|
|
89
|
+
|
|
90
|
+
[//]: # ()
|
|
91
|
+
[//]: # (There are 4 alternative ways to run __bdeissct_dl__ on your computer: )
|
|
92
|
+
|
|
93
|
+
[//]: # (with [docker](https://www.docker.com/community-edition), )
|
|
94
|
+
|
|
95
|
+
[//]: # ([apptainer](https://apptainer.org/),)
|
|
96
|
+
|
|
97
|
+
[//]: # (in Python3, or via command line (requires installation with Python3).)
|
|
98
|
+
|
|
99
|
+
[//]: # ()
|
|
100
|
+
[//]: # ()
|
|
101
|
+
[//]: # ()
|
|
102
|
+
[//]: # (### Run in python3 or command-line (for linux systems, recommended Ubuntu 21 or newer versions))
|
|
103
|
+
|
|
104
|
+
[//]: # ()
|
|
105
|
+
[//]: # (You could either install python (version 3.9 or higher) system-wide and then install bdeissct_dl via pip:)
|
|
106
|
+
|
|
107
|
+
[//]: # (```bash)
|
|
108
|
+
|
|
109
|
+
[//]: # (sudo apt install -y python3 python3-pip python3-setuptools python3-distutils)
|
|
110
|
+
|
|
111
|
+
[//]: # (pip3 install bdeissct_dl)
|
|
112
|
+
|
|
113
|
+
[//]: # (```)
|
|
114
|
+
|
|
115
|
+
[//]: # ()
|
|
116
|
+
[//]: # (or alternatively, you could install python (version 3.9 or higher) and bdeissct_dl via [conda](https://conda.io/docs/) (make sure that conda is installed first). )
|
|
117
|
+
|
|
118
|
+
[//]: # (Here we will create a conda environment called _phyloenv_:)
|
|
119
|
+
|
|
120
|
+
[//]: # (```bash)
|
|
121
|
+
|
|
122
|
+
[//]: # (conda create --name phyloenv python=3.12)
|
|
123
|
+
|
|
124
|
+
[//]: # (conda activate phyloenv)
|
|
125
|
+
|
|
126
|
+
[//]: # (pip install bdeissct_dl)
|
|
127
|
+
|
|
128
|
+
[//]: # (```)
|
|
129
|
+
|
|
130
|
+
[//]: # ()
|
|
131
|
+
[//]: # ()
|
|
132
|
+
[//]: # (#### Basic usage in a command line)
|
|
133
|
+
|
|
134
|
+
[//]: # (If you installed __bdeissct_dl__ in a conda environment (here named _phyloenv_), do not forget to first activate it, e.g.)
|
|
135
|
+
|
|
136
|
+
[//]: # ()
|
|
137
|
+
[//]: # (```bash)
|
|
138
|
+
|
|
139
|
+
[//]: # (conda activate phyloenv)
|
|
140
|
+
|
|
141
|
+
[//]: # (```)
|
|
142
|
+
|
|
143
|
+
[//]: # ()
|
|
144
|
+
[//]: # (Run the following command to estimate the BDEISS-CT parameters and their 95% CIs for this tree, assuming a sampling probability of 0.25, )
|
|
145
|
+
|
|
146
|
+
[//]: # (and save the estimated parameters to a comma-separated file estimates.csv.)
|
|
147
|
+
|
|
148
|
+
[//]: # (```bash)
|
|
149
|
+
|
|
150
|
+
[//]: # (bdeissct_infer --nwk Zurich.nwk --ci --p 0.25 --log estimates.csv)
|
|
151
|
+
|
|
152
|
+
[//]: # (```)
|
|
153
|
+
|
|
154
|
+
[//]: # ()
|
|
155
|
+
[//]: # (#### Help)
|
|
156
|
+
|
|
157
|
+
[//]: # ()
|
|
158
|
+
[//]: # (To see detailed options, run:)
|
|
159
|
+
|
|
160
|
+
[//]: # (```bash)
|
|
161
|
+
|
|
162
|
+
[//]: # (bdeissct_infer --help)
|
|
163
|
+
|
|
164
|
+
[//]: # (```)
|
|
165
|
+
|
|
166
|
+
[//]: # ()
|
|
167
|
+
[//]: # ()
|
|
168
|
+
[//]: # (### Run with docker)
|
|
169
|
+
|
|
170
|
+
[//]: # ()
|
|
171
|
+
[//]: # (#### Basic usage)
|
|
172
|
+
|
|
173
|
+
[//]: # (Once [docker](https://www.docker.com/community-edition) is installed, )
|
|
174
|
+
|
|
175
|
+
[//]: # (run the following command to estimate BDEISS-CT model parameters:)
|
|
176
|
+
|
|
177
|
+
[//]: # (```bash)
|
|
178
|
+
|
|
179
|
+
[//]: # (docker run -v <path_to_the_folder_containing_the_tree>:/data:rw -t evolbioinfo/bdeissct --nwk /data/Zurich.nwk --ci --p 0.25 --log /data/estimates.csv)
|
|
180
|
+
|
|
181
|
+
[//]: # (```)
|
|
182
|
+
|
|
183
|
+
[//]: # ()
|
|
184
|
+
[//]: # (This will produce a comma-separated file estimates.csv in the <path_to_the_folder_containing_the_tree> folder,)
|
|
185
|
+
|
|
186
|
+
[//]: # ( containing the estimated parameter values and their 95% CIs (can be viewed with a text editor, Excel or LibreOffice Calc).)
|
|
187
|
+
|
|
188
|
+
[//]: # ()
|
|
189
|
+
[//]: # (#### Help)
|
|
190
|
+
|
|
191
|
+
[//]: # ()
|
|
192
|
+
[//]: # (To see advanced options, run)
|
|
193
|
+
|
|
194
|
+
[//]: # (```bash)
|
|
195
|
+
|
|
196
|
+
[//]: # (docker run -t evolbioinfo/bdeissct -h)
|
|
197
|
+
|
|
198
|
+
[//]: # (```)
|
|
199
|
+
|
|
200
|
+
[//]: # ()
|
|
201
|
+
[//]: # ()
|
|
202
|
+
[//]: # ()
|
|
203
|
+
[//]: # (### Run with apptainer)
|
|
204
|
+
|
|
205
|
+
[//]: # ()
|
|
206
|
+
[//]: # (#### Basic usage)
|
|
207
|
+
|
|
208
|
+
[//]: # (Once [apptainer](https://apptainer.org/docs/user/latest/quick_start.html#installation) is installed, )
|
|
209
|
+
|
|
210
|
+
[//]: # (run the following command to estimate BDEISS-CT model parameters (from the folder where the Zurich.nwk tree is contained):)
|
|
211
|
+
|
|
212
|
+
[//]: # ()
|
|
213
|
+
[//]: # (```bash)
|
|
214
|
+
|
|
215
|
+
[//]: # (apptainer run docker://evolbioinfo/bdeissct --nwk Zurich.nwk --ci --p 0.25 --log estimates.csv)
|
|
216
|
+
|
|
217
|
+
[//]: # (```)
|
|
218
|
+
|
|
219
|
+
[//]: # ()
|
|
220
|
+
[//]: # (This will produce a comma-separated file estimates.csv,)
|
|
221
|
+
|
|
222
|
+
[//]: # ( containing the estimated parameter values and their 95% CIs (can be viewed with a text editor, Excel or LibreOffice Calc).)
|
|
223
|
+
|
|
224
|
+
[//]: # ()
|
|
225
|
+
[//]: # ()
|
|
226
|
+
[//]: # (#### Help)
|
|
227
|
+
|
|
228
|
+
[//]: # ()
|
|
229
|
+
[//]: # (To see advanced options, run)
|
|
230
|
+
|
|
231
|
+
[//]: # (```bash)
|
|
232
|
+
|
|
233
|
+
[//]: # (apptainer run docker://evolbioinfo/bdeissct -h)
|
|
234
|
+
|
|
235
|
+
[//]: # (```)
|
|
236
|
+
|
|
237
|
+
[//]: # ()
|
|
238
|
+
[//]: # ()
|
|
239
|
+
|
|
240
|
+
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
README.md,sha256=Ngj8bt0Yu3LUsvwblmMtUqqjvGyqxv6ku2_cYCb5_DQ,6539
|
|
2
|
+
bdeissct_dl/__init__.py,sha256=QPEiIP-xVqGQgydeqN_9AZgT26IYWeJC4-JlHnd8Rjo,296
|
|
3
|
+
bdeissct_dl/bdeissct_model.py,sha256=sQclYN5V8utw6wEMDN0_Ua-0NeuyuWHG_e0_jQIUe8Q,1986
|
|
4
|
+
bdeissct_dl/dl_model.py,sha256=CT3tRvH7y7_guyR7SUELv0eLEVpGU_PcXci1lIGIb9M,3705
|
|
5
|
+
bdeissct_dl/estimator.py,sha256=QBWA8R0pBPZPd3JvItdJS2lN1J3VqvdJqBMzCi-NADs,3336
|
|
6
|
+
bdeissct_dl/model_serializer.py,sha256=s1yBzQjhtr-w7eT8bTsNkG9_xnYRZrUc3HkeOzNZpQY,2464
|
|
7
|
+
bdeissct_dl/scaler_fitting.py,sha256=9X0O7-Wc9xGTI-iF-Pfp1PPoW7j01wZUfJVZf8ky-IU,1752
|
|
8
|
+
bdeissct_dl/sumstat_checker.py,sha256=TQ0nb86-BXmusqgMnOJusLpR4ul3N3Hi886IWUovrMI,1846
|
|
9
|
+
bdeissct_dl/training.py,sha256=H5wA3V72nhc9Km7kvKmzjCYw0N1itMGDbj9c-Uat5BU,8350
|
|
10
|
+
bdeissct_dl/tree_encoder.py,sha256=vH__CDPNVvayhNoK6_Z4xJA9_R78G1RvneG0RY2aj-0,18340
|
|
11
|
+
bdeissct_dl/tree_manager.py,sha256=UXxUVmEkxwUhKpJeACVgiXZ8Kp1o_hiv8Qb80b6qmVU,11814
|
|
12
|
+
bdext-0.1.67.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
13
|
+
bdext-0.1.67.dist-info/METADATA,sha256=Ky7QOtU9xcrVvo_kjsPpx5jsaIcDwXFOYpoxYoCJaug,7479
|
|
14
|
+
bdext-0.1.67.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
15
|
+
bdext-0.1.67.dist-info/entry_points.txt,sha256=lcAwyk-Fc0G_w4Ex7KDivh7h1tzSA99PRMcy971b-nM,208
|
|
16
|
+
bdext-0.1.67.dist-info/top_level.txt,sha256=z4dadFfcLghr4lwROy7QR3zEICpa-eCPT6mmcoHeEJY,12
|
|
17
|
+
bdext-0.1.67.dist-info/RECORD,,
|
|
@@ -3,6 +3,4 @@ bdeissct_encode = bdeissct_dl.tree_encoder:main
|
|
|
3
3
|
bdeissct_fit_scaler = bdeissct_dl.scaler_fitting:main
|
|
4
4
|
bdeissct_infer = bdeissct_dl.estimator:main
|
|
5
5
|
bdeissct_train = bdeissct_dl.training:main
|
|
6
|
-
ct_infer = bdeissct_dl.estimator_ct:main
|
|
7
|
-
ct_train = bdeissct_dl.train_ct:main
|
|
8
6
|
|
bdeissct_dl/estimator_ct.py
DELETED
|
@@ -1,63 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
|
|
3
|
-
from bdeissct_dl import MODEL_PATH
|
|
4
|
-
from bdeissct_dl.bdeissct_model import CT_EPI_COLUMNS, CT_RATE_COLUMNS
|
|
5
|
-
from bdeissct_dl.model_serializer import load_model_keras, load_scaler_numpy
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
def predict_parameters(df, model_path=MODEL_PATH):
|
|
9
|
-
feature_columns = CT_EPI_COLUMNS
|
|
10
|
-
X = df.loc[:, feature_columns].to_numpy(dtype=float, na_value=0)
|
|
11
|
-
|
|
12
|
-
# Standardization of the input features with a
|
|
13
|
-
# standard scaler
|
|
14
|
-
scaler_x = load_scaler_numpy(model_path, suffix='ct.x')
|
|
15
|
-
if scaler_x:
|
|
16
|
-
X = scaler_x.transform(X)
|
|
17
|
-
|
|
18
|
-
target_columns = CT_RATE_COLUMNS
|
|
19
|
-
|
|
20
|
-
result = None
|
|
21
|
-
for col in target_columns:
|
|
22
|
-
model = load_model_keras(model_path, f'CT.{col}')
|
|
23
|
-
Y_pred = model.predict(X)
|
|
24
|
-
|
|
25
|
-
if len(Y_pred[col].shape) == 2 and Y_pred[col].shape[1] == 1:
|
|
26
|
-
Y_pred[col] = Y_pred[col].squeeze(axis=1)
|
|
27
|
-
|
|
28
|
-
res_df = pd.DataFrame.from_dict(Y_pred, orient='columns')
|
|
29
|
-
result = result.join(res_df, how='outer') if result is not None else res_df
|
|
30
|
-
|
|
31
|
-
return result
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
def main():
|
|
35
|
-
"""
|
|
36
|
-
Entry point for tree parameter estimation with a BDCT model with command-line arguments.
|
|
37
|
-
:return: void
|
|
38
|
-
"""
|
|
39
|
-
import argparse
|
|
40
|
-
|
|
41
|
-
parser = \
|
|
42
|
-
argparse.ArgumentParser(description="Estimate CT rates from other model parameters.")
|
|
43
|
-
parser.add_argument('--model_path', default=MODEL_PATH,
|
|
44
|
-
help='By default our pretrained CT model is used, '
|
|
45
|
-
'but it is possible to specify a path to a custom folder here, '
|
|
46
|
-
'containing files "CT.keras" (with the model), '
|
|
47
|
-
'and scaler-related files to rescale the input data X, and the output Y: '
|
|
48
|
-
'for X: "data_scalerct.x_mean.npy", "data_scalerct.x_scale.npy", "data_scalerct.x_var.npy" '
|
|
49
|
-
'(unpickled numpy-saved arrays), '
|
|
50
|
-
'and "data_scalerct.x_n_samples_seen.txt" '
|
|
51
|
-
'a text file containing the number of examples in the training set). '
|
|
52
|
-
'For Y the file names are the same, just x replaced by y, e.g., "data_scalerct.y_mean.npy". '
|
|
53
|
-
)
|
|
54
|
-
parser.add_argument('--log', default=None, type=str, help="output log file")
|
|
55
|
-
parser.add_argument('--sumstats', default=None, type=str, help="input file(s) with epi parameters")
|
|
56
|
-
params = parser.parse_args()
|
|
57
|
-
|
|
58
|
-
df = pd.read_csv(params.sumstats)
|
|
59
|
-
predict_parameters(df, model_path=params.model_path).to_csv(params.log, header=True)
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
if '__main__' == __name__:
|
|
63
|
-
main()
|
bdeissct_dl/main_covid.py
DELETED
|
@@ -1,76 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
|
|
3
|
-
from bdeissct_dl.tree_encoder import forest2sumstat_df
|
|
4
|
-
from bdeissct_dl.bdeissct_model import MODELS
|
|
5
|
-
from bdeissct_dl.estimator import predict_parameters
|
|
6
|
-
from bdeissct_dl.tree_manager import read_forest
|
|
7
|
-
from bdeissct_dl.sumstat_checker import check_sumstats
|
|
8
|
-
from pybdei import infer as bdei_infer
|
|
9
|
-
from bdct.bd_model import infer as bd_infer
|
|
10
|
-
from bdct.tree_manager import annotate_forest_with_time, get_T
|
|
11
|
-
|
|
12
|
-
MP = '/home/azhukova/projects/bdeissct_dl/simulations_bdeissct/models/200_500'
|
|
13
|
-
|
|
14
|
-
NWKS = ['/home/azhukova/projects/bdeissct_dl/covid/wave3.days.nwk',
|
|
15
|
-
'/home/azhukova/projects/bdeissct_dl/covid/wave4.days.nwk',
|
|
16
|
-
'/home/azhukova/projects/bdeissct_dl/covid/HIV_Zurich.nwk']
|
|
17
|
-
RHOS = [0.238, 0.154, 0.25, 0.6]
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
# for nwk, rho in zip(NWKS[:2], RHOS):
|
|
21
|
-
# forest = read_forest(nwk)
|
|
22
|
-
# if 'wave' in nwk:
|
|
23
|
-
# for n in forest[0].traverse():
|
|
24
|
-
# n.dist = n.dist * 365.25 # convert to days
|
|
25
|
-
# if n.is_leaf():
|
|
26
|
-
# n.name = n.name.split('|')[0]
|
|
27
|
-
# forest[0].write(outfile=nwk.split('.')[0] + '.days.nwk')
|
|
28
|
-
# exit()
|
|
29
|
-
|
|
30
|
-
# for nwk, rho in zip(NWKS[3:], RHOS):
|
|
31
|
-
# forest = read_forest(nwk)
|
|
32
|
-
# forest[0].write(outfile=nwk.replace('nexus', 'nwk'))
|
|
33
|
-
# exit()
|
|
34
|
-
|
|
35
|
-
for nwk, rho in zip(NWKS, RHOS):
|
|
36
|
-
forest = read_forest(nwk)
|
|
37
|
-
check_sumstats(forest2sumstat_df(forest, rho), model_path=MP)
|
|
38
|
-
|
|
39
|
-
sumstat_df = forest2sumstat_df(forest, rho)
|
|
40
|
-
result_df = pd.DataFrame()
|
|
41
|
-
for model in MODELS:
|
|
42
|
-
predictions = predict_parameters(sumstat_df, model_name=model, model_path=MP)
|
|
43
|
-
print(predictions)
|
|
44
|
-
predictions.index = [model]
|
|
45
|
-
result_df = pd.concat((result_df, predictions))
|
|
46
|
-
# result_df['d_E'] = result_df['f_E'] * result_df['d']
|
|
47
|
-
# result_df['d_I'] = (1 - result_df['f_E']) * result_df['d']
|
|
48
|
-
|
|
49
|
-
forest = read_forest(nwk)
|
|
50
|
-
# resolve_forest(forest)
|
|
51
|
-
annotate_forest_with_time(forest)
|
|
52
|
-
T = get_T(T=None, forest=forest)
|
|
53
|
-
|
|
54
|
-
(la, psi, _), _ = bd_infer(forest, T, p=rho)
|
|
55
|
-
result_df.loc['BD-ML', ['R', 'd']] = [la / psi, 1 / psi]
|
|
56
|
-
# result_df.loc['BD-ML', ['R', 'd', 'd_I']] = [la / psi, 1 / psi, 1 / psi]
|
|
57
|
-
|
|
58
|
-
bdei_res, _ = bdei_infer(nwk, p=rho)
|
|
59
|
-
mu, la, psi = bdei_res.mu, bdei_res.la, bdei_res.psi
|
|
60
|
-
result_df.loc['BDEI-ML', ['R', 'd']] = [la / psi, 1 / mu + 1 / psi]
|
|
61
|
-
# result_df.loc['BDEI-ML', ['R', 'd', 'd_E', 'd_I', 'f_E']] = [
|
|
62
|
-
# la / psi,
|
|
63
|
-
# 1 / mu + 1 / psi,
|
|
64
|
-
# 1 / mu,
|
|
65
|
-
# 1 / psi,
|
|
66
|
-
# (1 / mu) / (1 / mu + 1 / psi)
|
|
67
|
-
# ]
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
for col in result_df.columns:
|
|
72
|
-
result_df[col] = result_df[col].apply(lambda x: f'{x:.2f}' if not pd.isna(x) else '')
|
|
73
|
-
|
|
74
|
-
result_df[['R', 'd']].to_csv(nwk.replace('.nwk', '.small_est').replace('.nexus', '.small_est'))
|
|
75
|
-
# result_df[['R', 'd', 'd_E', 'd_I', 'f_S', 'X_S', 'upsilon', 'X_C', 'f_E']].to_csv(nwk.replace('.nwk', '.small_est').replace('.nexus', '.small_est'))
|
|
76
|
-
|
bdeissct_dl/model_finder.py
DELETED
|
@@ -1,47 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
|
|
3
|
-
from bdeissct_dl import MODEL_FINDER_PATH
|
|
4
|
-
from bdeissct_dl.model_serializer import load_model_keras
|
|
5
|
-
from bdeissct_dl.training_model_finder import get_test_data
|
|
6
|
-
from bdeissct_dl.tree_encoder import forest2sumstat_df
|
|
7
|
-
from bdeissct_dl.tree_manager import read_forest
|
|
8
|
-
from bdeissct_dl.bdeissct_model import MODELS
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def predict_model(forest_sumstats):
|
|
12
|
-
X = get_test_data(forest_sumstats)
|
|
13
|
-
model = load_model_keras(MODEL_FINDER_PATH)
|
|
14
|
-
Y_pred = model.predict(X)
|
|
15
|
-
return pd.DataFrame(Y_pred, columns=MODELS)
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def main():
|
|
19
|
-
"""
|
|
20
|
-
Entry point for BDCT model finder with command-line arguments.
|
|
21
|
-
:return: void
|
|
22
|
-
"""
|
|
23
|
-
import argparse
|
|
24
|
-
|
|
25
|
-
parser = \
|
|
26
|
-
argparse.ArgumentParser(description="Find the BDEISSCT model flavour that could have generated this forest.")
|
|
27
|
-
parser.add_argument('--nwk', required=False, default=None, type=str, help="input tree file")
|
|
28
|
-
parser.add_argument('--p', required=False, default=0, type=float, help='sampling probability')
|
|
29
|
-
parser.add_argument('--log', required=True, type=str, help="output log file")
|
|
30
|
-
parser.add_argument('--sumstats', default=None, type=str, help="input tree file(s) encoded as sumstats")
|
|
31
|
-
params = parser.parse_args()
|
|
32
|
-
|
|
33
|
-
if not params.sumstats:
|
|
34
|
-
if params.p <= 0 or params.p > 1:
|
|
35
|
-
raise ValueError('The sampling probability must be grater than 0 and not greater than 1.')
|
|
36
|
-
|
|
37
|
-
forest = read_forest(params.nwk)
|
|
38
|
-
print(f'Read a forest of {len(forest)} trees with {sum(len(_) for _ in forest)} tips in total')
|
|
39
|
-
forest_df = forest2sumstat_df(forest, rho=params.p)
|
|
40
|
-
else:
|
|
41
|
-
forest_df = pd.read_csv(params.sumstats)
|
|
42
|
-
|
|
43
|
-
predict_model(forest_df).to_csv(params.log, header=True)
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
if '__main__' == __name__:
|
|
47
|
-
main()
|
bdeissct_dl/pinball_loss.py
DELETED
|
@@ -1,48 +0,0 @@
|
|
|
1
|
-
import tensorflow as tf
|
|
2
|
-
|
|
3
|
-
@tf.keras.utils.register_keras_serializable()
|
|
4
|
-
class MultiQuantilePinballLoss(tf.keras.losses.Loss):
|
|
5
|
-
def __init__(self, quantiles, name="multi_quantile_pinball_loss"):
|
|
6
|
-
super().__init__(name=name)
|
|
7
|
-
self.quantiles = quantiles
|
|
8
|
-
|
|
9
|
-
def call(self, y_true, y_pred):
|
|
10
|
-
"""
|
|
11
|
-
Compute the Pinball loss for multiple quantiles.
|
|
12
|
-
|
|
13
|
-
Args:
|
|
14
|
-
y_true: Tensor of true target values, shape (batch_size, n_targets).
|
|
15
|
-
y_pred: Tensor of predicted values, shape (batch_size, n_targets * n_quantiles).
|
|
16
|
-
|
|
17
|
-
Returns:
|
|
18
|
-
A scalar tensor representing the aggregated Pinball loss.
|
|
19
|
-
"""
|
|
20
|
-
|
|
21
|
-
n_quantiles = len(self.quantiles)
|
|
22
|
-
n_targets = tf.shape(y_true)[1]
|
|
23
|
-
|
|
24
|
-
# Reshape y_pred to (batch_size, n_targets, n_quantiles)
|
|
25
|
-
y_pred = tf.reshape(y_pred, [-1, n_targets, n_quantiles])
|
|
26
|
-
|
|
27
|
-
# Calculate the Pinball loss for each quantile
|
|
28
|
-
loss = 0
|
|
29
|
-
for i, tau in enumerate(self.quantiles):
|
|
30
|
-
error = y_true - y_pred[:, :, i] # Error for the i-th quantile
|
|
31
|
-
quantile_loss = tf.maximum(tau * error, (tau - 1) * error)
|
|
32
|
-
loss += tf.reduce_mean(quantile_loss)
|
|
33
|
-
|
|
34
|
-
# Return the mean loss over all quantiles
|
|
35
|
-
return loss / n_quantiles
|
|
36
|
-
|
|
37
|
-
def get_config(self):
|
|
38
|
-
# Serialize the quantiles
|
|
39
|
-
config = super().get_config()
|
|
40
|
-
config.update({
|
|
41
|
-
"quantiles": self.quantiles
|
|
42
|
-
})
|
|
43
|
-
return config
|
|
44
|
-
|
|
45
|
-
@classmethod
|
|
46
|
-
def from_config(cls, config):
|
|
47
|
-
# Deserialize the quantiles
|
|
48
|
-
return cls(quantiles=config["quantiles"])
|
bdeissct_dl/train_ct.py
DELETED
|
@@ -1,125 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
|
|
3
|
-
import numpy as np
|
|
4
|
-
import tensorflow as tf
|
|
5
|
-
from sklearn.preprocessing import StandardScaler
|
|
6
|
-
|
|
7
|
-
from bdeissct_dl import MODEL_PATH, BATCH_SIZE, EPOCHS
|
|
8
|
-
from bdeissct_dl.bdeissct_model import LA, PSI, RHO
|
|
9
|
-
from bdeissct_dl.dl_model import build_model, LEARNING_RATE, LOSS_FUNCTIONS, LOSS_WEIGHTS
|
|
10
|
-
from bdeissct_dl.model_serializer import save_model_keras, save_scaler_joblib, save_scaler_numpy
|
|
11
|
-
from bdeissct_dl.scaler_fitting import fit_scalers
|
|
12
|
-
from bdeissct_dl.training import get_data_characteristics, get_train_data
|
|
13
|
-
from bdeissct_dl.bdeissct_model import CT_EPI_COLUMNS, CT_RATE_COLUMNS
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
def build_model(target_columns, n_x, optimizer=None, metrics=None):
|
|
17
|
-
"""
|
|
18
|
-
Build a FFNN of funnel shape with 4 hidden layers.
|
|
19
|
-
We use a 50% dropout after the first 2 hidden layers.
|
|
20
|
-
This architecture follows the PhyloDeep paper [Voznica et al. Nature 2022].
|
|
21
|
-
|
|
22
|
-
:param n_x: input size (number of features)
|
|
23
|
-
:param optimizer: by default Adam with learning rate of 0.001
|
|
24
|
-
:param metrics: evaluation metrics, by default no metrics
|
|
25
|
-
:return: the model instance: tf.keras.models.Sequential
|
|
26
|
-
"""
|
|
27
|
-
|
|
28
|
-
inputs = tf.keras.Input(shape=(n_x,))
|
|
29
|
-
|
|
30
|
-
# (Your hidden layers go here)
|
|
31
|
-
x = tf.keras.layers.Dense(8, activation='elu', name=f'layer1_dense8_elu')(inputs)
|
|
32
|
-
x = tf.keras.layers.Dense(4, activation='elu', name=f'layer2_dense6_elu')(x)
|
|
33
|
-
x = tf.keras.layers.Dense(4, activation='elu', name=f'layer2_dense4_elu')(x)
|
|
34
|
-
x = tf.keras.layers.Dense(2, activation='elu', name=f'layer3_dense2_elu')(x)
|
|
35
|
-
|
|
36
|
-
outputs = {}
|
|
37
|
-
|
|
38
|
-
if LA in target_columns:
|
|
39
|
-
outputs[LA] = tf.keras.layers.Dense(1, activation="softplus", name=LA)(x) # positive values only
|
|
40
|
-
if PSI in target_columns:
|
|
41
|
-
outputs[PSI] = tf.keras.layers.Dense(1, activation="softplus", name=PSI)(x) # positive values only
|
|
42
|
-
if RHO in target_columns:
|
|
43
|
-
outputs[RHO] = tf.keras.layers.Dense(1, activation="sigmoid", name=RHO)(x)
|
|
44
|
-
|
|
45
|
-
model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
|
|
46
|
-
|
|
47
|
-
if optimizer is None:
|
|
48
|
-
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
|
|
49
|
-
|
|
50
|
-
model.compile(optimizer=optimizer,
|
|
51
|
-
loss={col: LOSS_FUNCTIONS[col] for col in outputs.keys()},
|
|
52
|
-
loss_weights={col: LOSS_WEIGHTS[col] for col in outputs.keys()},
|
|
53
|
-
metrics=metrics)
|
|
54
|
-
return model
|
|
55
|
-
|
|
56
|
-
def main():
|
|
57
|
-
"""
|
|
58
|
-
Entry point for DL model training with command-line arguments.
|
|
59
|
-
:return: void
|
|
60
|
-
"""
|
|
61
|
-
import argparse
|
|
62
|
-
|
|
63
|
-
parser = \
|
|
64
|
-
argparse.ArgumentParser(description="Train -CT model rate parameter estimator from epi parameters.")
|
|
65
|
-
parser.add_argument('--train_data', type=str, nargs='+',
|
|
66
|
-
help="path to the files where the encoded training data are stored")
|
|
67
|
-
parser.add_argument('--val_data', type=str, nargs='+',
|
|
68
|
-
help="path to the files where the encoded validation data are stored")
|
|
69
|
-
|
|
70
|
-
parser.add_argument('--epochs', type=int, default=EPOCHS, help='number of epochs to train the model')
|
|
71
|
-
parser.add_argument('--model_path', default=MODEL_PATH, type=str,
|
|
72
|
-
help="path to the folder where the trained model should be stored.")
|
|
73
|
-
params = parser.parse_args()
|
|
74
|
-
|
|
75
|
-
os.makedirs(params.model_path, exist_ok=True)
|
|
76
|
-
|
|
77
|
-
# R,f_E,f_S,X_S,upsilon,X_C,kappa,la are given
|
|
78
|
-
# psi is to be predicted
|
|
79
|
-
|
|
80
|
-
feature_columns = CT_EPI_COLUMNS
|
|
81
|
-
target_columns = CT_RATE_COLUMNS
|
|
82
|
-
# reshuffle params.train_data order
|
|
83
|
-
if len(params.train_data) > 1:
|
|
84
|
-
np.random.shuffle(params.train_data)
|
|
85
|
-
if len(params.val_data) > 1:
|
|
86
|
-
np.random.shuffle(params.val_data)
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
x_indices, y_col2index = get_data_characteristics(paths=params.train_data,
|
|
90
|
-
feature_columns=feature_columns,
|
|
91
|
-
target_columns=target_columns)
|
|
92
|
-
|
|
93
|
-
scaler_x = StandardScaler()
|
|
94
|
-
fit_scalers(paths=params.train_data, x_indices=x_indices, scaler_x=scaler_x)
|
|
95
|
-
|
|
96
|
-
if scaler_x is not None:
|
|
97
|
-
save_scaler_joblib(scaler_x, params.model_path, suffix='ct.x')
|
|
98
|
-
save_scaler_numpy(scaler_x, params.model_path, suffix='ct.x')
|
|
99
|
-
|
|
100
|
-
for col, y_idx in y_col2index.items():
|
|
101
|
-
print(f'Training to predict {col} for CT...')
|
|
102
|
-
|
|
103
|
-
model = build_model([col], n_x=len(x_indices))
|
|
104
|
-
print(f'Building a model from scratch with {len(x_indices)} input features and {col} as output.')
|
|
105
|
-
print(model.summary())
|
|
106
|
-
|
|
107
|
-
ds_train = get_train_data([col], x_indices, [y_idx], file_pattern=None, filenames=params.train_data, \
|
|
108
|
-
scaler_x=scaler_x, batch_size=BATCH_SIZE * 8, shuffle=True)
|
|
109
|
-
ds_val = get_train_data([col], x_indices, [y_idx], file_pattern=None, filenames=params.val_data, \
|
|
110
|
-
scaler_x=scaler_x, batch_size=BATCH_SIZE * 8, shuffle=True)
|
|
111
|
-
|
|
112
|
-
#early stopping to avoid overfitting
|
|
113
|
-
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=25)
|
|
114
|
-
|
|
115
|
-
#Training of the Network, with an independent validation set
|
|
116
|
-
model.fit(ds_train, verbose=1, epochs=params.epochs, validation_data=ds_val, callbacks=[early_stop])
|
|
117
|
-
|
|
118
|
-
print(f'Saving the trained model CT.{col} to {params.model_path}...')
|
|
119
|
-
|
|
120
|
-
save_model_keras(model, path=params.model_path, model_name=f'CT.{col}')
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
if '__main__' == __name__:
|
|
125
|
-
main()
|