bdext 0.1.65-py3-none-any.whl → 0.1.66-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- README.md +165 -103
- bdeissct_dl/__init__.py +1 -3
- bdeissct_dl/bdeissct_model.py +11 -65
- bdeissct_dl/dl_model.py +7 -119
- bdeissct_dl/estimator.py +8 -108
- bdeissct_dl/model_serializer.py +3 -33
- bdeissct_dl/scaler_fitting.py +3 -6
- bdeissct_dl/sumstat_checker.py +2 -2
- bdeissct_dl/training.py +9 -30
- bdeissct_dl/tree_encoder.py +13 -32
- bdext-0.1.66.dist-info/METADATA +240 -0
- bdext-0.1.66.dist-info/RECORD +17 -0
- {bdext-0.1.65.dist-info → bdext-0.1.66.dist-info}/entry_points.txt +0 -2
- bdeissct_dl/estimator_ct.py +0 -63
- bdeissct_dl/main_covid.py +0 -76
- bdeissct_dl/model_finder.py +0 -47
- bdeissct_dl/pinball_loss.py +0 -48
- bdeissct_dl/train_ct.py +0 -125
- bdext-0.1.65.dist-info/METADATA +0 -178
- bdext-0.1.65.dist-info/RECORD +0 -22
- {bdext-0.1.65.dist-info → bdext-0.1.66.dist-info}/LICENSE +0 -0
- {bdext-0.1.65.dist-info → bdext-0.1.66.dist-info}/WHEEL +0 -0
- {bdext-0.1.65.dist-info → bdext-0.1.66.dist-info}/top_level.txt +0 -0
bdeissct_dl/estimator.py
CHANGED
@@ -1,100 +1,13 @@
-import numpy as np
 import pandas as pd
 
 from bdeissct_dl import MODEL_PATH
-from bdeissct_dl.bdeissct_model import MODEL2TARGET_COLUMNS, BD, MODELS
-    MODEL_FINDER, F_S, X_S, X_C, UPSILON, UPS_X_C, F_S_X_S, F_E
+from bdeissct_dl.bdeissct_model import MODEL2TARGET_COLUMNS, BD, MODELS
 from bdeissct_dl.model_serializer import load_model_keras, load_scaler_numpy
 from bdeissct_dl.training import get_test_data
 from bdeissct_dl.tree_encoder import forest2sumstat_df, scale_back
 from bdeissct_dl.tree_manager import read_forest
 
 
-def predict_parameters_mf(forest_sumstats, model_name=MODEL_FINDER, model_path=MODEL_PATH):
-    n_forests = len(forest_sumstats)
-    n_models = len(MODELS)
-
-    if MODEL_FINDER == model_name:
-        import bdeissct_dl.training_model_finder
-        X = bdeissct_dl.training_model_finder.get_test_data(df=forest_sumstats)
-        model_weights = load_model_keras(model_path, model_name).predict(X)
-    else:
-        model_weights = np.zeros((n_forests, n_models), dtype=float)
-        model_weights[:, MODELS.index(model_name)] = 1
-
-    scaler_x = load_scaler_numpy(model_path, suffix='x')
-    X, SF = get_test_data(dfs=[forest_sumstats], scaler_x=scaler_x)
-
-    results = []
-
-    # result = pd.DataFrame(index=np.arange(X.shape[0]))
-
-    model_ids = [i for i in range(n_models) if not np.all(model_weights[:, i] == 0)]
-    for model_id in model_ids:
-        model_name = MODELS[model_id]
-
-        X_cur, SF_cur = np.array(X), np.array(SF)
-
-        model = load_model_keras(model_path, model_name)
-        Y_pred = model.predict(X_cur)
-
-        target_columns = MODEL2TARGET_COLUMNS[model_name]
-        if F_S_X_S in Y_pred:
-            if F_S in target_columns:
-                Y_pred[F_S] = Y_pred[F_S_X_S][:, 0]
-            if X_S in target_columns:
-                Y_pred[X_S] = Y_pred[F_S_X_S][:, 1]
-            del Y_pred[F_S_X_S]
-        if UPS_X_C in Y_pred:
-            if UPSILON in target_columns:
-                Y_pred[UPSILON] = Y_pred[UPS_X_C][:, 0]
-            if X_C in target_columns:
-                Y_pred[X_C] = Y_pred[UPS_X_C][:, 1]
-            del Y_pred[UPS_X_C]
-
-        for col in target_columns:
-            if len(Y_pred[col].shape) == 2 and Y_pred[col].shape[1] == 1:
-                Y_pred[col] = Y_pred[col].squeeze(axis=1)
-
-        scale_back(Y_pred, SF_cur)
-
-        results.append(pd.DataFrame.from_dict(Y_pred, orient='columns'))
-
-    if len(model_ids) == 1:
-        result = results[0]
-    else:
-        bdei_ids = {_[0] for _ in enumerate(model_ids) if 'EI' in MODELS[_[1]]}
-        bdss_ids = {_[0] for _ in enumerate(model_ids) if 'SS' in MODELS[_[1]]}
-        ct_ids = {_[0] for _ in enumerate(model_ids) if 'CT' in MODELS[_[1]]}
-
-        if ct_ids and len(ct_ids) < len(model_ids):
-            for idx in range(len(model_ids)):
-                if idx not in ct_ids:
-                    results[idx].loc[:, UPSILON] = 0
-                    results[idx].loc[:, X_C] = 1
-
-        if bdei_ids and len(bdei_ids) < len(model_ids):
-            for idx in range(len(model_ids)):
-                if idx not in bdei_ids:
-                    results[idx].loc[:, F_E] = 0
-
-        if bdss_ids and len(bdss_ids) < len(model_ids):
-            for idx in range(len(model_ids)):
-                if not idx in bdss_ids:
-                    results[idx].loc[:, F_S] = 0
-                    results[idx].loc[:, X_S] = 1
-
-        columns = results[0].columns
-        result = pd.DataFrame(index=forest_sumstats.index)
-        for col in columns:
-            predictions = np.array([res[col].to_numpy(dtype=float, na_value=0) for res in results]).T
-            weights = model_weights[:, model_ids]
-            result[col] = np.average(predictions, weights=weights, axis=1)
-
-    return result
-
-
-
 def predict_parameters(forest_sumstats, model_name=BD, model_path=MODEL_PATH):
     scaler_x = load_scaler_numpy(model_path, suffix='x')
     X, SF = get_test_data(dfs=[forest_sumstats], scaler_x=scaler_x)
@@ -106,20 +19,9 @@ def predict_parameters(forest_sumstats, model_name=BD, model_path=MODEL_PATH):
         model = load_model_keras(model_path, f'{model_name}.{col}')
         Y_pred = model.predict(X)
 
-
-        # if F_S in target_columns:
-        #     Y_pred[F_S] = Y_pred[F_S_X_S][:, 0]
-        #     Y_pred[X_S] = Y_pred[F_S_X_S][:, 1]
-        #     del Y_pred[F_S_X_S]
-        # if UPSILON in target_columns:
-        #     Y_pred[UPSILON] = Y_pred[UPS_X_C][:, 0]
-        #     Y_pred[X_C] = Y_pred[UPS_X_C][:, 1]
-        #     del Y_pred[UPS_X_C]
-
         if len(Y_pred[col].shape) == 2 and Y_pred[col].shape[1] == 1:
             Y_pred[col] = Y_pred[col].squeeze(axis=1)
 
-        print(Y_pred)
         scale_back(Y_pred, SF)
         res_df = pd.DataFrame.from_dict(Y_pred, orient='columns')
         result = result.join(res_df, how='outer') if result is not None else res_df
@@ -136,19 +38,17 @@ def main():
 
     parser = \
         argparse.ArgumentParser(description="Estimate BD(EI)(SS)(CT) model parameters.")
-    parser.add_argument('--model_name', choices=MODELS
-                        help=f'BDEISSCT model flavour
-                        f'model finder will be used to pick the model.')
+    parser.add_argument('--model_name', choices=MODELS, default=BD, type=str,
+                        help=f'BDEISSCT model flavour')
     parser.add_argument('--model_path', default=MODEL_PATH,
                         help='By default our pretrained BD(EI)(SS)(CT) models are used, '
                              'but it is possible to specify a path to a custom folder here, '
                              'containing files "<model_name>.keras" (with the model), '
-                             'and scaler-related files to rescale the input data X
-                             '
+                             'and scaler-related files to rescale the input data X: '
+                             '"data_scalerx_mean.npy", "data_scalerx_scale.npy", "data_scalerx_var.npy" '
                              '(unpickled numpy-saved arrays), '
                              'and "data_scalerx_n_samples_seen.txt" '
-                             'a text file containing the number of examples in the training set).
-                             'For Y the file names are the same, just x replaced by y, e.g., "data_scalery_mean.npy".'
+                             'a text file containing the number of examples in the training set).'
                         )
     parser.add_argument('--p', default=0, type=float, help='sampling probability')
     parser.add_argument('--log', default=None, type=str, help="output log file")
@@ -159,10 +59,10 @@ def main():
 
     if not params.sumstats:
        if params.p <= 0 or params.p > 1:
-            raise ValueError('The sampling probability must be
+            raise ValueError('The sampling probability must be between 0 (exclusive) and 1 (inclusive).')
 
        forest = read_forest(params.nwk)
-        print(f'Read a
+        print(f'Read a tree with {sum(len(_) for _ in forest)} tips.')
        forest_df = forest2sumstat_df(forest, rho=params.p)
    else:
        forest_df = pd.read_csv(params.sumstats)
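Note: the removed predict_parameters_mf combined per-model predictions into a single estimate per forest, weighted by the model-finder probabilities (the np.average(predictions, weights=weights, axis=1) step above). A minimal standalone sketch of that averaging step, with toy values that are not from the package:

import numpy as np

# Two forests, two candidate models: each column holds one model's prediction
# of the same parameter; each row of the weights holds that forest's
# model-finder probabilities.
predictions = np.array([[2.1, 2.4],
                        [1.8, 1.5]])
model_weights = np.array([[0.7, 0.3],
                          [0.2, 0.8]])
averaged = np.average(predictions, weights=model_weights, axis=1)
print(averaged)  # [2.19 1.56]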
bdeissct_dl/model_serializer.py
CHANGED
@@ -1,8 +1,8 @@
 
-import tensorflow as tf
 import os
-
+
 import numpy as np
+import tensorflow as tf
 from sklearn.preprocessing import StandardScaler
 
 from bdeissct_dl.dl_model import relu_plus_one, half_sigmoid, loss_ct, loss_ss, CTLayer, SSLayer, loss_prob
@@ -18,8 +18,7 @@ def save_model_keras(model, path, model_name):
 def load_model_keras(path, model_name):
     tf.keras.config.enable_unsafe_deserialization()
     return tf.keras.models.load_model(os.path.join(path, f'{model_name}.keras'),
-                                      custom_objects={"
-                                      "relu_plus_one": relu_plus_one, "half_sigmoid": half_sigmoid, "CTLayer": CTLayer, "SSLayer": SSLayer})
+                                      custom_objects={"relu_plus_one": relu_plus_one, "half_sigmoid": half_sigmoid})
 
 def save_model_h5(model, path, model_name):
     model.save(os.path.join(path, f'{model_name}.h5'), overwrite=True, zipped=True)
@@ -38,35 +37,6 @@ def load_model_json(path, model_name):
     model.load_weights(os.path.join(path, f'{model_name}.weights.h5'))
     return model
 
-def save_model_onnx(model, path, model_name):
-    import tf2onnx
-    import onnx
-
-    input_signature = [tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype, name='x')]
-    model.output_names = ['output']
-    onnx_model, _ = tf2onnx.convert.from_keras(model, input_signature=input_signature)
-    onnx.save(onnx_model, os.path.join(path, f'{model_name}.onnx'))
-
-def load_model_onnx(path, model_name):
-    """
-    TODO: this does not work due to onnx vs keras naming issues
-    (keras does not accept slashes in names that onnx creates)
-
-    :param path:
-    :return:
-    """
-    import onnx
-    from onnx2keras import onnx_to_keras
-    onnx_model = onnx.load(os.path.join(path, f'{model_name}.onnx'))
-    return onnx_to_keras(onnx_model, ['x'])
-
-def save_scaler_joblib(scaler, prefix, suffix=''):
-    joblib.dump(scaler, os.path.join(prefix, f'data_scaler{suffix}.gz'))
-
-def load_scaler_joblib(prefix, suffix=''):
-    return joblib.load(os.path.join(prefix, f'data_scaler{suffix}.gz')) \
-        if os.path.exists(os.path.join(prefix, f'data_scaler{suffix}.gz')) else None
-
 def save_scaler_numpy(scaler, prefix, suffix=''):
     np.save(os.path.join(prefix, f'data_scaler{suffix}_mean.npy'), scaler.mean_, allow_pickle=False)
     np.save(os.path.join(prefix, f'data_scaler{suffix}_scale.npy'), scaler.scale_, allow_pickle=False)
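Note: with the joblib and ONNX paths removed, the scaler now round-trips through plain numpy arrays only. A minimal sketch of that round-trip under the file layout named in the estimator's --model_path help (random data, for illustration only):

import numpy as np
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(np.random.rand(100, 5))
np.save('data_scalerx_mean.npy', scaler.mean_, allow_pickle=False)
np.save('data_scalerx_scale.npy', scaler.scale_, allow_pickle=False)

# transform() only needs mean_ and scale_, so a scaler rebuilt from the
# saved arrays is immediately usable:
restored = StandardScaler()
restored.mean_ = np.load('data_scalerx_mean.npy')
restored.scale_ = np.load('data_scalerx_scale.npy')
restored.n_features_in_ = restored.mean_.shape[0]  # lets newer sklearn validate input width
X_scaled = restored.transform(np.random.rand(3, 5))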
bdeissct_dl/scaler_fitting.py
CHANGED
@@ -4,9 +4,8 @@ import pandas as pd
 from sklearn.preprocessing import StandardScaler
 
 from bdeissct_dl import MODEL_PATH
-from bdeissct_dl.bdeissct_model import
-
-from bdeissct_dl.model_serializer import save_scaler_joblib, save_scaler_numpy
+from bdeissct_dl.bdeissct_model import TARGET_COLUMNS_BDEISSCT
+from bdeissct_dl.model_serializer import save_scaler_numpy
 from bdeissct_dl.training import get_data_characteristics
 
 
@@ -28,8 +27,7 @@ def main():
     parser = \
         argparse.ArgumentParser(description="Fit a BD(EI)(SS)(CT) data scaler.")
     parser.add_argument('--train_data', type=str, nargs='+',
-                        default=[f'/home/azhukova/projects/bdeissct_dl/simulations_bdeissct/training/500_1000/{model}/{i}/trees.csv.xz' for i in range(120) for model in [BD, BDCT, BDEI, BDEICT, BDSS, BDSSCT, BDEISS, BDEISSCT]]
-                        ,
+                        # default=[f'/home/azhukova/projects/bdeissct_dl/simulations_bdeissct/training/500_1000/{model}/{i}/trees.csv.xz' for i in range(120) for model in [BD, BDCT, BDEI, BDEICT, BDSS, BDSSCT, BDEISS, BDEISSCT]],
                        help="path to the files where the encoded training data are stored")
     parser.add_argument('--model_path', default=MODEL_PATH, type=str,
                         help="path to the folder where the scaler should be stored.")
@@ -43,7 +41,6 @@ def main():
     fit_scalers(paths=params.train_data, x_indices=x_indices, scaler_x=scaler_x)
 
     if scaler_x is not None:
-        save_scaler_joblib(scaler_x, params.model_path, suffix='x')
         save_scaler_numpy(scaler_x, params.model_path, suffix='x')
 
 
bdeissct_dl/sumstat_checker.py
CHANGED
@@ -1,5 +1,5 @@
 from bdeissct_dl import MODEL_PATH
-from bdeissct_dl.training import get_test_data,
+from bdeissct_dl.training import get_test_data, FEATURE_COLUMNS
 from bdeissct_dl.tree_encoder import forest2sumstat_df
 from bdeissct_dl.tree_manager import read_forest
 from bdeissct_dl.model_serializer import load_scaler_numpy
@@ -9,7 +9,7 @@ def check_sumstats(forest_sumstats, model_path=MODEL_PATH):
     scaler_x = load_scaler_numpy(model_path, suffix='x')
     X, SF = get_test_data(dfs=[forest_sumstats], scaler_x=scaler_x)
 
-    feature_columns =
+    feature_columns = FEATURE_COLUMNS
 
     for i in range(len(feature_columns)):
         value = X[0, i]
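Note: the diff cuts off after `value = X[0, i]`, so the rest of check_sumstats is not shown. Since X has been standardized by the training scaler, one plausible continuation (an assumption, not the package's code) is to flag statistics whose z-score falls far outside the training distribution:

import numpy as np

def flag_outlier_sumstats(X, feature_columns, threshold=3.0):
    # returns (name, scaled value) pairs for features that look unlike the training data
    return [(name, float(X[0, i]))
            for i, name in enumerate(feature_columns)
            if np.abs(X[0, i]) > threshold]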
bdeissct_dl/training.py
CHANGED
@@ -6,8 +6,8 @@ import pandas as pd
 import tensorflow as tf
 
 from bdeissct_dl import MODEL_PATH, BATCH_SIZE, EPOCHS
-from bdeissct_dl.bdeissct_model import MODEL2TARGET_COLUMNS, UPSILON, X_C, KAPPA,
-    X_S, TARGET_COLUMNS_BDCT,
+from bdeissct_dl.bdeissct_model import MODEL2TARGET_COLUMNS, UPSILON, X_C, KAPPA, INCUBATION_PERIOD, F_S, \
+    X_S, TARGET_COLUMNS_BDCT, REPRODUCTIVE_NUMBER, INFECTION_DURATION
 from bdeissct_dl.dl_model import build_model
 from bdeissct_dl.model_serializer import save_model_keras, load_scaler_numpy, \
     load_model_keras
@@ -16,7 +16,7 @@ from bdeissct_dl.tree_encoder import SCALING_FACTOR, STATS
 FEATURE_COLUMNS = [_ for _ in STATS if _ not in {'n_trees', 'n_tips', 'n_inodes', 'len_forest',
                                                  REPRODUCTIVE_NUMBER, INFECTION_DURATION,
                                                  UPSILON, X_C, KAPPA,
-
+                                                 INCUBATION_PERIOD,
                                                  F_S, X_S,
                                                  SCALING_FACTOR}]
 
@@ -29,14 +29,10 @@ def calc_validation_fraction(m):
     return 0.01
 
 
-def get_X_columns(columns):
-    return FEATURE_COLUMNS
-
-
 def get_test_data(dfs=None, paths=None, scaler_x=None):
     if not dfs:
         dfs = [pd.read_csv(path) for path in paths]
-    feature_columns =
+    feature_columns = FEATURE_COLUMNS
 
     Xs, SFs = [], []
     for df in dfs:
@@ -53,12 +49,11 @@ def get_test_data(dfs=None, paths=None, scaler_x=None):
     return X, SF
 
 
-def get_data_characteristics(paths, target_columns=TARGET_COLUMNS_BDCT, feature_columns=
+def get_data_characteristics(paths, target_columns=TARGET_COLUMNS_BDCT, feature_columns=FEATURE_COLUMNS):
     col2index_y = {}
     col2index_x = {}
 
     df = pd.read_csv(paths[0])
-    feature_columns = get_X_columns(df.columns) if feature_columns is None else feature_columns
     feature_column_set = set(feature_columns)
     target_columns = target_columns if target_columns is not None else []
     target_column_set = set(target_columns)
@@ -109,36 +104,21 @@ def get_train_data(target_columns, columns_x, columns_y, file_pattern=None, file
     if INFECTION_DURATION in target_columns:
         train_labels[INFECTION_DURATION] = Y[:, col_i]
         col_i += 1
-    # if UPSILON in target_columns:
-    #     train_labels[UPS_X_C] = Y[:, col_i: (col_i + 2)]
-    #     col_i += 2
     if UPSILON in target_columns:
         train_labels[UPSILON] = Y[:, col_i]
         col_i += 1
     if X_C in target_columns:
         train_labels[X_C] = Y[:, col_i]
         col_i += 1
-    if
-        train_labels[
+    if INCUBATION_PERIOD in target_columns:
+        train_labels[INCUBATION_PERIOD] = Y[:, col_i]
         col_i += 1
-    # if F_S in target_columns:
-    #     train_labels[F_S_X_S] = Y[:, col_i: (col_i + 2)]
-    #     col_i += 2
     if F_S in target_columns:
         train_labels[F_S] = Y[:, col_i]
         col_i += 1
     if X_S in target_columns:
         train_labels[X_S] = Y[:, col_i]
         col_i += 1
-    if LA in target_columns:
-        train_labels[LA] = Y[:, col_i]
-        col_i += 1
-    if PSI in target_columns:
-        train_labels[PSI] = Y[:, col_i]
-        col_i += 1
-    if RHO in target_columns:
-        train_labels[RHO] = Y[:, col_i]
-        col_i += 1
 
     dataset = tf.data.Dataset.from_tensor_slices((X, train_labels))
 
@@ -209,9 +189,9 @@ def main():
         print(model.summary())
 
         ds_train = get_train_data([col], x_indices, [y_idx], file_pattern=None, filenames=params.train_data, \
-                                  scaler_x=scaler_x, batch_size=BATCH_SIZE
+                                  scaler_x=scaler_x, batch_size=BATCH_SIZE, shuffle=True)
         ds_val = get_train_data([col], x_indices, [y_idx], file_pattern=None, filenames=params.val_data, \
-                                scaler_x=scaler_x, batch_size=BATCH_SIZE
+                                scaler_x=scaler_x, batch_size=BATCH_SIZE, shuffle=True)
 
         #early stopping to avoid overfitting
         early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=25)
@@ -220,7 +200,6 @@ def main():
         model.fit(ds_train, verbose=1, epochs=params.epochs, validation_data=ds_val, callbacks=[early_stop])
 
         print(f'Saving the trained model {params.model_name}.{col} to {params.model_path}...')
-
         save_model_keras(model, path=params.model_path, model_name=f'{params.model_name}.{col}')
 
 
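Note: training labels are handed to tf.data as a dict keyed by target-column name, one entry per output head (see get_train_data above). A self-contained toy sketch of that pipeline shape; the feature width and the 'R_naught' key are made up for illustration:

import numpy as np
import tensorflow as tf

X = np.random.rand(32, 10).astype('float32')
train_labels = {'R_naught': np.random.rand(32).astype('float32')}
# dict-valued labels let Keras route each entry to the matching named output
dataset = tf.data.Dataset.from_tensor_slices((X, train_labels)).shuffle(32).batch(8)
for batch_x, batch_y in dataset.take(1):
    print(batch_x.shape, batch_y['R_naught'].shape)  # (8, 10) (8,)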
bdeissct_dl/tree_encoder.py
CHANGED
@@ -2,7 +2,6 @@ import io
 import os
 from glob import iglob
 
-import numpy as np
 import pandas as pd
 from treesumstats import FeatureCalculator, FeatureRegistry, FeatureManager
 from treesumstats.balance_sumstats import BalanceFeatureCalculator
@@ -13,8 +12,8 @@ from treesumstats.ltt_sumstats import LTTFeatureCalculator
 from treesumstats.subtree_sumstats import SubtreeFeatureCalculator
 from treesumstats.transmission_chain_sumstats import TransmissionChainFeatureCalculator
 
-from bdeissct_dl.bdeissct_model import RHO,
-    TIME_PARAMETERS,
+from bdeissct_dl.bdeissct_model import RHO, UPSILON, X_C, KAPPA, F_S, X_S, RATE_PARAMETERS, \
+    TIME_PARAMETERS, INFECTION_DURATION, REPRODUCTIVE_NUMBER, INCUBATION_PERIOD
 from bdeissct_dl.tree_manager import read_forest, rescale_forest_to_avg_brlen
 
 TARGET_AVG_BL = 1
@@ -72,14 +71,14 @@ def parse_parameters(log):
         R = df.loc[i, REPRODUCTIVE_NUMBER]
         d = df.loc[i, INFECTION_DURATION]
         rho = df.loc[i, RHO]
-
+        d_inc = df.loc[i, INCUBATION_PERIOD] if INCUBATION_PERIOD in df.columns else 0
         f_ss = df.loc[i, F_S] if F_S in df.columns else 0
         x_ss = df.loc[i, X_S] if X_S in df.columns else 1
         upsilon = df.loc[i, UPSILON] if UPSILON in df.columns else 0
         x_c = df.loc[i, X_C] if X_C in df.columns else 1
         kappa = df.loc[i, KAPPA] if KAPPA in df.columns else 0
 
-        yield R, d, rho,
+        yield R, d, rho, d_inc, f_ss, x_ss, upsilon, x_c, kappa
 
 
 class BDEISSCTFeatureCalculator(FeatureCalculator):
@@ -89,7 +88,7 @@ class BDEISSCTFeatureCalculator(FeatureCalculator):
         pass
 
     def feature_names(self):
-        return [REPRODUCTIVE_NUMBER, INFECTION_DURATION, RHO,
+        return [REPRODUCTIVE_NUMBER, INFECTION_DURATION, RHO, INCUBATION_PERIOD, F_S, X_S, UPSILON, X_C, KAPPA, \
                 SCALING_FACTOR]
 
     def set_forest(self, forest, **kwargs):
@@ -99,12 +98,6 @@ class BDEISSCTFeatureCalculator(FeatureCalculator):
         return kwargs[feature_name] if feature_name in kwargs else None
 
     def help(self, feature_name, *args, **kwargs):
-        if LA == feature_name:
-            return 'transmission rate.'
-        if LA_AVG == feature_name:
-            return 'average transmission rate.'
-        if PSI == feature_name:
-            return 'removal rate.'
         if RHO == feature_name:
             return 'sampling probability.'
         if UPSILON == feature_name:
@@ -117,26 +110,14 @@ class BDEISSCTFeatureCalculator(FeatureCalculator):
             return 'super-spreading ratio.'
         if F_S == feature_name:
             return 'fraction of super-spreaders.'
-        if F_E == feature_name:
-            return 'fraction of incubation over total infected-to-removed time.'
-        if PI_E == feature_name:
-            return 'fraction of unnotified exposed individuals'
-        if PI_EC == feature_name:
-            return 'fraction of notified exposed individuals'
-        if PI_I == feature_name:
-            return 'fraction of unnotified infectious regular spreaders'
-        if PI_IC == feature_name:
-            return 'fraction of notified infectious regular spreaders'
-        if PI_S == feature_name:
-            return 'fraction of unnotified infectious superpreaders'
-        if PI_SC == feature_name:
-            return 'fraction of notified infectious superspreaders'
         if SCALING_FACTOR == feature_name:
             return 'tree scaling factor.'
         if REPRODUCTIVE_NUMBER == feature_name:
             return 'reproduction number.'
         if INFECTION_DURATION == feature_name:
             return 'infection duration.'
+        if INCUBATION_PERIOD == feature_name:
+            return 'incubation period.'
         return None
 
 
@@ -252,14 +233,14 @@ TIME_DIFF_STATS = ['time_diff_in_2_real_mean', 'time_diff_in_3L_real_mean', 'tim
 
 EPI_STATS = [REPRODUCTIVE_NUMBER, INFECTION_DURATION, RHO,
              UPSILON, X_C, KAPPA,
-
+             INCUBATION_PERIOD,
              F_S, X_S]
 
 STATS = ['n_tips'] \
         + BRLEN_STATS + TIME_STATS + CHAIN_STATS + LTT_STATS + BALANCE_STATS + TOPOLOGY_STATS + TIME_DIFF_STATS \
         + EPI_STATS + [SCALING_FACTOR]
 
-def forest2sumstat_df(forest, rho, R=0, d=0, x_c=0, upsilon=0, kappa=1,
+def forest2sumstat_df(forest, rho, R=0, d=0, x_c=0, upsilon=0, kappa=1, d_inc=0, f_ss=0, x_ss=1,
                       target_avg_brlen=TARGET_AVG_BL):
     """
     Rescales the input forest to have mean branch lengths of 1, calculates its summary statistics,
@@ -269,7 +250,7 @@ def forest2sumstat_df(forest, rho, R=0, d=0, x_c=0, upsilon=0, kappa=1, f_e=0, f
     :param x_ss: presumed superspreading ratio (how many times superspreader's transmission rate is higher
         than that of a standard spreader, 1 by default)
     :param f_ss: presumed fraction of superspreaders in the infectious population (0 by default)
-    :param
+    :param d_inc: presumed incubation period length (0 by default)
     :param forest: list(ete3.Tree) forest to encode
     :param rho: presumed sampling probability
     :param upsilon: presumed notification probability
@@ -286,7 +267,7 @@ def forest2sumstat_df(forest, rho, R=0, d=0, x_c=0, upsilon=0, kappa=1, f_e=0, f
 
     kwargs = {SCALING_FACTOR: scaling_factor,
               REPRODUCTIVE_NUMBER: R, INFECTION_DURATION: d, RHO: rho,
-
+              INCUBATION_PERIOD: d_inc,
              F_S: f_ss, X_S: x_ss,
              X_C: x_c, UPSILON: upsilon, KAPPA: kappa}
     scale(kwargs, scaling_factor)
@@ -337,11 +318,11 @@ def save_forests_as_sumstats(output, nwks=None, logs=None, patterns=None, target
     for ps, forest in zip(parameters, forests):
 
         scaling_factor = rescale_forest_to_avg_brlen(forest, target_avg_length=target_avg_brlen)
-        R, d, rho,
+        R, d, rho, d_inc, f_ss, x_ss, upsilon, x_c, kappa = ps
         kwargs = {SCALING_FACTOR: scaling_factor}
         kwargs[REPRODUCTIVE_NUMBER], kwargs[INFECTION_DURATION], kwargs[RHO] = R, d, rho
         kwargs[UPSILON], kwargs[KAPPA], kwargs[X_C] = upsilon, kappa, x_c
-        kwargs[
+        kwargs[INCUBATION_PERIOD] = d_inc
         kwargs[F_S], kwargs[X_S] = f_ss, x_ss
 
         scale(kwargs, scaling_factor)
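Note: forest2sumstat_df's new signature takes d_inc, f_ss and x_ss in place of the removed f_e-style parameters of 0.1.65. A usage sketch against the new signature ('tree.nwk' is a placeholder path; the parameter values are illustrative, not recommendations):

from bdeissct_dl.tree_manager import read_forest
from bdeissct_dl.tree_encoder import forest2sumstat_df

forest = read_forest('tree.nwk')
# rho is the presumed sampling probability; d_inc the presumed incubation period
sumstat_df = forest2sumstat_df(forest, rho=0.5, d_inc=10, f_ss=0.1, x_ss=5)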