bdext 0.1.65__py3-none-any.whl → 0.1.67__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
bdeissct_dl/estimator.py CHANGED
@@ -1,100 +1,13 @@
-import numpy as np
 import pandas as pd
 
 from bdeissct_dl import MODEL_PATH
-from bdeissct_dl.bdeissct_model import MODEL2TARGET_COLUMNS, BD, MODELS, \
-    MODEL_FINDER, F_S, X_S, X_C, UPSILON, UPS_X_C, F_S_X_S, F_E
+from bdeissct_dl.bdeissct_model import MODEL2TARGET_COLUMNS, BD, MODELS
 from bdeissct_dl.model_serializer import load_model_keras, load_scaler_numpy
 from bdeissct_dl.training import get_test_data
 from bdeissct_dl.tree_encoder import forest2sumstat_df, scale_back
 from bdeissct_dl.tree_manager import read_forest
 
 
-def predict_parameters_mf(forest_sumstats, model_name=MODEL_FINDER, model_path=MODEL_PATH):
-    n_forests = len(forest_sumstats)
-    n_models = len(MODELS)
-
-    if MODEL_FINDER == model_name:
-        import bdeissct_dl.training_model_finder
-        X = bdeissct_dl.training_model_finder.get_test_data(df=forest_sumstats)
-        model_weights = load_model_keras(model_path, model_name).predict(X)
-    else:
-        model_weights = np.zeros((n_forests, n_models), dtype=float)
-        model_weights[:, MODELS.index(model_name)] = 1
-
-    scaler_x = load_scaler_numpy(model_path, suffix='x')
-    X, SF = get_test_data(dfs=[forest_sumstats], scaler_x=scaler_x)
-
-    results = []
-
-    # result = pd.DataFrame(index=np.arange(X.shape[0]))
-
-    model_ids = [i for i in range(n_models) if not np.all(model_weights[:, i] == 0)]
-    for model_id in model_ids:
-        model_name = MODELS[model_id]
-
-        X_cur, SF_cur = np.array(X), np.array(SF)
-
-        model = load_model_keras(model_path, model_name)
-        Y_pred = model.predict(X_cur)
-
-        target_columns = MODEL2TARGET_COLUMNS[model_name]
-        if F_S_X_S in Y_pred:
-            if F_S in target_columns:
-                Y_pred[F_S] = Y_pred[F_S_X_S][:, 0]
-            if X_S in target_columns:
-                Y_pred[X_S] = Y_pred[F_S_X_S][:, 1]
-            del Y_pred[F_S_X_S]
-        if UPS_X_C in Y_pred:
-            if UPSILON in target_columns:
-                Y_pred[UPSILON] = Y_pred[UPS_X_C][:, 0]
-            if X_C in target_columns:
-                Y_pred[X_C] = Y_pred[UPS_X_C][:, 1]
-            del Y_pred[UPS_X_C]
-
-        for col in target_columns:
-            if len(Y_pred[col].shape) == 2 and Y_pred[col].shape[1] == 1:
-                Y_pred[col] = Y_pred[col].squeeze(axis=1)
-
-        scale_back(Y_pred, SF_cur)
-
-        results.append(pd.DataFrame.from_dict(Y_pred, orient='columns'))
-
-    if len(model_ids) == 1:
-        result = results[0]
-    else:
-        bdei_ids = {_[0] for _ in enumerate(model_ids) if 'EI' in MODELS[_[1]]}
-        bdss_ids = {_[0] for _ in enumerate(model_ids) if 'SS' in MODELS[_[1]]}
-        ct_ids = {_[0] for _ in enumerate(model_ids) if 'CT' in MODELS[_[1]]}
-
-        if ct_ids and len(ct_ids) < len(model_ids):
-            for idx in range(len(model_ids)):
-                if idx not in ct_ids:
-                    results[idx].loc[:, UPSILON] = 0
-                    results[idx].loc[:, X_C] = 1
-
-        if bdei_ids and len(bdei_ids) < len(model_ids):
-            for idx in range(len(model_ids)):
-                if idx not in bdei_ids:
-                    results[idx].loc[:, F_E] = 0
-
-        if bdss_ids and len(bdss_ids) < len(model_ids):
-            for idx in range(len(model_ids)):
-                if not idx in bdss_ids:
-                    results[idx].loc[:, F_S] = 0
-                    results[idx].loc[:, X_S] = 1
-
-        columns = results[0].columns
-        result = pd.DataFrame(index=forest_sumstats.index)
-        for col in columns:
-            predictions = np.array([res[col].to_numpy(dtype=float, na_value=0) for res in results]).T
-            weights = model_weights[:, model_ids]
-            result[col] = np.average(predictions, weights=weights, axis=1)
-
-    return result
-
-
-
 def predict_parameters(forest_sumstats, model_name=BD, model_path=MODEL_PATH):
     scaler_x = load_scaler_numpy(model_path, suffix='x')
     X, SF = get_test_data(dfs=[forest_sumstats], scaler_x=scaler_x)
@@ -106,20 +19,9 @@ def predict_parameters(forest_sumstats, model_name=BD, model_path=MODEL_PATH):
         model = load_model_keras(model_path, f'{model_name}.{col}')
         Y_pred = model.predict(X)
 
-
-        # if F_S in target_columns:
-        #     Y_pred[F_S] = Y_pred[F_S_X_S][:, 0]
-        #     Y_pred[X_S] = Y_pred[F_S_X_S][:, 1]
-        #     del Y_pred[F_S_X_S]
-        # if UPSILON in target_columns:
-        #     Y_pred[UPSILON] = Y_pred[UPS_X_C][:, 0]
-        #     Y_pred[X_C] = Y_pred[UPS_X_C][:, 1]
-        #     del Y_pred[UPS_X_C]
-
         if len(Y_pred[col].shape) == 2 and Y_pred[col].shape[1] == 1:
             Y_pred[col] = Y_pred[col].squeeze(axis=1)
 
-        print(Y_pred)
         scale_back(Y_pred, SF)
         res_df = pd.DataFrame.from_dict(Y_pred, orient='columns')
         result = result.join(res_df, how='outer') if result is not None else res_df
@@ -136,19 +38,17 @@ def main():
 
     parser = \
         argparse.ArgumentParser(description="Estimate BD(EI)(SS)(CT) model parameters.")
-    parser.add_argument('--model_name', choices=MODELS + (MODEL_FINDER,), default=BD, type=str,
-                        help=f'BDEISSCT model flavour. If {MODEL_FINDER} is specified, '
-                             f'model finder will be used to pick the model.')
+    parser.add_argument('--model_name', choices=MODELS, default=BD, type=str,
+                        help=f'BDEISSCT model flavour')
     parser.add_argument('--model_path', default=MODEL_PATH,
                         help='By default our pretrained BD(EI)(SS)(CT) models are used, '
                              'but it is possible to specify a path to a custom folder here, '
                              'containing files "<model_name>.keras" (with the model), '
-                             'and scaler-related files to rescale the input data X, and the output Y: '
-                             'for X: "data_scalerx_mean.npy", "data_scalerx_scale.npy", "data_scalerx_var.npy" '
+                             'and scaler-related files to rescale the input data X: '
+                             '"data_scalerx_mean.npy", "data_scalerx_scale.npy", "data_scalerx_var.npy" '
                              '(unpickled numpy-saved arrays), '
                              'and "data_scalerx_n_samples_seen.txt" '
-                             'a text file containing the number of examples in the training set). '
-                             'For Y the file names are the same, just x replaced by y, e.g., "data_scalery_mean.npy".'
+                             'a text file containing the number of examples in the training set).'
                         )
     parser.add_argument('--p', default=0, type=float, help='sampling probability')
     parser.add_argument('--log', default=None, type=str, help="output log file")
@@ -159,10 +59,10 @@ def main():
 
     if not params.sumstats:
         if params.p <= 0 or params.p > 1:
-            raise ValueError('The sampling probability must be grater than 0 and not greater than 1.')
+            raise ValueError('The sampling probability must be between 0 (exclusive) and 1 (inclusive).')
 
         forest = read_forest(params.nwk)
-        print(f'Read a forest of {len(forest)} trees with {sum(len(_) for _ in forest)} tips in total')
+        print(f'Read a tree with {sum(len(_) for _ in forest)} tips.')
        forest_df = forest2sumstat_df(forest, rho=params.p)
     else:
         forest_df = pd.read_csv(params.sumstats)
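
Taken together, the estimation path that survives in 0.1.67 is: read the tree(s), encode them as summary statistics, then run one pretrained Keras model per target parameter. A minimal usage sketch of that path, assuming the package is installed; the input file name is hypothetical:

    from bdeissct_dl import MODEL_PATH
    from bdeissct_dl.bdeissct_model import BD
    from bdeissct_dl.estimator import predict_parameters
    from bdeissct_dl.tree_encoder import forest2sumstat_df
    from bdeissct_dl.tree_manager import read_forest

    forest = read_forest('tree.nwk')  # hypothetical Newick input
    # Encode as summary statistics, assuming a sampling probability of 0.5.
    forest_df = forest2sumstat_df(forest, rho=0.5)
    # One '<model_name>.<column>.keras' model per target column is loaded from model_path.
    estimates = predict_parameters(forest_df, model_name=BD, model_path=MODEL_PATH)
    print(estimates)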
bdeissct_dl/model_serializer.py CHANGED
@@ -1,11 +1,11 @@
 
-import tensorflow as tf
 import os
-import joblib
+
 import numpy as np
+import tensorflow as tf
 from sklearn.preprocessing import StandardScaler
 
-from bdeissct_dl.dl_model import relu_plus_one, half_sigmoid, loss_ct, loss_ss, CTLayer, SSLayer, loss_prob
+from bdeissct_dl.dl_model import relu_plus_one, half_sigmoid
 
 np.random.seed(239)
 tf.random.set_seed(239)
@@ -18,8 +18,7 @@ def save_model_keras(model, path, model_name):
 def load_model_keras(path, model_name):
     tf.keras.config.enable_unsafe_deserialization()
     return tf.keras.models.load_model(os.path.join(path, f'{model_name}.keras'),
-                                      custom_objects={"loss_ct": loss_ct, "loss_ss": loss_ss, "loss_prob": loss_prob, \
-                                                      "relu_plus_one": relu_plus_one, "half_sigmoid": half_sigmoid, "CTLayer": CTLayer, "SSLayer": SSLayer})
+                                      custom_objects={"relu_plus_one": relu_plus_one, "half_sigmoid": half_sigmoid})
 
 def save_model_h5(model, path, model_name):
     model.save(os.path.join(path, f'{model_name}.h5'), overwrite=True, zipped=True)
@@ -38,35 +37,6 @@ def load_model_json(path, model_name):
     model.load_weights(os.path.join(path, f'{model_name}.weights.h5'))
     return model
 
-def save_model_onnx(model, path, model_name):
-    import tf2onnx
-    import onnx
-
-    input_signature = [tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype, name='x')]
-    model.output_names = ['output']
-    onnx_model, _ = tf2onnx.convert.from_keras(model, input_signature=input_signature)
-    onnx.save(onnx_model, os.path.join(path, f'{model_name}.onnx'))
-
-def load_model_onnx(path, model_name):
-    """
-    TODO: this does not work due to onnx vs keras naming issues
-    (keras does not accept slashes in names that onnx creates)
-
-    :param path:
-    :return:
-    """
-    import onnx
-    from onnx2keras import onnx_to_keras
-    onnx_model = onnx.load(os.path.join(path, f'{model_name}.onnx'))
-    return onnx_to_keras(onnx_model, ['x'])
-
-def save_scaler_joblib(scaler, prefix, suffix=''):
-    joblib.dump(scaler, os.path.join(prefix, f'data_scaler{suffix}.gz'))
-
-def load_scaler_joblib(prefix, suffix=''):
-    return joblib.load(os.path.join(prefix, f'data_scaler{suffix}.gz')) \
-        if os.path.exists(os.path.join(prefix, f'data_scaler{suffix}.gz')) else None
-
 def save_scaler_numpy(scaler, prefix, suffix=''):
     np.save(os.path.join(prefix, f'data_scaler{suffix}_mean.npy'), scaler.mean_, allow_pickle=False)
     np.save(os.path.join(prefix, f'data_scaler{suffix}_scale.npy'), scaler.scale_, allow_pickle=False)
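
With the joblib and ONNX serializers removed, a fitted StandardScaler now round-trips only through the numpy files named above (the same file names appear in the estimator's --model_path help text). A sketch of how such a loader can rebuild the scaler; the package's own load_scaler_numpy may differ in details:

    import os

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    def load_scaler(prefix, suffix='x'):
        # Rebuild a fitted StandardScaler from the data_scaler<suffix>_*.npy
        # files; 'prefix' is the (hypothetical) model directory.
        scaler = StandardScaler()
        scaler.mean_ = np.load(os.path.join(prefix, f'data_scaler{suffix}_mean.npy'))
        scaler.scale_ = np.load(os.path.join(prefix, f'data_scaler{suffix}_scale.npy'))
        scaler.var_ = np.load(os.path.join(prefix, f'data_scaler{suffix}_var.npy'))
        with open(os.path.join(prefix, f'data_scaler{suffix}_n_samples_seen.txt')) as f:
            scaler.n_samples_seen_ = int(f.read().strip())
        return scaler

    # scaler.transform(X) then standardizes features exactly as at training time.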
@@ -4,9 +4,8 @@ import pandas as pd
 from sklearn.preprocessing import StandardScaler
 
 from bdeissct_dl import MODEL_PATH
-from bdeissct_dl.bdeissct_model import BD, BDCT, BDEI, BDEICT, \
-    BDSS, BDSSCT, BDEISS, BDEISSCT, TARGET_COLUMNS_BDEISSCT
-from bdeissct_dl.model_serializer import save_scaler_joblib, save_scaler_numpy
+from bdeissct_dl.bdeissct_model import TARGET_COLUMNS_BDEISSCT
+from bdeissct_dl.model_serializer import save_scaler_numpy
 from bdeissct_dl.training import get_data_characteristics
 
 
@@ -28,8 +27,7 @@ def main():
     parser = \
         argparse.ArgumentParser(description="Fit a BD(EI)(SS)(CT) data scaler.")
     parser.add_argument('--train_data', type=str, nargs='+',
-                        default=[f'/home/azhukova/projects/bdeissct_dl/simulations_bdeissct/training/500_1000/{model}/{i}/trees.csv.xz' for i in range(120) for model in [BD, BDCT, BDEI, BDEICT, BDSS, BDSSCT, BDEISS, BDEISSCT]]
-                        ,
+                        # default=[f'/home/azhukova/projects/bdeissct_dl/simulations_bdeissct/training/500_1000/{model}/{i}/trees.csv.xz' for i in range(120) for model in [BD, BDCT, BDEI, BDEICT, BDSS, BDSSCT, BDEISS, BDEISSCT]],
                         help="path to the files where the encoded training data are stored")
     parser.add_argument('--model_path', default=MODEL_PATH, type=str,
                         help="path to the folder where the scaler should be stored.")
@@ -43,7 +41,6 @@ def main():
     fit_scalers(paths=params.train_data, x_indices=x_indices, scaler_x=scaler_x)
 
     if scaler_x is not None:
-        save_scaler_joblib(scaler_x, params.model_path, suffix='x')
         save_scaler_numpy(scaler_x, params.model_path, suffix='x')
 
 
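The fit_scalers call above streams many encoded training files through a single scaler before it is saved. A minimal sketch of that incremental pattern using scikit-learn's partial_fit; the paths and the feature-column list are hypothetical:

    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    def fit_scaler_incrementally(paths, feature_columns):
        # Accumulate mean/variance file by file, never holding all data in memory.
        scaler = StandardScaler()
        for path in paths:  # e.g. compressed CSVs such as trees.csv.xz
            X = pd.read_csv(path)[feature_columns].to_numpy(dtype=float)
            scaler.partial_fit(X)
        return scaler
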
@@ -1,5 +1,5 @@
 from bdeissct_dl import MODEL_PATH
-from bdeissct_dl.training import get_test_data, get_X_columns
+from bdeissct_dl.training import get_test_data, FEATURE_COLUMNS
 from bdeissct_dl.tree_encoder import forest2sumstat_df
 from bdeissct_dl.tree_manager import read_forest
 from bdeissct_dl.model_serializer import load_scaler_numpy
@@ -9,7 +9,7 @@ def check_sumstats(forest_sumstats, model_path=MODEL_PATH):
     scaler_x = load_scaler_numpy(model_path, suffix='x')
     X, SF = get_test_data(dfs=[forest_sumstats], scaler_x=scaler_x)
 
-    feature_columns = get_X_columns(forest_sumstats.columns)
+    feature_columns = FEATURE_COLUMNS
 
     for i in range(len(feature_columns)):
         value = X[0, i]
bdeissct_dl/training.py CHANGED
@@ -6,8 +6,8 @@ import pandas as pd
 import tensorflow as tf
 
 from bdeissct_dl import MODEL_PATH, BATCH_SIZE, EPOCHS
-from bdeissct_dl.bdeissct_model import MODEL2TARGET_COLUMNS, UPSILON, X_C, KAPPA, F_E, F_S, \
-    X_S, TARGET_COLUMNS_BDCT, UPS_X_C, F_S_X_S, REPRODUCTIVE_NUMBER, INFECTION_DURATION, BDEI, LA, PSI, RHO
+from bdeissct_dl.bdeissct_model import MODEL2TARGET_COLUMNS, UPSILON, X_C, KAPPA, INCUBATION_PERIOD, F_S, \
+    X_S, TARGET_COLUMNS_BDCT, REPRODUCTIVE_NUMBER, INFECTION_DURATION
 from bdeissct_dl.dl_model import build_model
 from bdeissct_dl.model_serializer import save_model_keras, load_scaler_numpy, \
     load_model_keras
@@ -16,7 +16,7 @@ from bdeissct_dl.tree_encoder import SCALING_FACTOR, STATS
 FEATURE_COLUMNS = [_ for _ in STATS if _ not in {'n_trees', 'n_tips', 'n_inodes', 'len_forest',
                                                  REPRODUCTIVE_NUMBER, INFECTION_DURATION,
                                                  UPSILON, X_C, KAPPA,
-                                                 F_E,
+                                                 INCUBATION_PERIOD,
                                                  F_S, X_S,
                                                  SCALING_FACTOR}]
 
@@ -29,14 +29,10 @@ def calc_validation_fraction(m):
     return 0.01
 
 
-def get_X_columns(columns):
-    return FEATURE_COLUMNS
-
-
 def get_test_data(dfs=None, paths=None, scaler_x=None):
     if not dfs:
         dfs = [pd.read_csv(path) for path in paths]
-    feature_columns = get_X_columns(dfs[0].columns)
+    feature_columns = FEATURE_COLUMNS
 
     Xs, SFs = [], []
     for df in dfs:
@@ -53,12 +49,11 @@ def get_test_data(dfs=None, paths=None, scaler_x=None):
     return X, SF
 
 
-def get_data_characteristics(paths, target_columns=TARGET_COLUMNS_BDCT, feature_columns=None):
+def get_data_characteristics(paths, target_columns=TARGET_COLUMNS_BDCT, feature_columns=FEATURE_COLUMNS):
     col2index_y = {}
     col2index_x = {}
 
     df = pd.read_csv(paths[0])
-    feature_columns = get_X_columns(df.columns) if feature_columns is None else feature_columns
     feature_column_set = set(feature_columns)
     target_columns = target_columns if target_columns is not None else []
     target_column_set = set(target_columns)
@@ -109,36 +104,21 @@ def get_train_data(target_columns, columns_x, columns_y, file_pattern=None, file
     if INFECTION_DURATION in target_columns:
         train_labels[INFECTION_DURATION] = Y[:, col_i]
         col_i += 1
-    # if UPSILON in target_columns:
-    #     train_labels[UPS_X_C] = Y[:, col_i: (col_i + 2)]
-    #     col_i += 2
     if UPSILON in target_columns:
         train_labels[UPSILON] = Y[:, col_i]
         col_i += 1
     if X_C in target_columns:
         train_labels[X_C] = Y[:, col_i]
         col_i += 1
-    if F_E in target_columns:
-        train_labels[F_E] = Y[:, col_i]
+    if INCUBATION_PERIOD in target_columns:
+        train_labels[INCUBATION_PERIOD] = Y[:, col_i]
         col_i += 1
-    # if F_S in target_columns:
-    #     train_labels[F_S_X_S] = Y[:, col_i: (col_i + 2)]
-    #     col_i += 2
     if F_S in target_columns:
         train_labels[F_S] = Y[:, col_i]
         col_i += 1
     if X_S in target_columns:
         train_labels[X_S] = Y[:, col_i]
         col_i += 1
-    if LA in target_columns:
-        train_labels[LA] = Y[:, col_i]
-        col_i += 1
-    if PSI in target_columns:
-        train_labels[PSI] = Y[:, col_i]
-        col_i += 1
-    if RHO in target_columns:
-        train_labels[RHO] = Y[:, col_i]
-        col_i += 1
 
     dataset = tf.data.Dataset.from_tensor_slices((X, train_labels))
 
@@ -209,9 +189,9 @@ def main():
         print(model.summary())
 
         ds_train = get_train_data([col], x_indices, [y_idx], file_pattern=None, filenames=params.train_data, \
-                                  scaler_x=scaler_x, batch_size=BATCH_SIZE * 8, shuffle=True)
+                                  scaler_x=scaler_x, batch_size=BATCH_SIZE, shuffle=True)
         ds_val = get_train_data([col], x_indices, [y_idx], file_pattern=None, filenames=params.val_data, \
-                                scaler_x=scaler_x, batch_size=BATCH_SIZE * 8, shuffle=True)
+                                scaler_x=scaler_x, batch_size=BATCH_SIZE, shuffle=True)
 
         #early stopping to avoid overfitting
         early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=25)
@@ -220,7 +200,6 @@ def main():
         model.fit(ds_train, verbose=1, epochs=params.epochs, validation_data=ds_val, callbacks=[early_stop])
 
         print(f'Saving the trained model {params.model_name}.{col} to {params.model_path}...')
-
         save_model_keras(model, path=params.model_path, model_name=f'{params.model_name}.{col}')
 
 
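get_train_data hands Keras a dict of named label arrays, one per remaining target column, so a model's named outputs and losses can be matched to the right targets. A self-contained sketch of that tf.data pattern with made-up shapes and column names:

    import numpy as np
    import tensorflow as tf

    # Toy stand-ins for the scaled features X and the label matrix Y (hypothetical).
    X = np.random.rand(32, 4).astype(np.float32)
    Y = np.random.rand(32, 2).astype(np.float32)

    # One named entry per target column, as in get_train_data above.
    train_labels = {'target_a': Y[:, 0], 'target_b': Y[:, 1]}
    dataset = tf.data.Dataset.from_tensor_slices((X, train_labels)).batch(8)

    for features, labels in dataset.take(1):
        print(features.shape, {name: t.shape for name, t in labels.items()})
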
bdeissct_dl/tree_encoder.py CHANGED
@@ -2,7 +2,6 @@ import io
 import os
 from glob import iglob
 
-import numpy as np
 import pandas as pd
 from treesumstats import FeatureCalculator, FeatureRegistry, FeatureManager
 from treesumstats.balance_sumstats import BalanceFeatureCalculator
@@ -13,8 +12,8 @@ from treesumstats.ltt_sumstats import LTTFeatureCalculator
 from treesumstats.subtree_sumstats import SubtreeFeatureCalculator
 from treesumstats.transmission_chain_sumstats import TransmissionChainFeatureCalculator
 
-from bdeissct_dl.bdeissct_model import RHO, LA, PSI, F_E, UPSILON, X_C, KAPPA, F_S, X_S, RATE_PARAMETERS, \
-    TIME_PARAMETERS, PI_E, PI_I, PI_S, PI_EC, PI_IC, PI_SC, LA_AVG, INFECTION_DURATION, REPRODUCTIVE_NUMBER
+from bdeissct_dl.bdeissct_model import RHO, UPSILON, X_C, KAPPA, F_S, X_S, RATE_PARAMETERS, \
+    TIME_PARAMETERS, INFECTION_DURATION, REPRODUCTIVE_NUMBER, INCUBATION_PERIOD
 from bdeissct_dl.tree_manager import read_forest, rescale_forest_to_avg_brlen
 
 TARGET_AVG_BL = 1
@@ -72,14 +71,14 @@ def parse_parameters(log):
         R = df.loc[i, REPRODUCTIVE_NUMBER]
         d = df.loc[i, INFECTION_DURATION]
         rho = df.loc[i, RHO]
-        f_e = df.loc[i, F_E] if F_E in df.columns else 0
+        d_inc = df.loc[i, INCUBATION_PERIOD] if INCUBATION_PERIOD in df.columns else 0
         f_ss = df.loc[i, F_S] if F_S in df.columns else 0
         x_ss = df.loc[i, X_S] if X_S in df.columns else 1
         upsilon = df.loc[i, UPSILON] if UPSILON in df.columns else 0
         x_c = df.loc[i, X_C] if X_C in df.columns else 1
         kappa = df.loc[i, KAPPA] if KAPPA in df.columns else 0
 
-        yield R, d, rho, f_e, f_ss, x_ss, upsilon, x_c, kappa
+        yield R, d, rho, d_inc, f_ss, x_ss, upsilon, x_c, kappa
 
 
 class BDEISSCTFeatureCalculator(FeatureCalculator):
@@ -89,7 +88,7 @@ class BDEISSCTFeatureCalculator(FeatureCalculator):
         pass
 
     def feature_names(self):
-        return [REPRODUCTIVE_NUMBER, INFECTION_DURATION, RHO, F_E, F_S, X_S, UPSILON, X_C, KAPPA, \
+        return [REPRODUCTIVE_NUMBER, INFECTION_DURATION, RHO, INCUBATION_PERIOD, F_S, X_S, UPSILON, X_C, KAPPA, \
                 SCALING_FACTOR]
 
     def set_forest(self, forest, **kwargs):
@@ -99,12 +98,6 @@ class BDEISSCTFeatureCalculator(FeatureCalculator):
         return kwargs[feature_name] if feature_name in kwargs else None
 
     def help(self, feature_name, *args, **kwargs):
-        if LA == feature_name:
-            return 'transmission rate.'
-        if LA_AVG == feature_name:
-            return 'average transmission rate.'
-        if PSI == feature_name:
-            return 'removal rate.'
         if RHO == feature_name:
             return 'sampling probability.'
         if UPSILON == feature_name:
@@ -117,26 +110,14 @@ class BDEISSCTFeatureCalculator(FeatureCalculator):
             return 'super-spreading ratio.'
         if F_S == feature_name:
             return 'fraction of super-spreaders.'
-        if F_E == feature_name:
-            return 'fraction of incubation over total infected-to-removed time.'
-        if PI_E == feature_name:
-            return 'fraction of unnotified exposed individuals'
-        if PI_EC == feature_name:
-            return 'fraction of notified exposed individuals'
-        if PI_I == feature_name:
-            return 'fraction of unnotified infectious regular spreaders'
-        if PI_IC == feature_name:
-            return 'fraction of notified infectious regular spreaders'
-        if PI_S == feature_name:
-            return 'fraction of unnotified infectious superpreaders'
-        if PI_SC == feature_name:
-            return 'fraction of notified infectious superspreaders'
         if SCALING_FACTOR == feature_name:
             return 'tree scaling factor.'
         if REPRODUCTIVE_NUMBER == feature_name:
             return 'reproduction number.'
         if INFECTION_DURATION == feature_name:
             return 'infection duration.'
+        if INCUBATION_PERIOD == feature_name:
+            return 'incubation period.'
         return None
 
 
@@ -252,14 +233,14 @@ TIME_DIFF_STATS = ['time_diff_in_2_real_mean', 'time_diff_in_3L_real_mean', 'tim
 
 EPI_STATS = [REPRODUCTIVE_NUMBER, INFECTION_DURATION, RHO,
              UPSILON, X_C, KAPPA,
-             F_E,
+             INCUBATION_PERIOD,
              F_S, X_S]
 
 STATS = ['n_tips'] \
         + BRLEN_STATS + TIME_STATS + CHAIN_STATS + LTT_STATS + BALANCE_STATS + TOPOLOGY_STATS + TIME_DIFF_STATS \
        + EPI_STATS + [SCALING_FACTOR]
 
-def forest2sumstat_df(forest, rho, R=0, d=0, x_c=0, upsilon=0, kappa=1, f_e=0, f_ss=0, x_ss=1,
+def forest2sumstat_df(forest, rho, R=0, d=0, x_c=0, upsilon=0, kappa=1, d_inc=0, f_ss=0, x_ss=1,
                       target_avg_brlen=TARGET_AVG_BL):
     """
     Rescales the input forest to have mean branch lengths of 1, calculates its summary statistics,
@@ -269,7 +250,7 @@ def forest2sumstat_df(forest, rho, R=0, d=0, x_c=0, upsilon=0, kappa=1, f_e=0, f
     :param x_ss: presumed superspreading ratio (how many times superspreader's transmission rate is higher
         than that of a standard spreader, 1 by default)
     :param f_ss: presumed fraction of superspreaders in the infectious population (0 by default)
-    :param f_e: presumed fraction of incubation over total infected-to-removed time (0 by default)
+    :param d_inc: presumed incubation period length (0 by default)
     :param forest: list(ete3.Tree) forest to encode
     :param rho: presumed sampling probability
     :param upsilon: presumed notification probability
@@ -286,7 +267,7 @@ def forest2sumstat_df(forest, rho, R=0, d=0, x_c=0, upsilon=0, kappa=1, f_e=0, f
 
     kwargs = {SCALING_FACTOR: scaling_factor,
               REPRODUCTIVE_NUMBER: R, INFECTION_DURATION: d, RHO: rho,
-              F_E: f_e,
+              INCUBATION_PERIOD: d_inc,
               F_S: f_ss, X_S: x_ss,
               X_C: x_c, UPSILON: upsilon, KAPPA: kappa}
     scale(kwargs, scaling_factor)
@@ -337,11 +318,11 @@ def save_forests_as_sumstats(output, nwks=None, logs=None, patterns=None, target
     for ps, forest in zip(parameters, forests):
 
         scaling_factor = rescale_forest_to_avg_brlen(forest, target_avg_length=target_avg_brlen)
-        R, d, rho, f_e, f_ss, x_ss, upsilon, x_c, kappa = ps
+        R, d, rho, d_inc, f_ss, x_ss, upsilon, x_c, kappa = ps
         kwargs = {SCALING_FACTOR: scaling_factor}
         kwargs[REPRODUCTIVE_NUMBER], kwargs[INFECTION_DURATION], kwargs[RHO] = R, d, rho
         kwargs[UPSILON], kwargs[KAPPA], kwargs[X_C] = upsilon, kappa, x_c
-        kwargs[F_E] = f_e
+        kwargs[INCUBATION_PERIOD] = d_inc
         kwargs[F_S], kwargs[X_S] = f_ss, x_ss
 
         scale(kwargs, scaling_factor)
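
Both sumstat-producing paths above first rescale each forest so that its average branch length equals TARGET_AVG_BL and record the factor, which is what later lets time-valued estimates be scaled back to the original units. A sketch of the rescaling idea only, not the package's rescale_forest_to_avg_brlen, assuming ete3-style trees whose nodes carry a dist branch length:

    def rescale_to_avg_brlen(forest, target_avg_length=1.0):
        # Multiply every branch length by one factor so that the mean branch
        # length becomes target_avg_length; return the factor that was applied.
        branches = [node.dist for tree in forest for node in tree.traverse()]
        scaling_factor = target_avg_length * len(branches) / sum(branches)
        for tree in forest:
            for node in tree.traverse():
                node.dist *= scaling_factor
        return scaling_factor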