PyEvoMotion 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyEvoMotion/cli.py +87 -3
- PyEvoMotion/core/base.py +296 -20
- PyEvoMotion/core/core.py +73 -24
- {pyevomotion-0.1.1.dist-info → pyevomotion-0.1.2.dist-info}/METADATA +1 -1
- pyevomotion-0.1.2.dist-info/RECORD +35 -0
- share/analyze_model_selection_accuracy.py +316 -0
- share/analyze_test_runs.py +436 -0
- share/anomalous_diffusion.pdf +0 -0
- share/confusion_matrix_heatmap.pdf +0 -0
- share/figUK_plots.pdf +0 -0
- share/figUK_regression_results.json +54 -7
- share/figUK_run_args.json +1 -0
- share/figUK_stats.tsv +41 -41
- share/figUSA_plots.pdf +0 -0
- share/figUSA_regression_results.json +54 -7
- share/figUSA_run_args.json +1 -0
- share/figUSA_stats.tsv +34 -34
- share/generate_sequences_from_test5_data.py +107 -0
- share/manuscript_figure.py +450 -80
- share/run_parallel_analysis.py +196 -0
- share/synth_figure.pdf +0 -0
- share/uk_time_windows.pdf +0 -0
- share/weekly_size.pdf +0 -0
- pyevomotion-0.1.1.dist-info/RECORD +0 -31
- share/figure.pdf +0 -0
- {pyevomotion-0.1.1.dist-info → pyevomotion-0.1.2.dist-info}/WHEEL +0 -0
- {pyevomotion-0.1.1.dist-info → pyevomotion-0.1.2.dist-info}/entry_points.txt +0 -0
share/figUSA_run_args.json
CHANGED
share/figUSA_stats.tsv
CHANGED
|
@@ -1,34 +1,34 @@
|
|
|
1
|
-
date mean number of mutations var number of mutations size
|
|
2
|
-
2020-12-17 34.666666666666664 5.066666666666665 6
|
|
3
|
-
2020-12-24 33.976190476190474 3.7311265969802596 42
|
|
4
|
-
2020-12-31 34.19230769230769 2.546953046953047 78
|
|
5
|
-
2021-01-07 35.15238095238095 5.4188644688644745 105
|
|
6
|
-
2021-01-14 36.39887640449438 4.094236018536149 178
|
|
7
|
-
2021-01-21 37.67487684729064 4.557137979807834 203
|
|
8
|
-
2021-01-28 38.28421052631579 4.2745737583395 285
|
|
9
|
-
2021-02-04 38.86852589641434 4.146645418326694 251
|
|
10
|
-
2021-02-11 39.538699690402474 4.584677807049599 323
|
|
11
|
-
2021-02-18 40.1993769470405 5.741374610591899 321
|
|
12
|
-
2021-02-25 40.53168044077135 5.183385842351184 363
|
|
13
|
-
2021-03-04 40.43465045592705 7.520868856104973 329
|
|
14
|
-
2021-03-11 41.03508771929825 6.573545300200646 342
|
|
15
|
-
2021-03-18 41.24050632911393 7.756730707447153 395
|
|
16
|
-
2021-03-25 41.31578947368421 6.91238966982674 323
|
|
17
|
-
2021-04-01 41.463611859838274 6.357456108399504 371
|
|
18
|
-
2021-04-08 41.80653266331658 8.771040340240246 398
|
|
19
|
-
2021-04-15 42.03133903133903 8.596157916157916 351
|
|
20
|
-
2021-04-22 42.390243902439025 7.597295864262986 369
|
|
21
|
-
2021-04-29 42.429752066115704 7.4004383361490325 363
|
|
22
|
-
2021-05-06 42.70674486803519 7.531395549422115 341
|
|
23
|
-
2021-05-13 42.96060606060606 9.405738233397797 330
|
|
24
|
-
2021-05-20 43.21902017291066 11.183105395545644 347
|
|
25
|
-
2021-05-27 43.90149253731343 11.154937885423184 335
|
|
26
|
-
2021-06-03 43.76347305389221 11.958902015788244 334
|
|
27
|
-
2021-06-10 44.061624649859944 11.277090611525502 357
|
|
28
|
-
2021-06-17 44.22287390029326 11.103122304640328 341
|
|
29
|
-
2021-06-24 44.62145110410095 9.539791558519338 317
|
|
30
|
-
2021-07-01 44.512121212121215 11.460338951828312 330
|
|
31
|
-
2021-07-08 44.526490066225165 8.841488636113615 302
|
|
32
|
-
2021-07-15 44.91830065359477 9.275270545376623 306
|
|
33
|
-
2021-07-22 44.656462585034014 9.31161338255439 294
|
|
34
|
-
2021-07-29 45.55797101449275 9.13895059769385 138
|
|
1
|
+
date mean number of mutations var number of mutations size dt_idx
|
|
2
|
+
2020-12-17 34.666666666666664 5.066666666666665 6 0.0
|
|
3
|
+
2020-12-24 33.976190476190474 3.7311265969802596 42 1.0
|
|
4
|
+
2020-12-31 34.19230769230769 2.546953046953047 78 2.0
|
|
5
|
+
2021-01-07 35.15238095238095 5.4188644688644745 105 3.0
|
|
6
|
+
2021-01-14 36.39887640449438 4.094236018536149 178 4.0
|
|
7
|
+
2021-01-21 37.67487684729064 4.557137979807834 203 5.0
|
|
8
|
+
2021-01-28 38.28421052631579 4.2745737583395 285 6.0
|
|
9
|
+
2021-02-04 38.86852589641434 4.146645418326694 251 7.0
|
|
10
|
+
2021-02-11 39.538699690402474 4.584677807049599 323 8.0
|
|
11
|
+
2021-02-18 40.1993769470405 5.741374610591899 321 9.0
|
|
12
|
+
2021-02-25 40.53168044077135 5.183385842351184 363 10.0
|
|
13
|
+
2021-03-04 40.43465045592705 7.520868856104973 329 11.0
|
|
14
|
+
2021-03-11 41.03508771929825 6.573545300200646 342 12.0
|
|
15
|
+
2021-03-18 41.24050632911393 7.756730707447153 395 13.0
|
|
16
|
+
2021-03-25 41.31578947368421 6.91238966982674 323 14.0
|
|
17
|
+
2021-04-01 41.463611859838274 6.357456108399504 371 15.0
|
|
18
|
+
2021-04-08 41.80653266331658 8.771040340240246 398 16.0
|
|
19
|
+
2021-04-15 42.03133903133903 8.596157916157916 351 17.0
|
|
20
|
+
2021-04-22 42.390243902439025 7.597295864262986 369 18.0
|
|
21
|
+
2021-04-29 42.429752066115704 7.4004383361490325 363 19.0
|
|
22
|
+
2021-05-06 42.70674486803519 7.531395549422115 341 20.0
|
|
23
|
+
2021-05-13 42.96060606060606 9.405738233397797 330 21.0
|
|
24
|
+
2021-05-20 43.21902017291066 11.183105395545644 347 22.0
|
|
25
|
+
2021-05-27 43.90149253731343 11.154937885423184 335 23.0
|
|
26
|
+
2021-06-03 43.76347305389221 11.958902015788244 334 24.0
|
|
27
|
+
2021-06-10 44.061624649859944 11.277090611525502 357 25.0
|
|
28
|
+
2021-06-17 44.22287390029326 11.103122304640328 341 26.0
|
|
29
|
+
2021-06-24 44.62145110410095 9.539791558519338 317 27.0
|
|
30
|
+
2021-07-01 44.512121212121215 11.460338951828312 330 28.0
|
|
31
|
+
2021-07-08 44.526490066225165 8.841488636113615 302 29.0
|
|
32
|
+
2021-07-15 44.91830065359477 9.275270545376623 306 30.0
|
|
33
|
+
2021-07-22 44.656462585034014 9.31161338255439 294 31.0
|
|
34
|
+
2021-07-29 45.55797101449275 9.13895059769385 138 32.0
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
#´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
|
|
2
|
+
# IMPORTS #
|
|
3
|
+
#.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from Bio import SeqIO
|
|
10
|
+
from Bio.SeqRecord import SeqRecord
|
|
11
|
+
|
|
12
|
+
#´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
|
|
13
|
+
# CONSTANTS #
|
|
14
|
+
#.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
|
|
15
|
+
|
|
16
|
+
# FASTA file whose first record is taken as the Wuhan reference sequence.
WUHAN_REF_PATH = "tests/data/test3/test3UK.fasta"
# Integer index (0..40) -> date, one entry every 7 days from 2020-01-01.
DATE_MAP = dict(enumerate(
    pd.date_range(start="2020-01-01", periods=41, freq="7D")
))
# Bases a mutated position can be replaced with.
NUCLEOTIDES = {"A", "C", "G", "T"}
# Directory whose subdirectories are scanned for .txt inputs.
TEST5_BASE_PATH = "tests/data/test5"
|
|
23
|
+
|
|
24
|
+
#´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
|
|
25
|
+
# FUNCTIONS #
|
|
26
|
+
#.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
|
|
27
|
+
|
|
28
|
+
def load_synthdata(path: str) -> pd.DataFrame:
    """Read the tab-separated synthdata table stored at ``path``.

    The first column of the file becomes the index and the remaining
    columns are renamed positionally to ``seq0``, ``seq1``, ...
    """
    table = pd.read_csv(path, sep="\t", index_col=0)
    table.columns = [f"seq{position}" for position, _ in enumerate(table.columns)]
    return table
|
|
36
|
+
|
|
37
|
+
def get_wuhan_ref() -> SeqRecord:
    """Return the Wuhan reference sequence.

    The reference is taken to be the first record of the FASTA file at
    ``WUHAN_REF_PATH``; any further records in the file are ignored.

    :return: The first record of the file.
    :raises ValueError: If the file contains no FASTA records.
    """
    # We know the Wuhan sequence is the first one in the file
    with open(WUHAN_REF_PATH, "r") as f:
        record = next(iter(SeqIO.parse(f, "fasta")), None)

    # Fail here rather than handing ``None`` to callers that go on to
    # read ``.id`` / ``.seq`` from the result.
    if record is None:
        raise ValueError(f"No FASTA records found in {WUHAN_REF_PATH!r}")
    return record
|
|
44
|
+
|
|
45
|
+
def create_synthetic_sequence(mut_num: int, wuhan_ref: SeqRecord) -> str:
    """Create a synthetic sequence by mutating the Wuhan reference sequence.

    ``mut_num`` distinct positions are drawn without replacement, and the
    character at each one is replaced by a member of ``NUCLEOTIDES`` that
    differs from it. Randomness comes from ``np.random``.

    :param mut_num: Number of positions to mutate.
    :param wuhan_ref: Record whose ``seq`` is used as the template.
    :return: The mutated sequence as a plain string.
    """
    _seq = list(wuhan_ref.seq)
    # An int population is equivalent to passing range(len(_seq)).
    mut_positions = np.random.choice(len(_seq), mut_num, replace=False)
    for pos in mut_positions:
        # Positions are distinct, so mutating in place cannot affect the
        # candidates of a later position. The candidates are sorted because
        # the iteration order of a set of strings changes between
        # interpreter runs (hash randomisation), which would make the
        # output irreproducible even with a seeded NumPy generator.
        _seq[pos] = np.random.choice(sorted(NUCLEOTIDES - {_seq[pos]}))
    return "".join(_seq)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def create_synthetic_sequences(output_path: str, synthdata: pd.DataFrame, wuhan_ref: SeqRecord) -> None:
    """Write the synthetic dataset built from ``synthdata`` and the reference.

    Two files are produced: ``{output_path}.fasta`` with the sequences and
    ``{output_path}.tsv`` with one ``id``/``date`` line per sequence. The
    reference record is written first, dated 2019-12-29. Every entry of
    ``synthdata`` is then passed to ``create_synthetic_sequence`` as the
    number of mutations, and its row label is looked up in ``DATE_MAP``.
    """
    fasta_path = f"{output_path}.fasta"
    meta_path = f"{output_path}.tsv"
    with open(fasta_path, "w") as fasta_out, open(meta_path, "w") as meta_out:
        fasta_out.write(f">{wuhan_ref.id}\n{wuhan_ref.seq}\n")
        meta_out.write(f"id\tdate\n{wuhan_ref.id}\t2019-12-29\n")
        for row_idx, row in synthdata.iterrows():
            for col_name, n_mutations in row.items():
                # The identifier combines the column name and the row label.
                seq_id = f"{col_name}_{row_idx}"
                sequence = create_synthetic_sequence(n_mutations, wuhan_ref)
                fasta_out.write(f">{seq_id}\n{sequence}\n")
                meta_out.write(f"{seq_id}\t{DATE_MAP[row_idx].strftime('%Y-%m-%d')}\n")
|
|
70
|
+
|
|
71
|
+
def process_test5_files():
    """Process all .txt files in test5 subdirectories and generate corresponding .fasta and .tsv files.

    Every ``<name>.txt`` found in a subdirectory of ``TEST5_BASE_PATH`` is
    loaded with ``load_synthdata`` and handed to
    ``create_synthetic_sequences``, which writes ``<name>.fasta`` and
    ``<name>.tsv`` next to it.
    """
    wuhan_ref = get_wuhan_ref()

    # Sorted listings give a stable processing order across file systems.
    for subdir in sorted(os.listdir(TEST5_BASE_PATH)):
        subdir_path = os.path.join(TEST5_BASE_PATH, subdir)
        # Plain files sitting next to the subdirectories would make
        # os.listdir() raise NotADirectoryError.
        if not os.path.isdir(subdir_path):
            continue

        for txt_file in sorted(os.listdir(subdir_path)):
            if not txt_file.endswith(".txt"):
                continue

            # str.strip(".txt") removes any leading/trailing '.', 't' and
            # 'x' characters (e.g. "test1.txt" -> "est1"), not the suffix;
            # splitext drops only the extension.
            name = os.path.splitext(txt_file)[0]
            create_synthetic_sequences(
                os.path.join(subdir_path, name),
                load_synthdata(os.path.join(subdir_path, txt_file)),
                wuhan_ref,
            )
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
#´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
|
|
96
|
+
# MAIN #
|
|
97
|
+
#.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
|
|
98
|
+
|
|
99
|
+
def main():
    """Generate the synthetic datasets for every test5 input, reporting progress.
    """
    start_message = "Processing all .txt files in test5 subdirectories..."
    done_message = "All synthetic datasets created successfully."

    print(start_message)
    process_test5_files()
    print(done_message)


# Run the generation only when the file is executed as a script.
if __name__ == "__main__":
    main()
|