PyEvoMotion 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyEvoMotion/cli.py +88 -11
- PyEvoMotion/core/base.py +373 -34
- PyEvoMotion/core/core.py +136 -43
- PyEvoMotion/core/parser.py +4 -1
- {pyevomotion-0.1.0.dist-info → pyevomotion-0.1.2.dist-info}/METADATA +72 -4
- pyevomotion-0.1.2.dist-info/RECORD +35 -0
- share/analyze_model_selection_accuracy.py +316 -0
- share/analyze_test_runs.py +436 -0
- share/anomalous_diffusion.pdf +0 -0
- share/confusion_matrix_heatmap.pdf +0 -0
- share/figUK.tsv +9949 -0
- share/figUK_plots.pdf +0 -0
- share/figUK_regression_results.json +65 -0
- share/figUK_run_args.json +14 -0
- share/figUK_stats.tsv +41 -0
- share/figUSA.tsv +9470 -0
- share/figUSA_plots.pdf +0 -0
- share/figUSA_regression_results.json +65 -0
- share/figUSA_run_args.json +14 -0
- share/figUSA_stats.tsv +34 -0
- share/figdataUK.tsv +10001 -0
- share/figdataUSA.tsv +10001 -0
- share/generate_sequences_from_synthdata.py +85 -0
- share/generate_sequences_from_test5_data.py +107 -0
- share/manuscript_figure.py +858 -43
- share/run_parallel_analysis.py +196 -0
- share/synth_figure.pdf +0 -0
- share/uk_time_windows.pdf +0 -0
- share/weekly_size.pdf +0 -0
- pyevomotion-0.1.0.dist-info/RECORD +0 -13
- {pyevomotion-0.1.0.dist-info → pyevomotion-0.1.2.dist-info}/WHEEL +0 -0
- {pyevomotion-0.1.0.dist-info → pyevomotion-0.1.2.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
#´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
|
|
2
|
+
# IMPORTS #
|
|
3
|
+
#.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from Bio import SeqIO
|
|
9
|
+
from Bio.SeqRecord import SeqRecord
|
|
10
|
+
|
|
11
|
+
#´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
|
|
12
|
+
# CONSTANTS #
|
|
13
|
+
#.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
|
|
14
|
+
|
|
15
|
+
# FASTA file read by get_wuhan_ref(); its first record is used as the reference.
WUHAN_REF_PATH = "tests/data/test3/test3UK.fasta"

# Integer keys 0..40 mapped to 41 timestamps spaced 7 days apart,
# starting at 2020-01-01.
DATE_MAP = dict(enumerate(
    pd.date_range(start="2020-01-01", periods=41, freq="7D")
))

# Bases that a substituted position may be drawn from.
NUCLEOTIDES = {"A", "C", "G", "T"}
|
|
21
|
+
|
|
22
|
+
#´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
|
|
23
|
+
# FUNCTIONS #
|
|
24
|
+
#.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
|
|
25
|
+
|
|
26
|
+
def load_synthdata(path: str) -> pd.DataFrame:
    """Read a tab-separated synthdata table and relabel its columns.

    The first column of the file becomes the index. The remaining columns
    are renamed by position to ``seq0``, ``seq1``, ... regardless of the
    header they had in the file.

    :param path: Path of the tab-separated file to read.
    :return: The loaded table with positionally renamed columns.
    """
    table = pd.read_csv(path, sep="\t", index_col=0)
    renamed = [f"seq{position}" for position, _ in enumerate(table.columns)]
    table.columns = renamed
    return table
|
|
34
|
+
|
|
35
|
+
def get_wuhan_ref() -> SeqRecord:
    """Return the Wuhan reference sequence.

    The reference is taken to be the first record of the FASTA file at
    ``WUHAN_REF_PATH``; any later records are never read.

    :return: The first record parsed from ``WUHAN_REF_PATH``.
    :raises ValueError: If the file contains no FASTA records.
    """
    with open(WUHAN_REF_PATH, "r") as handle:
        # Only the first record is needed, so take a single item from the
        # parser rather than looping over it.
        first_record = next(SeqIO.parse(handle, "fasta"), None)

    if first_record is None:
        # Fail here with a clear message instead of returning None and
        # breaking later when the caller accesses ``.seq`` / ``.id``.
        raise ValueError(f"No FASTA records found in {WUHAN_REF_PATH!r}")
    return first_record
|
|
42
|
+
|
|
43
|
+
def create_synthetic_sequence(mut_num: int, wuhan_ref: SeqRecord) -> str:
    """Create a synthetic sequence by substituting bases in the reference.

    ``mut_num`` distinct positions are drawn without replacement, and the
    base at each of them is replaced by a different base drawn from
    ``NUCLEOTIDES``. The reference record itself is not modified.

    :param mut_num: Number of positions to substitute.
    :param wuhan_ref: Record whose ``seq`` is used as the template.
    :return: The mutated sequence as a plain string.
    :raises ValueError: Propagated from ``np.random.choice`` when more
        positions are requested than the sequence has.
    """
    bases = list(wuhan_ref.seq)
    mut_positions = np.random.choice(len(bases), mut_num, replace=False)

    for pos in mut_positions:
        # Sort the candidates before drawing: iteration order of a set of
        # strings changes between interpreter runs (hash randomization), so
        # an unsorted list would make output irreproducible even with a
        # fixed NumPy seed.
        candidates = sorted(NUCLEOTIDES - {bases[pos]})
        # Positions are distinct, so substituting immediately cannot affect
        # the candidates of a later position.
        bases[pos] = np.random.choice(candidates)

    return "".join(bases)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def create_synthetic_sequences(output_path: str, synthdata: pd.DataFrame, wuhan_ref: SeqRecord) -> None:
    """Write a synthetic FASTA file together with its metadata TSV.

    Two files are created, ``<output_path>.fasta`` and ``<output_path>.tsv``.
    Each starts with the reference record (dated ``2019-12-29`` in the TSV).
    Then, for every cell of ``synthdata``, one synthetic sequence is written
    with id ``<column>_<row label>``; the cell value is passed on as the
    number of mutations and the row label is looked up in ``DATE_MAP`` to
    obtain the date.

    :param output_path: Output path without extension.
    :param synthdata: Table of per-sequence mutation counts.
    :param wuhan_ref: Reference record the synthetic sequences derive from.
    """
    fasta_path = f"{output_path}.fasta"
    tsv_path = f"{output_path}.tsv"

    with open(fasta_path, "w") as fasta_out, open(tsv_path, "w") as tsv_out:
        # Reference entry comes first in both files.
        fasta_out.write(f">{wuhan_ref.id}\n{wuhan_ref.seq}\n")
        tsv_out.write(f"id\tdate\n{wuhan_ref.id}\t2019-12-29\n")

        for row_label, row_values in synthdata.iterrows():
            for seq_name, n_mutations in row_values.items():
                record_id = f"{seq_name}_{row_label}"
                sequence = create_synthetic_sequence(n_mutations, wuhan_ref)
                fasta_out.write(f">{record_id}\n{sequence}\n")
                date_text = DATE_MAP[row_label].strftime('%Y-%m-%d')
                tsv_out.write(f"{record_id}\t{date_text}\n")
|
|
68
|
+
|
|
69
|
+
#´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
|
|
70
|
+
# MAIN #
|
|
71
|
+
#.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
|
|
72
|
+
|
|
73
|
+
def main():
    """Generate the FASTA/TSV pair for each test4 synthdata table.

    The reference sequence is loaded once and reused for both datasets;
    outputs are written to the current directory as ``<name>.fasta`` and
    ``<name>.tsv``.
    """
    reference = get_wuhan_ref()
    dataset_names = ("synthdata1", "synthdata2")

    for dataset in dataset_names:
        table = load_synthdata(f"tests/data/test4/{dataset}.txt")
        create_synthetic_sequences(f"{dataset}", table, reference)

    print("Synthetic datasets created successfully.")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
#´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
|
|
2
|
+
# IMPORTS #
|
|
3
|
+
#.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from Bio import SeqIO
|
|
10
|
+
from Bio.SeqRecord import SeqRecord
|
|
11
|
+
|
|
12
|
+
#´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
|
|
13
|
+
# CONSTANTS #
|
|
14
|
+
#.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
|
|
15
|
+
|
|
16
|
+
# FASTA file read by get_wuhan_ref(); its first record is used as the reference.
WUHAN_REF_PATH = "tests/data/test3/test3UK.fasta"

# Integer keys 0..40 mapped to 41 timestamps spaced 7 days apart,
# starting at 2020-01-01.
DATE_MAP = dict(enumerate(
    pd.date_range(start="2020-01-01", periods=41, freq="7D")
))

# Bases that a substituted position may be drawn from.
NUCLEOTIDES = {"A", "C", "G", "T"}

# Directory whose subdirectories are scanned by process_test5_files().
TEST5_BASE_PATH = "tests/data/test5"
|
|
23
|
+
|
|
24
|
+
#´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
|
|
25
|
+
# FUNCTIONS #
|
|
26
|
+
#.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
|
|
27
|
+
|
|
28
|
+
def load_synthdata(path: str) -> pd.DataFrame:
    """Read a tab-separated synthdata table and relabel its columns.

    The first column of the file becomes the index. The remaining columns
    are renamed by position to ``seq0``, ``seq1``, ... regardless of the
    header they had in the file.

    :param path: Path of the tab-separated file to read.
    :return: The loaded table with positionally renamed columns.
    """
    table = pd.read_csv(path, sep="\t", index_col=0)
    renamed = [f"seq{position}" for position, _ in enumerate(table.columns)]
    table.columns = renamed
    return table
|
|
36
|
+
|
|
37
|
+
def get_wuhan_ref() -> SeqRecord:
    """Return the Wuhan reference sequence.

    The reference is taken to be the first record of the FASTA file at
    ``WUHAN_REF_PATH``; any later records are never read.

    :return: The first record parsed from ``WUHAN_REF_PATH``.
    :raises ValueError: If the file contains no FASTA records.
    """
    with open(WUHAN_REF_PATH, "r") as handle:
        # Only the first record is needed, so take a single item from the
        # parser rather than looping over it.
        first_record = next(SeqIO.parse(handle, "fasta"), None)

    if first_record is None:
        # Fail here with a clear message instead of returning None and
        # breaking later when the caller accesses ``.seq`` / ``.id``.
        raise ValueError(f"No FASTA records found in {WUHAN_REF_PATH!r}")
    return first_record
|
|
44
|
+
|
|
45
|
+
def create_synthetic_sequence(mut_num: int, wuhan_ref: SeqRecord) -> str:
    """Create a synthetic sequence by substituting bases in the reference.

    ``mut_num`` distinct positions are drawn without replacement, and the
    base at each of them is replaced by a different base drawn from
    ``NUCLEOTIDES``. The reference record itself is not modified.

    :param mut_num: Number of positions to substitute.
    :param wuhan_ref: Record whose ``seq`` is used as the template.
    :return: The mutated sequence as a plain string.
    :raises ValueError: Propagated from ``np.random.choice`` when more
        positions are requested than the sequence has.
    """
    bases = list(wuhan_ref.seq)
    mut_positions = np.random.choice(len(bases), mut_num, replace=False)

    for pos in mut_positions:
        # Sort the candidates before drawing: iteration order of a set of
        # strings changes between interpreter runs (hash randomization), so
        # an unsorted list would make output irreproducible even with a
        # fixed NumPy seed.
        candidates = sorted(NUCLEOTIDES - {bases[pos]})
        # Positions are distinct, so substituting immediately cannot affect
        # the candidates of a later position.
        bases[pos] = np.random.choice(candidates)

    return "".join(bases)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def create_synthetic_sequences(output_path: str, synthdata: pd.DataFrame, wuhan_ref: SeqRecord) -> None:
    """Write a synthetic FASTA file together with its metadata TSV.

    Two files are created, ``<output_path>.fasta`` and ``<output_path>.tsv``.
    Each starts with the reference record (dated ``2019-12-29`` in the TSV).
    Then, for every cell of ``synthdata``, one synthetic sequence is written
    with id ``<column>_<row label>``; the cell value is passed on as the
    number of mutations and the row label is looked up in ``DATE_MAP`` to
    obtain the date.

    :param output_path: Output path without extension.
    :param synthdata: Table of per-sequence mutation counts.
    :param wuhan_ref: Reference record the synthetic sequences derive from.
    """
    fasta_path = f"{output_path}.fasta"
    tsv_path = f"{output_path}.tsv"

    with open(fasta_path, "w") as fasta_out, open(tsv_path, "w") as tsv_out:
        # Reference entry comes first in both files.
        fasta_out.write(f">{wuhan_ref.id}\n{wuhan_ref.seq}\n")
        tsv_out.write(f"id\tdate\n{wuhan_ref.id}\t2019-12-29\n")

        for row_label, row_values in synthdata.iterrows():
            for seq_name, n_mutations in row_values.items():
                record_id = f"{seq_name}_{row_label}"
                sequence = create_synthetic_sequence(n_mutations, wuhan_ref)
                fasta_out.write(f">{record_id}\n{sequence}\n")
                date_text = DATE_MAP[row_label].strftime('%Y-%m-%d')
                tsv_out.write(f"{record_id}\t{date_text}\n")
|
|
70
|
+
|
|
71
|
+
def process_test5_files():
    """Generate .fasta and .tsv files for every .txt file under test5.

    Each subdirectory of ``TEST5_BASE_PATH`` is scanned for files ending in
    ``.txt``. Every such file is loaded with ``load_synthdata`` and turned
    into ``<stem>.fasta`` / ``<stem>.tsv`` next to it, where ``<stem>`` is
    the file name without its ``.txt`` suffix. Entries of
    ``TEST5_BASE_PATH`` that are not directories are skipped.
    """
    wuhan_ref = get_wuhan_ref()
    suffix = ".txt"

    for subdir in os.listdir(TEST5_BASE_PATH):
        subdir_path = f"{TEST5_BASE_PATH}/{subdir}"
        # A stray file in the base directory would make os.listdir raise
        # NotADirectoryError; only real subdirectories are processed.
        if not os.path.isdir(subdir_path):
            continue

        txt_files = [
            file
            for file in os.listdir(subdir_path)
            if file.endswith(suffix)
        ]

        for txt_file in txt_files:
            # Drop exactly the trailing ".txt". str.strip(".txt") is wrong
            # here: it removes any leading/trailing '.', 't' or 'x'
            # characters (e.g. "test1.txt" -> "est1").
            name = txt_file[:-len(suffix)]
            create_synthetic_sequences(
                f"{subdir_path}/{name}",
                load_synthdata(f"{subdir_path}/{txt_file}"),
                wuhan_ref,
            )
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
#´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
|
|
96
|
+
# MAIN #
|
|
97
|
+
#.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
|
|
98
|
+
|
|
99
|
+
def main():
    """Run the test5 conversion, announcing start and completion on stdout.
    """
    start_message = "Processing all .txt files in test5 subdirectories..."
    done_message = "All synthetic datasets created successfully."

    print(start_message)
    process_test5_files()
    print(done_message)


if __name__ == "__main__":
    main()
|