PyEvoMotion 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,85 @@
1
+ #´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
2
+ # IMPORTS #
3
+ #.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from Bio import SeqIO
9
+ from Bio.SeqRecord import SeqRecord
10
+
11
+ #´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
12
+ # CONSTANTS #
13
+ #.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
14
+
15
# FASTA file whose first record is used as the reference sequence
# (resolved relative to the current working directory).
WUHAN_REF_PATH = "tests/data/test3/test3UK.fasta"

# Integer keys 0..40 mapped to 41 dates spaced 7 days apart, the first one
# being 2020-01-01. Looked up with the row index of the synthdata table.
DATE_MAP = dict(enumerate(
    pd.date_range(start="2020-01-01", periods=41, freq="7D")
))

# Alphabet from which substituted bases are drawn.
NUCLEOTIDES = {"A", "C", "G", "T"}
21
+
22
+ #´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
23
+ # FUNCTIONS #
24
+ #.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
25
+
26
def load_synthdata(path: str) -> pd.DataFrame:
    """Read a tab-separated table and relabel its columns.

    The first column of the file becomes the index. The remaining columns
    are renamed ``seq0``, ``seq1``, ... in their original order.
    """
    table = pd.read_csv(path, sep="\t", index_col=0)
    table.columns = [
        "seq" + str(position)
        for position, _ in enumerate(table.columns)
    ]
    return table
34
+
35
def get_wuhan_ref() -> SeqRecord:
    """Return the first record of the FASTA file at ``WUHAN_REF_PATH``.

    The reference sequence is expected to be the first entry of that file,
    so only that record is parsed.

    Raises:
        ValueError: If the file contains no FASTA records. Previously the
            function fell off the end and returned ``None``, which only
            surfaced later as an ``AttributeError`` in the callers.
    """
    with open(WUHAN_REF_PATH, "r") as handle:
        # Only the first record is needed; stop parsing right after it.
        first_record = next(SeqIO.parse(handle, "fasta"), None)

    if first_record is None:
        raise ValueError(f"No FASTA records found in {WUHAN_REF_PATH!r}")
    return first_record
42
+
43
def create_synthetic_sequence(mut_num: int, wuhan_ref: SeqRecord) -> str:
    """Return a copy of the reference sequence with ``mut_num`` substitutions.

    ``mut_num`` distinct positions are drawn without replacement, and the
    base at each of them is replaced by a different base from
    ``NUCLEOTIDES``.

    Args:
        mut_num: Number of positions to mutate. NumPy raises ``ValueError``
            if it exceeds the length of the reference sequence.
        wuhan_ref: Record whose ``seq`` attribute holds the reference.

    Returns:
        The mutated sequence as a plain string.
    """
    bases = list(wuhan_ref.seq)
    # An integer population is equivalent to sampling from range(len(bases)).
    positions = np.random.choice(len(bases), mut_num, replace=False)

    for pos in positions:
        # Sorted candidates: the iteration order of a set of strings changes
        # between interpreter runs, which would make the output
        # irreproducible even under a fixed ``np.random.seed``.
        # Upper-casing keeps a lower-case reference base from being
        # "mutated" into its own upper-case form.
        candidates = sorted(NUCLEOTIDES - {bases[pos].upper()})
        bases[pos] = np.random.choice(candidates)

    return "".join(bases)
55
+
56
+
57
def create_synthetic_sequences(output_path: str, synthdata: pd.DataFrame, wuhan_ref: SeqRecord) -> None:
    """Write a synthetic FASTA file together with its date table.

    Two files are produced: ``<output_path>.fasta`` and
    ``<output_path>.tsv``. Both start with the reference record, which is
    dated 2019-12-29 in the table. Then, for every cell of ``synthdata``,
    one sequence named ``<column>_<row index>`` is generated with as many
    mutations as the cell value and dated with ``DATE_MAP[<row index>]``.
    """
    fasta_name = f"{output_path}.fasta"
    table_name = f"{output_path}.tsv"

    with open(fasta_name, "w") as fasta_out, open(table_name, "w") as table_out:
        # The unmodified reference always comes first in both files.
        fasta_out.write(f">{wuhan_ref.id}\n{wuhan_ref.seq}\n")
        table_out.write(f"id\tdate\n{wuhan_ref.id}\t2019-12-29\n")

        for row_label, counts in synthdata.iterrows():
            for column, n_mutations in counts.items():
                seq_id = f"{column}_{row_label}"
                mutated = create_synthetic_sequence(n_mutations, wuhan_ref)
                fasta_out.write(f">{seq_id}\n{mutated}\n")
                stamp = DATE_MAP[row_label].strftime('%Y-%m-%d')
                table_out.write(f"{seq_id}\t{stamp}\n")
68
+
69
+ #´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
70
+ # MAIN #
71
+ #.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
72
+
73
def main():
    """Generate the ``synthdata1`` and ``synthdata2`` synthetic datasets.

    Each dataset is read from ``tests/data/test4/<name>.txt`` and written to
    ``<name>.fasta`` / ``<name>.tsv`` in the current working directory.
    """
    reference = get_wuhan_ref()

    for dataset in ("synthdata1", "synthdata2"):
        counts = load_synthdata(f"tests/data/test4/{dataset}.txt")
        create_synthetic_sequences(f"{dataset}", counts, reference)

    print("Synthetic datasets created successfully.")


if __name__ == "__main__":
    main()
@@ -0,0 +1,107 @@
1
+ #´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
2
+ # IMPORTS #
3
+ #.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
4
+
5
+ import os
6
+ import numpy as np
7
+ import pandas as pd
8
+
9
+ from Bio import SeqIO
10
+ from Bio.SeqRecord import SeqRecord
11
+
12
+ #´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
13
+ # CONSTANTS #
14
+ #.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
15
+
16
# FASTA file whose first record is used as the reference sequence
# (resolved relative to the current working directory).
WUHAN_REF_PATH = "tests/data/test3/test3UK.fasta"

# Integer keys 0..40 mapped to 41 dates spaced 7 days apart, the first one
# being 2020-01-01. Looked up with the row index of the synthdata table.
DATE_MAP = dict(enumerate(
    pd.date_range(start="2020-01-01", periods=41, freq="7D")
))

# Alphabet from which substituted bases are drawn.
NUCLEOTIDES = {"A", "C", "G", "T"}

# Directory whose sub-directories hold the test5 ``.txt`` inputs.
TEST5_BASE_PATH = "tests/data/test5"
23
+
24
+ #´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
25
+ # FUNCTIONS #
26
+ #.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
27
+
28
def load_synthdata(path: str) -> pd.DataFrame:
    """Read a tab-separated table and relabel its columns.

    The first column of the file becomes the index. The remaining columns
    are renamed ``seq0``, ``seq1``, ... in their original order.
    """
    table = pd.read_csv(path, sep="\t", index_col=0)
    table.columns = [
        "seq" + str(position)
        for position, _ in enumerate(table.columns)
    ]
    return table
36
+
37
def get_wuhan_ref() -> SeqRecord:
    """Return the first record of the FASTA file at ``WUHAN_REF_PATH``.

    The reference sequence is expected to be the first entry of that file,
    so only that record is parsed.

    Raises:
        ValueError: If the file contains no FASTA records. Previously the
            function fell off the end and returned ``None``, which only
            surfaced later as an ``AttributeError`` in the callers.
    """
    with open(WUHAN_REF_PATH, "r") as handle:
        # Only the first record is needed; stop parsing right after it.
        first_record = next(SeqIO.parse(handle, "fasta"), None)

    if first_record is None:
        raise ValueError(f"No FASTA records found in {WUHAN_REF_PATH!r}")
    return first_record
44
+
45
def create_synthetic_sequence(mut_num: int, wuhan_ref: SeqRecord) -> str:
    """Return a copy of the reference sequence with ``mut_num`` substitutions.

    ``mut_num`` distinct positions are drawn without replacement, and the
    base at each of them is replaced by a different base from
    ``NUCLEOTIDES``.

    Args:
        mut_num: Number of positions to mutate. NumPy raises ``ValueError``
            if it exceeds the length of the reference sequence.
        wuhan_ref: Record whose ``seq`` attribute holds the reference.

    Returns:
        The mutated sequence as a plain string.
    """
    bases = list(wuhan_ref.seq)
    # An integer population is equivalent to sampling from range(len(bases)).
    positions = np.random.choice(len(bases), mut_num, replace=False)

    for pos in positions:
        # Sorted candidates: the iteration order of a set of strings changes
        # between interpreter runs, which would make the output
        # irreproducible even under a fixed ``np.random.seed``.
        # Upper-casing keeps a lower-case reference base from being
        # "mutated" into its own upper-case form.
        candidates = sorted(NUCLEOTIDES - {bases[pos].upper()})
        bases[pos] = np.random.choice(candidates)

    return "".join(bases)
57
+
58
+
59
def create_synthetic_sequences(output_path: str, synthdata: pd.DataFrame, wuhan_ref: SeqRecord) -> None:
    """Write a synthetic FASTA file together with its date table.

    Two files are produced: ``<output_path>.fasta`` and
    ``<output_path>.tsv``. Both start with the reference record, which is
    dated 2019-12-29 in the table. Then, for every cell of ``synthdata``,
    one sequence named ``<column>_<row index>`` is generated with as many
    mutations as the cell value and dated with ``DATE_MAP[<row index>]``.
    """
    fasta_name = f"{output_path}.fasta"
    table_name = f"{output_path}.tsv"

    with open(fasta_name, "w") as fasta_out, open(table_name, "w") as table_out:
        # The unmodified reference always comes first in both files.
        fasta_out.write(f">{wuhan_ref.id}\n{wuhan_ref.seq}\n")
        table_out.write(f"id\tdate\n{wuhan_ref.id}\t2019-12-29\n")

        for row_label, counts in synthdata.iterrows():
            for column, n_mutations in counts.items():
                seq_id = f"{column}_{row_label}"
                mutated = create_synthetic_sequence(n_mutations, wuhan_ref)
                fasta_out.write(f">{seq_id}\n{mutated}\n")
                stamp = DATE_MAP[row_label].strftime('%Y-%m-%d')
                table_out.write(f"{seq_id}\t{stamp}\n")
70
+
71
def process_test5_files():
    """Generate ``.fasta``/``.tsv`` files for every ``.txt`` file under test5.

    Each sub-directory of ``TEST5_BASE_PATH`` is scanned for ``.txt`` files.
    For every one of them the outputs are written next to it, sharing its
    base name. Listings are sorted so files are always processed in the same
    order.
    """
    reference = get_wuhan_ref()

    for subdir in sorted(os.listdir(TEST5_BASE_PATH)):
        subdir_path = os.path.join(TEST5_BASE_PATH, subdir)
        # A stray file directly under the base path cannot be listed;
        # skip it instead of failing with NotADirectoryError.
        if not os.path.isdir(subdir_path):
            continue

        for txt_file in sorted(os.listdir(subdir_path)):
            if not txt_file.endswith(".txt"):
                continue

            # ``str.strip(".txt")`` removes any leading/trailing '.', 't' or
            # 'x' characters ("test.txt" -> "es"); splitext drops only the
            # extension.
            name, _ = os.path.splitext(txt_file)
            create_synthetic_sequences(
                os.path.join(subdir_path, name),
                load_synthdata(os.path.join(subdir_path, txt_file)),
                reference,
            )
93
+
94
+
95
+ #´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
96
+ # MAIN #
97
+ #.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
98
+
99
def main():
    """Entry point: regenerate every test5 dataset, reporting start and end."""
    print("Processing all .txt files in test5 subdirectories...")
    process_test5_files()
    print("All synthetic datasets created successfully.")


if __name__ == "__main__":
    main()