PyEvoMotion 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ PyEvoMotion=PyEvoMotion.cli:_main
3
+
share/mafft_install.sh ADDED
@@ -0,0 +1,44 @@
1
# Utility script that installs mafft when it is not available on the system
# It requires git to be installed on the system
# Version will be bumped to latest version available on each revision
#
# The sources are cloned into ./mafft.temp, built there, installed under
# $HOME/.local, and $HOME/.local/bin is appended to PATH through ~/.bashrc.

MAFFT_VERSION=v7.526;
echo "Installing MAFFT $MAFFT_VERSION";

PREFIX="$HOME/.local";

if ! test -d "$PREFIX"; then
    mkdir -p "$PREFIX";
fi

# Remember where we started: the build runs in mafft.temp/mafft/core, and the
# cleanup step has to come back here to be able to remove mafft.temp.
START_DIR="$(pwd)";

mkdir -p mafft.temp;
if ! cd mafft.temp; then
    echo "Failed to enter mafft.temp. Aborting...";
    exit 1;
fi

# Clone the repository and checkout the required version
git clone https://gitlab.com/sysimm/mafft;
if ! cd mafft/core; then
    echo "Failed to enter mafft/core (did the clone succeed?). Aborting...";
    exit 1;
fi
# Abort instead of silently building whatever revision happens to be checked out
if ! git checkout "$MAFFT_VERSION"; then
    echo "Failed to checkout MAFFT $MAFFT_VERSION. Aborting...";
    exit 1;
fi

# Modify the Makefile to install MAFFT locally
if ! sed -i "s|PREFIX = /usr/local|PREFIX = $PREFIX|" Makefile; then
    echo "Failed to modify Makefile for MAFFT installation. Aborting...";
    exit 1;
fi

# Build and install
make clean;
if ! make; then
    echo "Failed to build MAFFT. Aborting...";
    exit 1;
fi
if ! make install; then
    echo "Failed to install MAFFT. Aborting...";
    exit 1;
fi

# Cleanup
# A single "cd .." would only reach mafft.temp/mafft, where "rm -rf mafft.temp"
# removes nothing; go back to the directory that actually contains mafft.temp.
if ! cd "$START_DIR"; then
    echo "Failed to return to $START_DIR; mafft.temp was not removed.";
    exit 1;
fi
rm -rf mafft.temp;

if ! test -f "$HOME/.bashrc"; then
    touch "$HOME/.bashrc";
fi

# -F makes grep look for the literal text, so the dots and the dollar sign in
# the PATH entry are not interpreted as regular-expression syntax.
if ! grep -qF "PATH=$PREFIX/bin:\$PATH" "$HOME/.bashrc"; then
    echo "export PATH=$PREFIX/bin:\$PATH" >> "$HOME/.bashrc";
fi
@@ -0,0 +1,316 @@
1
+ import os
2
+ import sys
3
+ import json
4
+ import zipfile
5
+ import warnings
6
+ import urllib.request
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ import matplotlib as mpl
11
+ import matplotlib.pyplot as plt
12
+
13
+
14
+ #´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
15
+ # FUNCTIONS #
16
+ #.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
17
+
18
def set_matplotlib_global_params() -> None:
    """
    Apply the plotting style of this script to matplotlib's global rcParams.

    Every entry of the table below is written into ``mpl.rcParams``, so the
    settings affect all figures created afterwards in the same process.
    """
    style_table = (
        ("font.sans-serif", "Helvetica"),
        ("axes.linewidth", 2),
        ("axes.labelsize", 22),
        ("axes.spines.top", False),
        ("axes.spines.right", False),
        ("font.size", 20),
        ("xtick.major.width", 2),
        ("ytick.major.width", 2),
        ("xtick.major.size", 6),
        ("ytick.major.size", 6),
        ("legend.frameon", False),
    )
    for setting, value in style_table:
        mpl.rcParams[setting] = value
33
+
34
def check_test_data_exists() -> bool:
    """
    Tell whether the UK-USA dataset is already present on disk.

    Returns ``True`` only when all four expected files are found under
    ``tests/data/test3/`` (resolved against the current working directory).
    """
    data_dir = "tests/data/test3/"
    expected_names = (
        "test3UK.fasta",
        "test3USA.fasta",
        "test3UK.tsv",
        "test3USA.tsv",
    )

    return all(
        os.path.exists(os.path.join(data_dir, name))
        for name in expected_names
    )
53
+
54
def download_test_data_zip() -> None:
    """
    Download the UK-USA dataset from the repository.

    A warning announcing the download is emitted first. The archive is then
    fetched from SourceForge and stored as ``tests/data/test3/test_data.zip``
    (resolved against the current working directory). The destination
    directory is created when it is missing. Errors raised by
    ``urllib.request.urlretrieve`` are not caught here.
    """
    warnings.warn("""
    The necessary data for testing is not present.
    Downloading the UK-USA dataset from
    https://sourceforge.net/projects/pyevomotion/files/test_data.zip
    into
    tests/data/test3/test_data.zip
    This may take a while.
    """
    )
    # urlretrieve opens the target file directly and fails with
    # FileNotFoundError if its parent directory does not exist yet.
    os.makedirs("tests/data/test3", exist_ok=True)
    urllib.request.urlretrieve(
        "https://sourceforge.net/projects/pyevomotion/files/test_data.zip/download",
        "tests/data/test3/test_data.zip"
    )
71
+
72
def extract_test_data_zip() -> None:
    """
    Unpack the downloaded UK-USA archive, then delete the archive.

    Everything inside ``tests/data/test3/test_data.zip`` is extracted into
    ``tests/data/test3/``.
    """
    archive_path = "tests/data/test3/test_data.zip"

    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall("tests/data/test3/")

    # The archive is only a transport container; drop it once unpacked
    os.remove(archive_path)
79
+
80
def check_fig_data_exists() -> bool:
    """
    Tell whether both per-country figure data files are present.

    Returns ``True`` only when ``share/figdataUK.tsv`` and
    ``share/figdataUSA.tsv`` both exist.
    """
    required_paths = (
        "share/figdataUK.tsv",
        "share/figdataUSA.tsv",
    )
    return all(os.path.exists(path) for path in required_paths)
94
+
95
def create_fig_data() -> None:
    """
    Build the per-country TSV files used for the manuscript figure.

    The ids to keep are read from
    ``tests/data/test3/ids_sampled_for_figure.json``. If the UK-USA dataset is
    not on disk it is downloaded and extracted first. For each country the
    rows of ``tests/data/test3/test3{country}.tsv`` whose ``id`` is in the
    sampled list are written to ``share/figdata{country}.tsv``.
    """
    print("Creating figure data files for the manuscript...")
    with open("tests/data/test3/ids_sampled_for_figure.json") as handle:
        sampled_ids = json.load(handle)

    dataset_ready = check_test_data_exists()
    if not dataset_ready:
        print("The necessary data for testing is not present. Downloading it now...")
        download_test_data_zip()
        extract_test_data_zip()

    for country in ("UK", "USA"):
        table = pd.read_csv(
            f"tests/data/test3/test3{country}.tsv",
            sep="\t",
            index_col=0,
            parse_dates=["date"],
        )
        # Keep only the sampled sequences and renumber the rows from zero
        is_sampled = table["id"].isin(sampled_ids[country])
        subset = table[is_sampled].reset_index(drop=True)
        subset.to_csv(f"share/figdata{country}.tsv", sep="\t")
119
+
120
def check_final_data_and_models_exist() -> bool:
    """
    Tell whether the statistics tables and regression results are all present.

    Returns ``True`` only when the four ``share/fig{USA,UK}_stats.tsv`` and
    ``share/fig{USA,UK}_regression_results.json`` files exist.
    """
    required_paths = (
        "share/figUSA_stats.tsv",
        "share/figUK_stats.tsv",
        "share/figUSA_regression_results.json",
        "share/figUK_regression_results.json",
    )
    return all(os.path.exists(path) for path in required_paths)
136
+
137
def load_final_data_df() -> pd.DataFrame:
    """
    Load the USA and UK statistics tables and join them into one frame.

    Both TSV files are outer-merged on their ``date`` column; columns that
    exist in both tables get the suffix ``" USA"`` or ``" UK"``.
    """
    usa_stats = pd.read_csv("share/figUSA_stats.tsv", sep="\t")
    uk_stats = pd.read_csv("share/figUK_stats.tsv", sep="\t")

    return pd.merge(
        usa_stats,
        uk_stats,
        on="date",
        how="outer",
        suffixes=(" USA", " UK"),
    )
150
+
151
def load_models() -> dict[str, dict[str, callable]]:
    """
    Load the fitted regression models for the USA and the UK.

    Reads ``share/fig{country}_regression_results.json`` for both countries
    and returns ``{country: {"mean": [f, r2], "var": [f, r2]}}``. For
    ``"mean"`` the callable is the line ``m*x + b``; for ``"var"`` it is the
    power law ``d*(x**alpha)``. ``r2`` is taken verbatim from the JSON file.
    """
    mean_key = "mean number of mutations per 7D model"
    var_key = "scaled var number of mutations per 7D model"

    def _linear(fit):
        # The parameters are looked up each time the model is evaluated
        return lambda x: (
            fit["parameters"]["m"]*x
            + fit["parameters"]["b"]
        )

    def _power_law(fit):
        # The parameters are looked up each time the model is evaluated
        return lambda x: (
            fit["parameters"]["d"]
            *(x**fit["parameters"]["alpha"])
        )

    countries = ("USA", "UK")

    results = {}
    for country in countries:
        with open(f"share/fig{country}_regression_results.json") as handle:
            results[country] = json.load(handle)

    models = {}
    for country in countries:
        fits = results[country]
        models[country] = {
            "mean": [
                _linear(fits[mean_key]),
                fits[mean_key]["r2"],
            ],
            "var": [
                _power_law(fits[var_key]),
                fits[var_key]["r2"],
            ],
        }

    return models
195
+
196
+ def safe_map(f: callable, x: list[int | float]) -> list[int | float]:
197
+ _results = []
198
+ for el in x:
199
+ try: _results.append(f(el))
200
+ except Exception as e:
201
+ print(f"WARNING: {e}")
202
+ _results.append(None)
203
+ return _results
204
+
205
def plot(df: pd.DataFrame, models: dict[str, any], export: bool = False, show: bool = True) -> None:
    """
    Draw the two-panel figure: mean on the top panel, variance on the bottom.

    ``df`` must provide the columns ``"{mean,var} number of mutations {USA,UK}"``;
    its index is used as the x coordinate of the scatter points. ``models`` is
    indexed as ``models[country][case]`` and each entry holds the fitted
    callable at position 0 and its R^2 at position 1. With ``export`` the
    figure is saved to ``share/figure.pdf``; with ``show`` it is displayed.
    """
    set_matplotlib_global_params()
    fig, ax = plt.subplots(2, 1, figsize=(6, 10))

    palette = {
        "UK": "#76d6ff",
        "USA": "#FF6346",
    }

    # x values at which the fitted models are evaluated
    grid = np.arange(-10, 50, 0.5)

    for panel, case in zip(ax, ("mean", "var")):
        for country in ("USA", "UK"):
            column = f"{case} number of mutations {country}"

            # Only the variance panel is shifted so that its minimum sits at zero
            offset = df[column].min() if case == "var" else 0
            panel.scatter(
                df.index,
                df[column] - offset,
                color=palette[country],
                edgecolor="k",
                zorder=2,
            )

            entry = models[country][case]
            # The USA curve is drawn 8 units to the right of where it is evaluated
            shift = 8 if country == "USA" else 0
            panel.plot(
                grid + shift,
                safe_map(entry[0], grid),
                color=palette[country],
                label=rf"{country} ($R^2 = {round(entry[1], 2):.2f})$",
                linewidth=3,
                zorder=1,
            )

        # Styling
        if case == "mean":
            y_limits = (30, 50)
            y_label = f"{case} (# mutations)"
            y_ticks = np.arange(30, 51, 5)
        else:
            y_limits = (0, 16)
            y_label = f"{case}iance (# mutations)"
            y_ticks = np.arange(0, 17, 4)

        panel.set_xlim(-0.5, 40.5)
        panel.set_ylim(*y_limits)

        panel.set_xlabel("time (wk)")
        panel.set_ylabel(y_label)

        panel.set_xticks(np.arange(0, 41, 10))
        panel.set_yticks(y_ticks)

        panel.legend(
            fontsize=16,
            loc="upper left",
        )

    fig.suptitle(" ", fontsize=1) # To get some space on top
    fig.tight_layout()

    # Panel letters, positioned in figure-fraction coordinates
    for letter, position in (("a", (0.02, 0.94)), ("b", (0.02, 0.47))):
        plt.annotate(letter, position, xycoords="figure fraction", fontsize=28, fontweight="bold")

    if export:
        fig.savefig(
            "share/figure.pdf",
            dpi=400,
            bbox_inches="tight",
        )
        print("Figure saved as share/figure.pdf")

    if show:
        plt.show()
270
+
271
+ #´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
272
+ # MAIN #
273
+ #.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
274
+
275
def main(export: bool = False) -> None:
    """
    Produce the manuscript figure, generating any missing inputs first.

    If the statistics tables or regression results are missing, the figure
    data files are created when needed and PyEvoMotion is run once per
    country to produce them. The results are then loaded and plotted.

    ``export`` is forwarded to ``plot``. Raises
    ``subprocess.CalledProcessError`` when a PyEvoMotion run exits with a
    non-zero status.
    """
    # Imported here because subprocess is not among the imports visible at
    # the top of this file.
    import subprocess

    if not check_final_data_and_models_exist():
        print("Final data files do not exist. Creating them...")

        if not check_fig_data_exists():
            print("Figure data files do not exist. Creating them...")
            create_fig_data()

        for country in ["UK", "USA"]:
            # Invoke PyEvoMotion as if it were a command line tool
            print(f"Running PyEvoMotion for {country}...")
            # An argument list avoids the shell, and check=True stops right
            # away on failure instead of failing later on a missing file.
            subprocess.run(
                [
                    "PyEvoMotion",
                    f"tests/data/test3/test3{country}.fasta",
                    f"share/figdata{country}.tsv",
                    f"share/fig{country}",
                    "-k", "total",
                    "-n", "5",
                    "-dt", "7D",
                    "-dr", "2020-10-01..2021-08-01",
                    "-ep",
                    "-xj",
                ],
                check=True,
            )

    # Load plot data & models
    df = load_final_data_df()
    models = load_models()

    # Plot
    plot(df, models, export=export)
306
+
307
+
308
if __name__ == "__main__":

    # Checking the argument count first keeps sys.argv[1] from raising an
    # IndexError when the script is run without arguments
    _export_flag = len(sys.argv) > 1 and sys.argv[1] == "export"

    main(export=_export_flag)