PyEvoMotion 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyEvoMotion/__init__.py +11 -0
- PyEvoMotion/cli.py +440 -0
- PyEvoMotion/core/__init__.py +7 -0
- PyEvoMotion/core/base.py +406 -0
- PyEvoMotion/core/core.py +520 -0
- PyEvoMotion/core/parser.py +467 -0
- PyEvoMotion/utils.py +87 -0
- pyevomotion-0.1.0.dist-info/METADATA +117 -0
- pyevomotion-0.1.0.dist-info/RECORD +13 -0
- pyevomotion-0.1.0.dist-info/WHEEL +4 -0
- pyevomotion-0.1.0.dist-info/entry_points.txt +3 -0
- share/mafft_install.sh +44 -0
- share/manuscript_figure.py +316 -0
share/mafft_install.sh
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
# Utility script that installs mafft when it is not available on the system
# It requires git to be installed on the system
# Version will be bumped to latest version available on each revision
#
# MAFFT is built from source and installed under $HOME/.local; the resulting
# bin directory is appended to PATH through $HOME/.bashrc.

MAFFT_VERSION=v7.526
echo "Installing MAFFT $MAFFT_VERSION"

PREFIX="$HOME/.local"

# Remember the starting directory: the build happens two levels below
# mafft.temp, so the cleanup step must come back here to delete it.
START_DIR="$(pwd)"

# Print an error message and stop the installation.
abort() {
    echo "$1 Aborting..."
    exit 1
}

# mkdir -p succeeds silently when the directory already exists
mkdir -p "$PREFIX" || abort "Failed to create $PREFIX."

# Start from a clean temporary directory so a stale checkout left by a
# previous run cannot make the clone fail.
rm -rf mafft.temp
mkdir -p mafft.temp || abort "Failed to create the temporary directory."
cd mafft.temp || abort "Failed to enter the temporary directory."

# Clone the repository and checkout the required version
git clone https://gitlab.com/sysimm/mafft || abort "Failed to clone the MAFFT repository."
cd mafft/core || abort "Failed to enter the MAFFT source directory."
git checkout "$MAFFT_VERSION" || abort "Failed to checkout MAFFT $MAFFT_VERSION."

# Modify the Makefile to install MAFFT locally
if ! sed -i "s|PREFIX = /usr/local|PREFIX = $PREFIX|" Makefile; then
    abort "Failed to modify Makefile for MAFFT installation."
fi

# Build and install
# (make clean is best-effort: there is nothing to clean on a fresh clone)
make clean
make || abort "Failed to build MAFFT."
make install || abort "Failed to install MAFFT."

# Cleanup
cd "$START_DIR" || abort "Failed to return to $START_DIR."
rm -rf mafft.temp

if ! test -f "$HOME/.bashrc"; then
    touch "$HOME/.bashrc"
fi

# -F matches the line literally, so dots in the path are not regex wildcards
if ! grep -qF "PATH=$PREFIX/bin:\$PATH" "$HOME/.bashrc"; then
    echo "export PATH=$PREFIX/bin:\$PATH" >> "$HOME/.bashrc"
fi
|
@@ -0,0 +1,316 @@
|
|
1
|
+
import os
|
2
|
+
import sys
|
3
|
+
import json
|
4
|
+
import zipfile
|
5
|
+
import warnings
|
6
|
+
import urllib.request
|
7
|
+
|
8
|
+
import numpy as np
|
9
|
+
import pandas as pd
|
10
|
+
import matplotlib as mpl
|
11
|
+
import matplotlib.pyplot as plt
|
12
|
+
|
13
|
+
|
14
|
+
#´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
|
15
|
+
# FUNCTIONS #
|
16
|
+
#.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
|
17
|
+
|
18
|
+
def set_matplotlib_global_params() -> None:
    """
    Apply the manuscript figure styling to matplotlib's global rcParams.
    """
    mpl.rcParams.update(
        {
            "font.sans-serif": "Helvetica",
            "axes.linewidth": 2,
            "axes.labelsize": 22,
            "axes.spines.top": False,
            "axes.spines.right": False,
            "font.size": 20,
            "xtick.major.width": 2,
            "ytick.major.width": 2,
            "xtick.major.size": 6,
            "ytick.major.size": 6,
            "legend.frameon": False,
        }
    )
|
33
|
+
|
34
|
+
def check_test_data_exists() -> bool:
    """
    Tell whether every file of the UK-USA dataset is already on disk.

    The files are looked up under ``tests/data/test3/`` relative to the
    current working directory.
    """
    data_dir = "tests/data/test3/"
    required = (
        "test3UK.fasta",
        "test3USA.fasta",
        "test3UK.tsv",
        "test3USA.tsv",
    )

    return all(
        os.path.exists(os.path.join(data_dir, name))
        for name in required
    )
|
53
|
+
|
54
|
+
def download_test_data_zip() -> None:
    """
    Download the UK-USA dataset from the repository.

    The archive is stored as ``tests/data/test3/test_data.zip`` relative to
    the current working directory. The destination directory is created when
    it is missing, so the download does not fail with ``FileNotFoundError``
    on a fresh checkout. A warning is emitted first because the download may
    take a while.
    """
    _target = "tests/data/test3/test_data.zip"

    warnings.warn(
        "\n"
        "The necessary data for testing is not present.\n"
        "Downloading the UK-USA dataset from\n"
        "https://sourceforge.net/projects/pyevomotion/files/test_data.zip\n"
        "into\n"
        "tests/data/test3/test_data.zip\n"
        "This may take a while.\n"
    )

    # urlretrieve opens the target for writing and does not create parents.
    os.makedirs(os.path.dirname(_target), exist_ok=True)

    urllib.request.urlretrieve(
        "https://sourceforge.net/projects/pyevomotion/files/test_data.zip/download",
        _target
    )
|
71
|
+
|
72
|
+
def extract_test_data_zip() -> None:
    """
    Unpack the downloaded UK-USA archive in place, then delete the archive.
    """
    archive_path = "tests/data/test3/test_data.zip"

    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall("tests/data/test3/")

    # The archive is only removed once it has been closed.
    os.remove(archive_path)
|
79
|
+
|
80
|
+
def check_fig_data_exists() -> bool:
    """
    Tell whether both per-country figure data files are already on disk.
    """
    expected = (
        "share/figdataUK.tsv",
        "share/figdataUSA.tsv",
    )

    return all(os.path.exists(path) for path in expected)
|
94
|
+
|
95
|
+
def create_fig_data() -> None:
    """
    Build the per-country metadata subsets used for the manuscript figure.

    Reads the sampled identifiers from
    ``tests/data/test3/ids_sampled_for_figure.json``, downloads and extracts
    the UK-USA dataset when it is missing, and writes
    ``share/figdata{country}.tsv`` with only the rows whose ``id`` is listed
    for that country.
    """
    print("Creating figure data files for the manuscript...")

    with open("tests/data/test3/ids_sampled_for_figure.json") as handle:
        sampled_ids = json.load(handle)

    dataset_ready = check_test_data_exists()
    if not dataset_ready:
        print("The necessary data for testing is not present. Downloading it now...")
        download_test_data_zip()
        extract_test_data_zip()

    for country in ("UK", "USA"):
        metadata = pd.read_csv(
            f"tests/data/test3/test3{country}.tsv",
            sep="\t",
            index_col=0,
            parse_dates=["date"],
        )

        # Keep only the sequences sampled for this country and renumber rows.
        is_sampled = metadata["id"].isin(sampled_ids[country])
        subset = metadata[is_sampled].reset_index(drop=True)

        subset.to_csv(f"share/figdata{country}.tsv", sep="\t")
|
119
|
+
|
120
|
+
def check_final_data_and_models_exist() -> bool:
    """
    Tell whether the statistics tables and regression results of both
    countries are already on disk.
    """
    expected = (
        "share/figUSA_stats.tsv",
        "share/figUK_stats.tsv",
        "share/figUSA_regression_results.json",
        "share/figUK_regression_results.json",
    )

    missing = [path for path in expected if not os.path.exists(path)]
    return not missing
|
136
|
+
|
137
|
+
def load_final_data_df() -> pd.DataFrame:
    """
    Load the USA and UK statistics tables and join them on ``date``.

    The join is an outer one; columns present in both tables get the
    suffixes ``" USA"`` and ``" UK"`` respectively.
    """
    usa_stats = pd.read_csv("share/figUSA_stats.tsv", sep="\t")
    uk_stats = pd.read_csv("share/figUK_stats.tsv", sep="\t")

    return usa_stats.merge(
        uk_stats,
        on="date",
        how="outer",
        suffixes=(" USA", " UK"),
    )
|
150
|
+
|
151
|
+
def _linear_model(m: float, b: float):
    """Return the function ``x -> m*x + b``."""
    return lambda x: m*x + b


def _power_law_model(d: float, alpha: float):
    """Return the function ``x -> d*(x**alpha)``."""
    return lambda x: d*(x**alpha)


def load_models() -> dict[str, dict[str, list]]:
    """
    Load the fitted regression models of both countries.

    For each country the file ``share/fig{country}_regression_results.json``
    is read. The ``"mean number of mutations per 7D model"`` entry yields a
    linear function built from its ``m`` and ``b`` parameters; the
    ``"scaled var number of mutations per 7D model"`` entry yields a power
    law built from its ``d`` and ``alpha`` parameters.

    The parameters are read when the file is loaded, so a malformed results
    file raises ``KeyError`` here instead of when a model is first evaluated.

    :return: mapping ``country -> {"mean": [function, r2], "var": [function, r2]}``
        for the countries ``"USA"`` and ``"UK"``.
    """
    _file = "share/fig{}_regression_results.json"

    _models = {}

    for country in ("USA", "UK"):
        with open(_file.format(country)) as f:
            _fit = json.load(f)

        _mean = _fit["mean number of mutations per 7D model"]
        _var = _fit["scaled var number of mutations per 7D model"]

        # The factories bind the parameters per country, which avoids the
        # late-binding pitfall of defining lambdas directly in this loop.
        _models[country] = {
            "mean": [
                _linear_model(_mean["parameters"]["m"], _mean["parameters"]["b"]),
                _mean["r2"],
            ],
            "var": [
                _power_law_model(_var["parameters"]["d"], _var["parameters"]["alpha"]),
                _var["r2"],
            ],
        }

    return _models
|
195
|
+
|
196
|
+
def safe_map(f: callable, x: list[int | float]) -> list[int | float]:
|
197
|
+
_results = []
|
198
|
+
for el in x:
|
199
|
+
try: _results.append(f(el))
|
200
|
+
except Exception as e:
|
201
|
+
print(f"WARNING: {e}")
|
202
|
+
_results.append(None)
|
203
|
+
return _results
|
204
|
+
|
205
|
+
def plot(df: pd.DataFrame, models: dict[str, any], export: bool = False, show: bool = True) -> None:
    """
    Draw the two-panel manuscript figure: mean on top, variance below.

    Each panel shows, for USA and UK, the data points as a scatter plot and
    the fitted model as a line whose legend entry reports its R^2.

    :param df: table holding the columns ``"mean number of mutations USA"``,
        ``"mean number of mutations UK"``, ``"var number of mutations USA"``
        and ``"var number of mutations UK"``; its index is used as the x
        coordinate of the scatter points.
    :param models: mapping ``country -> case -> [function, r2]`` where
        ``case`` is ``"mean"`` or ``"var"``.
    :param export: when ``True`` the figure is saved as ``share/figure.pdf``.
    :param show: when ``True`` the figure is displayed with ``plt.show()``.
    """
    set_matplotlib_global_params()
    fig, ax = plt.subplots(2, 1, figsize=(6, 10))

    colors = {
        "UK": "#76d6ff",
        "USA": "#FF6346",
    }

    # Panel 0 is the mean, panel 1 is the variance.
    for idx, case in enumerate(("mean", "var")):
        for col in (f"{case} number of mutations USA", f"{case} number of mutations UK"):

            # The country is the last word of the column name.
            _country = col.split()[-1].upper()

            ax[idx].scatter(
                df.index,
                # Only in the variance panel is the column minimum subtracted.
                df[col] - (df[col].min() if idx == 1 else 0),
                color=colors[_country],
                edgecolor="k",
                zorder=2,
            )

            _x = np.arange(-10, 50, 0.5)
            ax[idx].plot(
                # The USA curve is drawn shifted by 8 units along x.
                # NOTE(review): presumably this aligns the USA fit with its
                # data on the shared axis -- confirm against the manuscript.
                _x + (8 if _country == "USA" else 0),
                # safe_map yields None wherever the model cannot be evaluated.
                safe_map(models[_country][case][0], _x),
                color=colors[_country],
                label=rf"{_country} ($R^2 = {round(models[_country][case][1], 2):.2f})$",
                linewidth=3,
                zorder=1,
            )

        # Styling
        ax[idx].set_xlim(-0.5, 40.5)
        ax[idx].set_ylim(30, 50) if idx == 0 else ax[idx].set_ylim(0, 16)

        ax[idx].set_xlabel("time (wk)")

        if case == "mean":
            ax[idx].set_ylabel(f"{case} (# mutations)")
        elif case == "var":
            ax[idx].set_ylabel(f"{case}iance (# mutations)")

        ax[idx].set_xticks(np.arange(0, 41, 10))
        ax[idx].set_yticks(np.arange(30, 51, 5)) if idx == 0 else ax[idx].set_yticks(np.arange(0, 17, 4))

        ax[idx].legend(
            fontsize=16,
            loc="upper left",
        )

    fig.suptitle(" ", fontsize=1) # To get some space on top
    fig.tight_layout()
    # Panel letters, positioned in figure-fraction coordinates.
    plt.annotate("a", (0.02, 0.94), xycoords="figure fraction", fontsize=28, fontweight="bold")
    plt.annotate("b", (0.02, 0.47), xycoords="figure fraction", fontsize=28, fontweight="bold")

    if export:
        fig.savefig(
            "share/figure.pdf",
            dpi=400,
            bbox_inches="tight",
        )
        print("Figure saved as share/figure.pdf")

    if show: plt.show()
|
270
|
+
|
271
|
+
#´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
|
272
|
+
# MAIN #
|
273
|
+
#.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
|
274
|
+
|
275
|
+
def main(export: bool = False) -> None:
    """
    Produce the manuscript figure, regenerating its inputs when needed.

    When the statistics tables or regression results are missing, the figure
    data files are created if necessary and PyEvoMotion is run once per
    country to produce them. The results are then loaded and plotted.

    :param export: when ``True`` the figure is also saved to disk by ``plot``.
    :raises subprocess.CalledProcessError: when a PyEvoMotion run exits with
        a non-zero status.
    :raises FileNotFoundError: when the ``PyEvoMotion`` executable cannot be
        found.
    """
    # Local import: only the regeneration branch needs it.
    import subprocess

    if not check_final_data_and_models_exist():
        print("Final data files do not exist. Creating them...")

        if not check_fig_data_exists():
            print("Figure data files do not exist. Creating them...")
            create_fig_data()

        for country in ["UK", "USA"]:
            # Invoke PyEvoMotion as if it were a command line tool
            print(f"Running PyEvoMotion for {country}...")
            # check=True stops right away on a failed run instead of failing
            # later on the output files that were never written.
            subprocess.run(
                [
                    "PyEvoMotion",
                    f"tests/data/test3/test3{country}.fasta",
                    f"share/figdata{country}.tsv",
                    f"share/fig{country}",
                    "-k", "total",
                    "-n", "5",
                    "-dt", "7D",
                    "-dr", "2020-10-01..2021-08-01",
                    "-ep",
                    "-xj",
                ],
                check=True,
            )

    # Load plot data & models
    df = load_final_data_df()
    models = load_models()

    # Plot
    plot(df, models, export=export)
|
306
|
+
|
307
|
+
|
308
|
+
if __name__ == "__main__":

    # Checking the argument count first means sys.argv[1] is never read
    # when the script is run without arguments.
    _export_flag = len(sys.argv) > 1 and sys.argv[1] == "export"

    main(export=_export_flag)
|