PyEvoMotion 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyEvoMotion/cli.py +1 -8
- PyEvoMotion/core/base.py +83 -20
- PyEvoMotion/core/core.py +82 -38
- PyEvoMotion/core/parser.py +4 -1
- {pyevomotion-0.1.0.dist-info → pyevomotion-0.1.1.dist-info}/METADATA +72 -4
- pyevomotion-0.1.1.dist-info/RECORD +31 -0
- share/anomalous_diffusion.pdf +0 -0
- share/figUK.tsv +9949 -0
- share/figUK_plots.pdf +0 -0
- share/figUK_regression_results.json +18 -0
- share/figUK_run_args.json +13 -0
- share/figUK_stats.tsv +41 -0
- share/figUSA.tsv +9470 -0
- share/figUSA_plots.pdf +0 -0
- share/figUSA_regression_results.json +18 -0
- share/figUSA_run_args.json +13 -0
- share/figUSA_stats.tsv +34 -0
- share/figdataUK.tsv +10001 -0
- share/figdataUSA.tsv +10001 -0
- share/figure.pdf +0 -0
- share/generate_sequences_from_synthdata.py +85 -0
- share/manuscript_figure.py +457 -12
- share/synth_figure.pdf +0 -0
- share/uk_time_windows.pdf +0 -0
- share/weekly_size.pdf +0 -0
- pyevomotion-0.1.0.dist-info/RECORD +0 -13
- {pyevomotion-0.1.0.dist-info → pyevomotion-0.1.1.dist-info}/WHEEL +0 -0
- {pyevomotion-0.1.0.dist-info → pyevomotion-0.1.1.dist-info}/entry_points.txt +0 -0
share/figure.pdf
ADDED
Binary file
|
@@ -0,0 +1,85 @@
|
|
1
|
+
#´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
|
2
|
+
# IMPORTS #
|
3
|
+
#.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
|
4
|
+
|
5
|
+
import numpy as np
|
6
|
+
import pandas as pd
|
7
|
+
|
8
|
+
from Bio import SeqIO
|
9
|
+
from Bio.SeqRecord import SeqRecord
|
10
|
+
|
11
|
+
#´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
|
12
|
+
# CONSTANTS #
|
13
|
+
#.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
|
14
|
+
|
15
|
+
# FASTA file whose first record is used as the reference sequence.
WUHAN_REF_PATH = "tests/data/test3/test3UK.fasta"

# Row number (0..40) -> date, one entry every 7 days starting on 2020-01-01.
DATE_MAP = dict(enumerate(
    pd.date_range(start="2020-01-01", periods=41, freq="7D")
))

# Alphabet from which replacement bases are drawn.
NUCLEOTIDES = set("ACGT")
|
21
|
+
|
22
|
+
#´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
|
23
|
+
# FUNCTIONS #
|
24
|
+
#.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
|
25
|
+
|
26
|
+
def load_synthdata(path: str) -> pd.DataFrame:
    """Reads a tab-separated synthdata table and relabels its columns.

    The first column of the file is used as the index. The remaining columns
    are renamed positionally to ``seq0``, ``seq1``, ... whatever their
    original headers were.

    :param path: Location of the tab-separated file.
    :return: The table with ``seq<k>`` column names.
    """

    table = pd.read_csv(path, sep="\t", index_col=0)
    table.columns = [f"seq{k}" for k, _ in enumerate(table.columns)]

    return table
|
34
|
+
|
35
|
+
def get_wuhan_ref() -> SeqRecord:
    """Returns the first record of the FASTA file at ``WUHAN_REF_PATH``.

    The Wuhan reference is expected to be the first entry of that file, so
    parsing stops as soon as one record has been read. ``None`` is returned
    if the file contains no record at all.
    """
    with open(WUHAN_REF_PATH, "r") as handle:
        # Only the leading record is needed; the rest of the file is not parsed.
        return next(SeqIO.parse(handle, "fasta"), None)
|
42
|
+
|
43
|
+
def create_synthetic_sequence(mut_num: int, wuhan_ref: SeqRecord) -> str:
    """Creates a synthetic sequence from the Wuhan reference sequence.

    ``mut_num`` distinct positions are drawn without replacement and the base
    at each of them is replaced by a different one taken from ``NUCLEOTIDES``.

    :param mut_num: Number of positions to substitute.
    :param wuhan_ref: Record whose ``seq`` is used as the template.
    :return: The mutated sequence as a plain string.
    :raises ValueError: Raised by ``np.random.choice`` when ``mut_num`` is
        larger than the length of the sequence.
    """
    _seq = list(wuhan_ref.seq)
    mut_positions = np.random.choice(len(_seq), mut_num, replace=False)

    for pos in mut_positions:
        # Sort the candidates: iterating a set of str follows the per-process
        # hash seed, so drawing from ``list(set)`` would make the result
        # irreproducible even when the NumPy RNG is seeded.
        candidates = sorted(NUCLEOTIDES - {_seq[pos]})
        _seq[pos] = np.random.choice(candidates)

    return "".join(_seq)
|
55
|
+
|
56
|
+
|
57
|
+
def create_synthetic_sequences(output_path: str, synthdata: pd.DataFrame, wuhan_ref: SeqRecord) -> None:
    """Writes the synthetic dataset as ``<output_path>.fasta`` and ``<output_path>.tsv``.

    The reference sequence is written first, dated 2019-12-29. After that,
    every cell of ``synthdata`` yields one synthetic sequence: the cell value
    is passed to ``create_synthetic_sequence`` as the number of mutations,
    the record is named ``<column>_<row>``, and its date is ``DATE_MAP[row]``.

    :param output_path: Path prefix (no extension) of the two output files.
    :param synthdata: Table of mutation counts, one row per entry of ``DATE_MAP``.
    :param wuhan_ref: Reference record used as the template.
    """
    fasta_path = f"{output_path}.fasta"
    meta_path = f"{output_path}.tsv"

    with open(fasta_path, "w") as fasta, open(meta_path, "w") as meta:
        # Reference record and metadata header go first.
        fasta.write(f">{wuhan_ref.id}\n{wuhan_ref.seq}\n")
        meta.write(f"id\tdate\n{wuhan_ref.id}\t2019-12-29\n")

        for week, counts in synthdata.iterrows():
            for seq_name, n_mut in counts.items():
                record_id = f"{seq_name}_{week}"
                sequence = create_synthetic_sequence(n_mut, wuhan_ref)
                fasta.write(f">{record_id}\n{sequence}\n")
                stamp = DATE_MAP[week].strftime('%Y-%m-%d')
                meta.write(f"{record_id}\t{stamp}\n")
|
68
|
+
|
69
|
+
#´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
|
70
|
+
# MAIN #
|
71
|
+
#.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
|
72
|
+
|
73
|
+
def main():
    """Generates the FASTA/TSV pair for ``synthdata1`` and ``synthdata2``.

    Each table is read from ``tests/data/test4/<name>.txt`` and the outputs
    are written with ``<name>`` as path prefix.
    """
    reference = get_wuhan_ref()

    datasets = ("synthdata1", "synthdata2")
    for dataset in datasets:
        counts = load_synthdata(f"tests/data/test4/{dataset}.txt")
        create_synthetic_sequences(f"{dataset}", counts, reference)

    # NOTE(review): indentation was lost in the reviewed source; the message is
    # assumed to be printed once, after both datasets are done - confirm.
    print("Synthetic datasets created successfully.")
|
83
|
+
|
84
|
+
if __name__ == "__main__":
|
85
|
+
main()
|
share/manuscript_figure.py
CHANGED
@@ -4,12 +4,21 @@ import json
|
|
4
4
|
import zipfile
|
5
5
|
import warnings
|
6
6
|
import urllib.request
|
7
|
+
import subprocess
|
7
8
|
|
8
9
|
import numpy as np
|
9
10
|
import pandas as pd
|
10
11
|
import matplotlib as mpl
|
11
12
|
import matplotlib.pyplot as plt
|
12
13
|
|
14
|
+
#´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
|
15
|
+
# CONSTANTS #
|
16
|
+
#.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
|
17
|
+
|
18
|
+
# Hex colour used for each country in the figures of this script.
COLORS = dict(
    UK="#76d6ff",
    USA="#FF6346",
)
|
13
22
|
|
14
23
|
#´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
|
15
24
|
# FUNCTIONS #
|
@@ -202,15 +211,10 @@ def safe_map(f: callable, x: list[int | float]) -> list[int | float]:
|
|
202
211
|
_results.append(None)
|
203
212
|
return _results
|
204
213
|
|
205
|
-
def
|
214
|
+
def plot_main_figure(df: pd.DataFrame, models: dict[str, any], export: bool = False, show: bool = True) -> None:
|
206
215
|
set_matplotlib_global_params()
|
207
216
|
fig, ax = plt.subplots(2, 1, figsize=(6, 10))
|
208
217
|
|
209
|
-
colors = {
|
210
|
-
"UK": "#76d6ff",
|
211
|
-
"USA": "#FF6346",
|
212
|
-
}
|
213
|
-
|
214
218
|
for idx, case in enumerate(("mean", "var")):
|
215
219
|
for col in (f"{case} number of mutations USA", f"{case} number of mutations UK"):
|
216
220
|
|
@@ -219,16 +223,16 @@ def plot(df: pd.DataFrame, models: dict[str, any], export: bool = False, show: b
|
|
219
223
|
ax[idx].scatter(
|
220
224
|
df.index,
|
221
225
|
df[col] - (df[col].min() if idx == 1 else 0),
|
222
|
-
color=
|
226
|
+
color=COLORS[_country],
|
223
227
|
edgecolor="k",
|
224
228
|
zorder=2,
|
225
229
|
)
|
226
230
|
|
227
|
-
_x = np.arange(-10,
|
231
|
+
_x = np.arange(-10, 60, 0.5)
|
228
232
|
ax[idx].plot(
|
229
233
|
_x + (8 if _country == "USA" else 0),
|
230
234
|
safe_map(models[_country][case][0], _x),
|
231
|
-
color=
|
235
|
+
color=COLORS[_country],
|
232
236
|
label=rf"{_country} ($R^2 = {round(models[_country][case][1], 2):.2f})$",
|
233
237
|
linewidth=3,
|
234
238
|
zorder=1,
|
@@ -268,6 +272,432 @@ def plot(df: pd.DataFrame, models: dict[str, any], export: bool = False, show: b
|
|
268
272
|
|
269
273
|
if show: plt.show()
|
270
274
|
|
275
|
+
def size_plot(df: pd.DataFrame, export: bool = False, show: bool = True) -> None:
    """Draws the ``size UK`` and ``size USA`` columns of ``df`` as stem plots.

    The UK series is drawn twice: once underneath the USA series (carrying
    the legend entry) and once more on top with half-transparent stems.

    Args:
        df: Table with ``size UK`` and ``size USA`` columns; its index is the x axis.
        export: Whether to save the figure as ``share/weekly_size.pdf``.
        show: Whether to show the figure.
    """
    set_matplotlib_global_params()
    fig, ax = plt.subplots(1, 1, figsize=(6, 6))

    # (country, legend label, extra stem-line properties, marker edge colour),
    # listed in drawing order.
    layers = (
        ("UK", "UK", {}, "k"),
        ("USA", "USA", {}, "k"),
        ("UK", None, {"alpha": 0.5}, "#000000"),
    )
    for country, legend_label, stem_extras, edge in layers:
        markers, stems, base = ax.stem(df.index, df[f"size {country}"], label=legend_label)
        plt.setp(stems, color=COLORS[country], **stem_extras)
        plt.setp(markers, color=COLORS[country], markeredgecolor=edge)
        plt.setp(base, color="#ffffff")

    ax.set_ylim(0, 405)
    ax.set_xlim(-0.5, 40.5)

    ax.set_xlabel("time (wk)")
    ax.set_ylabel("Number of sequences")

    ax.legend(fontsize=16, loc="upper right", bbox_to_anchor=(1.08, 1.08))

    if export:
        fig.savefig("share/weekly_size.pdf", dpi=400, bbox_inches="tight")
        print("Figure saved as share/weekly_size.pdf")

    if show:
        plt.show()
|
318
|
+
|
319
|
+
def anomalous_diffusion_plot(export: bool = False, show: bool = True) -> None:
    """Draws the curves ``x**alpha`` for alpha = 0.8, 1 and 1.2 on ``[0, 10]``.

    The three curves are labelled subdiffusion, normal diffusion and
    superdiffusion. Axis ticks are removed.

    Args:
        export: Whether to save the figure as ``share/anomalous_diffusion.pdf``.
        show: Whether to show the figure.
    """
    set_matplotlib_global_params()
    fig, ax = plt.subplots(1, 1, figsize=(6, 6))

    x = np.linspace(0, 10, 100)

    # (exponent, legend label, colour), listed in drawing order.
    regimes = (
        (0.8, r"$\alpha = 0.8$" + "\n(subdiffusion)", COLORS["UK"]),
        (1, r"$\alpha = 1$" + "\n(normal diffusion)", "#000000"),
        (1.2, r"$\alpha = 1.2$" + "\n(superdiffusion)", COLORS["USA"]),
    )
    for exponent, label, colour in regimes:
        plt.plot(x, x**exponent, label=label, color=colour, linewidth=3)

    plt.legend(
        fontsize=13,
        loc="upper left",
        title=r"variance $\propto \text{time}^\alpha$",
        title_fontsize=15,
    )

    plt.xlabel("time")
    plt.ylabel("variance")

    for clear_ticks in (ax.set_xticks, ax.set_yticks):
        clear_ticks([])

    plt.xlim(0, 10)
    plt.ylim(0, 10)

    if export:
        fig.savefig("share/anomalous_diffusion.pdf", dpi=400, bbox_inches="tight")
        print("Figure saved as share/anomalous_diffusion.pdf")

    if show:
        plt.show()
|
354
|
+
|
355
|
+
def check_synthetic_data_exists() -> bool:
    """
    Tell whether all four synthetic-data output files are present.

    Returns:
        True only if both ``*_out_stats.tsv`` and both
        ``*_out_regression_results.json`` files exist under ``tests/data/test4``.
    """
    required = (
        "tests/data/test4/synthdata1_out_stats.tsv",
        "tests/data/test4/synthdata2_out_stats.tsv",
        "tests/data/test4/synthdata1_out_regression_results.json",
        "tests/data/test4/synthdata2_out_regression_results.json",
    )

    return all(os.path.exists(path) for path in required)
|
371
|
+
|
372
|
+
def run_synthetic_data_tests() -> None:
    """
    Run the ``PyEvoMotion`` command on the S1 and S2 datasets.

    For each dataset the command receives ``<label>.fasta``, ``<label>.tsv``,
    an output prefix under ``tests/data/test4`` and the ``-ep`` flag. The
    output directory is created if it does not exist yet.

    Raises:
        RuntimeError: If the command exits with a non-zero status. Its
            captured stdout and stderr are printed first.
    """
    print("Running synthetic data tests to generate required files...")

    # Create output directory
    os.makedirs("tests/data/test4", exist_ok=True)

    # (dataset label, output prefix), processed in this order.
    runs = (
        ("S1", "tests/data/test4/synthdata1_out"),
        ("S2", "tests/data/test4/synthdata2_out"),
    )

    for label, out_prefix in runs:
        result = subprocess.run(
            [
                "PyEvoMotion",
                f"{label}.fasta",
                f"{label}.tsv",
                out_prefix,
                "-ep",
            ],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )

        # The exit status decides success. Text on stderr alone is not proof
        # of failure (warnings are written there too), and a failing run may
        # leave stderr empty.
        if result.returncode != 0:
            print(result.stdout)
            print(result.stderr)
            raise RuntimeError(f"Failed to process {label} dataset")

        if result.stderr:
            # Successful run that still wrote diagnostics: surface them.
            print(result.stderr)
|
418
|
+
|
419
|
+
def load_synthetic_data_df() -> pd.DataFrame:
    """
    Load both synthetic stats tables and join them on ``date``.

    The output files are generated first if any of them is missing. Columns
    present in both tables get the suffix `` synt1`` or `` synt2``.

    Returns:
        The outer merge of the two stats tables.
    """
    if not check_synthetic_data_exists():
        run_synthetic_data_tests()

    first = pd.read_csv("tests/data/test4/synthdata1_out_stats.tsv", sep="\t")
    second = pd.read_csv("tests/data/test4/synthdata2_out_stats.tsv", sep="\t")

    return first.merge(
        second,
        on="date",
        how="outer",
        suffixes=(" synt1", " synt2"),
    )
|
435
|
+
|
436
|
+
def load_synthetic_data_models() -> dict[str, dict[str, callable]]:
    """
    Build the fitted mean and variance models of both synthetic datasets.

    The output files are generated first if any of them is missing. For each
    of ``synt1`` and ``synt2`` the result holds a ``mean`` and a ``var``
    entry, each a two-item list ``[model function, r2]``:

    - mean (both datasets): ``m*x + b``
    - var of ``synt1``: ``m*x``
    - var of ``synt2``: ``d*x**alpha``

    The ``r2`` values are read immediately; the model parameters are only
    looked up when a model function is called.
    """
    if not check_synthetic_data_exists():
        run_synthetic_data_tests()

    fits = {}
    for kind, number in (("synt1", "1"), ("synt2", "2")):
        path = "tests/data/test4/synthdata{}_out_regression_results.json".format(number)
        with open(path) as handle:
            fits[kind] = json.load(handle)

    mean_key = "mean number of mutations per 7D model"
    var_key = "scaled var number of mutations per 7D model"

    def mean_line(kind):
        # Straight line; parameters are fetched at call time.
        def model(x):
            return (
                fits[kind][mean_key]["parameters"]["m"]*x
                + fits[kind][mean_key]["parameters"]["b"]
            )
        return model

    def synt1_var(x):
        # Line through the origin.
        return fits["synt1"][var_key]["parameters"]["m"]*x

    def synt2_var(x):
        # Power law.
        return (
            fits["synt2"][var_key]["parameters"]["d"]
            *(x**fits["synt2"][var_key]["parameters"]["alpha"])
        )

    return {
        "synt1": {
            "mean": [mean_line("synt1"), fits["synt1"][mean_key]["r2"]],
            "var": [synt1_var, fits["synt1"][var_key]["r2"]],
        },
        "synt2": {
            "mean": [mean_line("synt2"), fits["synt2"][mean_key]["r2"]],
            "var": [synt2_var, fits["synt2"][var_key]["r2"]],
        },
    }
|
482
|
+
|
483
|
+
def synthetic_data_plot(df: pd.DataFrame, models: dict[str, any], export: bool = False, show: bool = True) -> None:
    """
    Draw a 2x2 grid of the synthetic datasets: data points plus fitted curve.

    Panels are filled row by row: mean of synt1, mean of synt2, variance of
    synt1, variance of synt2. They are annotated "a" to "d".

    Args:
        df: Table with the ``<case> number of mutations <kind>`` columns; its index is the x axis.
        models: ``models[kind][case]`` is ``[model function, r2]``.
        export: Whether to save the figure as ``share/synth_figure.pdf``.
        show: Whether to show the figure.
    """
    set_matplotlib_global_params()
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))

    # One (case, dataset) pair per panel, in row-major panel order.
    layout = [
        (case, kind)
        for case in ("mean", "var")
        for kind in ("synt1", "synt2")
    ]

    for panel, (case, kind) in zip(axes.flatten(), layout):
        col = f"{case} number of mutations {kind}"

        # Data points
        panel.scatter(
            df.index,
            df[col],
            color="#76d6ff",
            edgecolor="k",
            zorder=2,
        )

        # Fitted curve
        grid = np.arange(-10, 50, 0.5)
        panel.plot(
            grid,
            safe_map(models[kind][case][0], grid),
            color="#76d6ff",
            label=rf"$R^2 = {round(models[kind][case][1], 2):.2f}$",
            linewidth=3,
            zorder=1,
        )

        # Axis limits and labels
        panel.set_xlim(-0.5, 40.5)
        if case == "mean":
            y_range = (-0.25, 20.25)
            y_label = f"{case} (# mutations)"
        else:
            y_range = (-0.5, 40.5) if kind == "synt1" else (-0.1, 10.1)
            y_label = f"{case}iance (# mutations)"
        panel.set_ylim(*y_range)
        panel.set_ylabel(y_label)

        panel.set_xlabel("time (wk)")
        panel.legend(fontsize=16, loc="upper left")

    fig.suptitle(" ", fontsize=1) # To get some space on top
    fig.tight_layout()

    # Panel letters, placed in figure-fraction coordinates.
    letters = (
        ("a", (0.02, 0.935)),
        ("b", (0.505, 0.935)),
        ("c", (0.02, 0.465)),
        ("d", (0.505, 0.465)),
    )
    for letter, anchor in letters:
        plt.annotate(letter, anchor, xycoords="figure fraction", fontsize=28, fontweight="bold")

    if export:
        fig.savefig("share/synth_figure.pdf", dpi=400, bbox_inches="tight")
        print("Figure saved as share/synth_figure.pdf")

    if show:
        plt.show()
|
555
|
+
|
556
|
+
|
557
|
+
def load_additional_uk_stats() -> dict[str, pd.DataFrame]:
    """
    Read the UK stats table of each time window.

    Returns:
        A dictionary keyed by window ("5D", "10D", "14D", "7D", in that
        order) whose values are the tab-separated stats files.
    """
    sources = (
        ("5D", "tests/data/test3/output/20250517164757/UKout_5D_stats.tsv"),
        ("10D", "tests/data/test3/output/20250517173133/UKout_10D_stats.tsv"),
        ("14D", "tests/data/test3/output/20250517181004/UKout_14D_stats.tsv"),
        ("7D", "share/figUK_stats.tsv"),
    )

    tables = {}
    for window, path in sources:
        tables[window] = pd.read_csv(path, sep="\t")

    return tables
|
572
|
+
|
573
|
+
def load_additional_uk_models() -> dict[str, dict[str, callable]]:
    """
    Read the UK regression results of each time window.

    Returns:
        A dictionary keyed by window. Each value has a ``mean`` entry
        ``[{"m": ..., "b": ...}, r2]`` and a ``var`` entry
        ``[{"d": ..., "alpha": ...}, r2]``, taken from the
        ``mean number of mutations per <window> model`` and
        ``scaled var number of mutations per <window> model`` sections.
    """
    sources = {
        "5D": "tests/data/test3/output/20250517164757/UKout_5D_regression_results.json",
        "10D": "tests/data/test3/output/20250517173133/UKout_10D_regression_results.json",
        "14D": "tests/data/test3/output/20250517181004/UKout_14D_regression_results.json",
        "7D": "share/figUK_regression_results.json"
    }

    fits = {}
    for window, path in sources.items():
        with open(path) as handle:
            fits[window] = json.load(handle)

    def summarise(window):
        # The mean section is read in full before the variance section.
        mean_fit = fits[window][f"mean number of mutations per {window} model"]
        mean_entry = [
            {
                "m": mean_fit["parameters"]["m"],
                "b": mean_fit["parameters"]["b"],
            },
            mean_fit["r2"],
        ]

        var_fit = fits[window][f"scaled var number of mutations per {window} model"]
        var_entry = [
            {
                "d": var_fit["parameters"]["d"],
                "alpha": var_fit["parameters"]["alpha"],
            },
            var_fit["r2"],
        ]

        return {"mean": mean_entry, "var": var_entry}

    return {window: summarise(window) for window in sources}
|
607
|
+
|
608
|
+
def plot_uk_time_windows(stats: dict[str, pd.DataFrame], models: dict[str, dict[str, callable]], export: bool = False, show: bool = True) -> None:
    """
    Plot a 2x4 grid of UK data: one column per time window (5D, 7D, 10D,
    14D), mean on the top row and variance on the bottom row.

    The x positions of the points are the table index multiplied by
    window/7, and the fitted curves are evaluated at x divided by that same
    factor. The variance points are shifted so that their minimum is 0.

    Args:
        stats: Dictionary of dataframes containing the stats for each time window
        models: Dictionary of model parameters and r2 for each time window
        export: Whether to export the figure to ``share/uk_time_windows.pdf``
        show: Whether to show the figure
    """
    set_matplotlib_global_params()
    fig, ax = plt.subplots(2, 4, figsize=(24, 12))

    # Order of time windows to plot, and the length of each one in weeks
    windows = ["5D", "7D", "10D", "14D"]
    scaling = {
        "5D": 5/7,
        "7D": 1,
        "10D": 10/7,
        "14D": 14/7,
    }

    # x grid on which every fitted curve is drawn
    grid = np.arange(-0.5, 51, 0.5)

    for column, window in enumerate(windows):
        df = stats[window]
        model = models[window]

        for row, case in enumerate(("mean", "var")):
            panel = ax[row, column]

            # Data points
            times = df.index.to_numpy()*scaling[window]
            if case == "mean":
                observed = df["mean number of mutations"]
            else:
                observed = df["var number of mutations"] - df["var number of mutations"].min()
            panel.scatter(
                times,
                observed,
                color=COLORS["UK"],
                edgecolor="k",
                zorder=2,
            )

            # Fitted curve: straight line for the mean, power law for the variance
            if case == "mean":
                fitted = model["mean"][0]["m"]*(grid/scaling[window]) + model["mean"][0]["b"]
                label = rf"Mean ($R^2 = {round(model['mean'][1], 2):.2f})$"
            else:
                fitted = model["var"][0]["d"]*(grid/scaling[window])**model["var"][0]["alpha"]
                label = rf"Var ($R^2 = {round(model['var'][1], 2):.2f})$"
            panel.plot(
                grid,
                fitted,
                color=COLORS["UK"],
                label=label,
                linewidth=3,
                zorder=1,
            )

            # Styling
            panel.set_xlim(-0.5, 40.5)
            y_range = (29.5, 45.5) if case == "mean" else (-0.5, 10.5)
            panel.set_ylim(*y_range)

            panel.set_xlabel("time (wk)")
            if column == 0:
                # Only the leftmost column carries a y label.
                panel.set_ylabel(f"{case} (# mutations)")

            panel.legend(fontsize=12, loc="upper left")

    if export:
        fig.savefig("share/uk_time_windows.pdf", dpi=400, bbox_inches="tight")

    if show:
        plt.show()
|
700
|
+
|
271
701
|
#´:°•.°+.*•´.*:˚.°*.˚•´.°:°•.°•.*•´.*:˚.°*.˚•´.°:°•.°+.*•´.*:#
|
272
702
|
# MAIN #
|
273
703
|
#.•°:°.´+˚.*°.˚:*.´•*.+°.•°:´*.´•*.•°.•°:°.´:•˚°.*°.˚:*.´+°.•#
|
@@ -290,7 +720,6 @@ def main(export: bool = False) -> None:
|
|
290
720
|
f"share/figdata{country}.tsv",
|
291
721
|
f"share/fig{country}",
|
292
722
|
"-k", "total",
|
293
|
-
"-n", "5",
|
294
723
|
"-dt", "7D",
|
295
724
|
"-dr", "2020-10-01..2021-08-01",
|
296
725
|
"-ep",
|
@@ -301,8 +730,24 @@ def main(export: bool = False) -> None:
|
|
301
730
|
df = load_final_data_df()
|
302
731
|
models = load_models()
|
303
732
|
|
304
|
-
#
|
305
|
-
|
733
|
+
# Main plot
|
734
|
+
plot_main_figure(df, models, export=export)
|
735
|
+
|
736
|
+
# Size plot
|
737
|
+
size_plot(df, export=export)
|
738
|
+
|
739
|
+
# Anomalous diffusion plot
|
740
|
+
anomalous_diffusion_plot(export=export)
|
741
|
+
|
742
|
+
# Synthetic data plot
|
743
|
+
synth_df = load_synthetic_data_df()
|
744
|
+
synth_models = load_synthetic_data_models()
|
745
|
+
synthetic_data_plot(synth_df, synth_models, export=export)
|
746
|
+
|
747
|
+
# UK time windows plot
|
748
|
+
additional_uk_stats = load_additional_uk_stats()
|
749
|
+
additional_uk_models = load_additional_uk_models()
|
750
|
+
plot_uk_time_windows(additional_uk_stats, additional_uk_models, export=export)
|
306
751
|
|
307
752
|
|
308
753
|
if __name__ == "__main__":
|
share/synth_figure.pdf
ADDED
Binary file
|
Binary file
|
share/weekly_size.pdf
ADDED
Binary file
|
@@ -1,13 +0,0 @@
|
|
1
|
-
PyEvoMotion/__init__.py,sha256=NqFDD-EZBzouzTwXozZqhPC9sLr7GQaElRKtP0tkHoE,568
|
2
|
-
PyEvoMotion/cli.py,sha256=424ATWV3P89sMYhXT2-i3eHN-VwmujWjcue0coxY-lQ,15498
|
3
|
-
PyEvoMotion/core/__init__.py,sha256=1I-NkFFh6ljLgB_mqQVFLNvCrVKEHLVxa_5dsv3ihWQ,450
|
4
|
-
PyEvoMotion/core/base.py,sha256=SvvJAuYx__NAI8Mzye9lZJM6mPsEuKnulcJuztiPU_E,13226
|
5
|
-
PyEvoMotion/core/core.py,sha256=xCVGZxNvIKge0KINHO-tHK6aD4tdL7Zgua10eONAbi0,18311
|
6
|
-
PyEvoMotion/core/parser.py,sha256=xbjTbIvNy6ta-8WBWwDdnUoTjARPE9eZyaHOXfQKW4U,17144
|
7
|
-
PyEvoMotion/utils.py,sha256=Ye3eL1RXZOZzzs2KZy0R45u06DOtLYo-zqE45tN2t7g,2859
|
8
|
-
share/mafft_install.sh,sha256=pCw70UsKkkNXUsZMwQlQ2b4zSXFrBA7jAj9iOfGLzUw,1007
|
9
|
-
share/manuscript_figure.py,sha256=czznZchVsb7qsCXEGJo-NFG7DJuAE2XxydLmqsAQ66g,9704
|
10
|
-
pyevomotion-0.1.0.dist-info/METADATA,sha256=tEv8So175_nhi-FAr_aCM3lo052xgIh-_HPJBGWCc6k,5950
|
11
|
-
pyevomotion-0.1.0.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
|
12
|
-
pyevomotion-0.1.0.dist-info/entry_points.txt,sha256=UMzoojYwQi-713hRggkQXUIfGNygUARhTdGs77Usp7s,53
|
13
|
-
pyevomotion-0.1.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|