PyEvoMotion 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyEvoMotion/cli.py +88 -11
- PyEvoMotion/core/base.py +373 -34
- PyEvoMotion/core/core.py +136 -43
- PyEvoMotion/core/parser.py +4 -1
- {pyevomotion-0.1.0.dist-info → pyevomotion-0.1.2.dist-info}/METADATA +72 -4
- pyevomotion-0.1.2.dist-info/RECORD +35 -0
- share/analyze_model_selection_accuracy.py +316 -0
- share/analyze_test_runs.py +436 -0
- share/anomalous_diffusion.pdf +0 -0
- share/confusion_matrix_heatmap.pdf +0 -0
- share/figUK.tsv +9949 -0
- share/figUK_plots.pdf +0 -0
- share/figUK_regression_results.json +65 -0
- share/figUK_run_args.json +14 -0
- share/figUK_stats.tsv +41 -0
- share/figUSA.tsv +9470 -0
- share/figUSA_plots.pdf +0 -0
- share/figUSA_regression_results.json +65 -0
- share/figUSA_run_args.json +14 -0
- share/figUSA_stats.tsv +34 -0
- share/figdataUK.tsv +10001 -0
- share/figdataUSA.tsv +10001 -0
- share/generate_sequences_from_synthdata.py +85 -0
- share/generate_sequences_from_test5_data.py +107 -0
- share/manuscript_figure.py +858 -43
- share/run_parallel_analysis.py +196 -0
- share/synth_figure.pdf +0 -0
- share/uk_time_windows.pdf +0 -0
- share/weekly_size.pdf +0 -0
- pyevomotion-0.1.0.dist-info/RECORD +0 -13
- {pyevomotion-0.1.0.dist-info → pyevomotion-0.1.2.dist-info}/WHEEL +0 -0
- {pyevomotion-0.1.0.dist-info → pyevomotion-0.1.2.dist-info}/entry_points.txt +0 -0
PyEvoMotion/core/core.py
CHANGED
@@ -62,7 +62,9 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
         :type date_range: tuple[str] | None
         """
 
+        self._verify_dt(dt)
         self.dt = dt
+        self.dt_ratio = self._get_time_ratio(dt)
 
         # Parse the input fasta and metadata files
         super().__init__(
@@ -89,7 +91,8 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
     def plot_results(cls,
         stats: pd.DataFrame,
         regs: dict[str, dict[str, any]],
-        data_xlabel_units: str
+        data_xlabel_units: str,
+        dt_ratio: float
     ) -> None:
         """
         Plot the results of the analysis.
@@ -110,7 +113,7 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
             for k,v in regs.items()
             if k.startswith("mean")
         )
-        _mean_data = stats[stats.columns[
+        _mean_data = stats[stats.columns[2]]
         cls.plot_single_data_and_model(
             stats.index,
             _mean_data,
@@ -118,7 +121,8 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
             _model["model"],
             r"$r^2$: " + f"{_model['r2']:.2f}",
             data_xlabel_units,
-            ax[0]
+            ax[0],
+            dt_ratio=dt_ratio
         )
 
         # Variance
@@ -127,7 +131,7 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
             for k,v in regs.items()
             if k.startswith("scaled var")
         )
-        _variance_data = stats[stats.columns[
+        _variance_data = stats[stats.columns[3]]
         cls.plot_single_data_and_model(
             stats.index,
             _variance_data,
@@ -135,7 +139,8 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
             _model["model"],
             r"$r^2$: " + f"{_model['r2']:.2f}",
             data_xlabel_units,
-            ax[1]
+            ax[1],
+            dt_ratio=dt_ratio
        )
 
         # Dispersion index
@@ -147,6 +152,7 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
             "Poissonian regime",
             data_xlabel_units,
             ax[2],
+            dt_ratio=dt_ratio,
             line_linestyle="--",
             line_color="black"
         )
@@ -159,6 +165,7 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
         stats: pd.DataFrame,
         regs: dict[str, dict[str, any]],
         data_xlabel_units: str,
+        dt_ratio: float,
         output_ptr: str | None = None
     ) -> None:
         """
@@ -183,7 +190,7 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
             for k,v in regs.items()
             if k.startswith("mean")
         )
-        _mean_data = stats[stats.columns[
+        _mean_data = stats[stats.columns[2]]
         cls.plot_single_data_and_model(
             stats.index,
             _mean_data,
@@ -191,7 +198,8 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
             _model["model"],
             r"$r^2$: " + f"{_model['r2']:.2f}",
             data_xlabel_units,
-            plt.gca()
+            plt.gca(),
+            dt_ratio=dt_ratio
         )
 
         plt.title(_mean_data.name)
@@ -205,7 +213,7 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
             for k,v in regs.items()
             if k.startswith("scaled var")
         )
-        _variance_data = stats[stats.columns[
+        _variance_data = stats[stats.columns[3]]
         cls.plot_single_data_and_model(
             stats.index,
             _variance_data,
@@ -213,7 +221,8 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
             lambda x: _model["model"](x) + _variance_data.min(), # Adjust the model to the original variance
             r"$r^2$: " + f"{_model['r2']:.2f}",
             data_xlabel_units,
-            plt.gca()
+            plt.gca(),
+            dt_ratio=dt_ratio
         )
 
         plt.title(_variance_data.name)
@@ -232,6 +241,7 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
             "Poissonian regime",
             data_xlabel_units,
             plt.gca(),
+            dt_ratio=dt_ratio,
             line_linestyle="--",
             line_color="black"
         )
@@ -360,7 +370,6 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
     def compute_stats(self,
         DT: str,
         origin: str,
-        n_threshold: int | None = None,
         mutation_kind: str = "all"
     ) -> pd.DataFrame:
         """
@@ -372,31 +381,37 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
         :type DT: str
         :param origin: The string datetime that will be the origin of the grouping.
         :type origin: str
-        :param n_threshold: Minimum number of sequences required in a time interval to compute statistics.
-        :type n_threshold: int | None
         :param mutation_kind: The kind of mutation to compute the statistics for. Has to be one of ``all``, ``total``, ``substitutions``, ``insertions``, ``deletions`` or ``indels``. Default is ``all``.
         :return: The statistics of the data.
         :rtype: ``pd.DataFrame``
         """
 
-
+        # Create a local copy of the data
+        _data = self.data.copy()
 
-        #
-        if
+        # If the very first row's date is the same as the origin, and there happens to be only one entry for that date, duplicate that row; this way the stats for the first week can be computed (with variance = 0 of course)
+        if _data.iloc[0]["date"] == origin and len(_data[_data["date"] == origin]) == 1:
+            _data = pd.concat([_data, pd.DataFrame([_data.iloc[0]])], ignore_index=True)
+            _data.sort_values(by="date", inplace=True)
+            _data.reset_index(drop=True, inplace=True)
 
-
+        # Group the data by the datetime interval
+        grouped = self.date_grouper(_data, DT, origin)
 
-
-
-            f"No groups with at least {n_threshold} observations. Consider lowering the threshold."
-        )
+        # Only keep weeks where the number of observations is greater than 1
+        _filtered = grouped.filter(lambda x: len(x) >= 2)
 
-
-
-
-            origin
+        if len(_filtered) == 0:
+            raise ValueError(
+                f"No groups with at least 2 observations. Consider widening the time interval."
             )
 
+        grouped = self.date_grouper(
+            _filtered,
+            DT,
+            origin
+        )
+
         levels = [
             f"number of {x}"
             for x in self._mutation_type_switch(mutation_kind)
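The rewritten `compute_stats` drops the `n_threshold` option in favor of a fixed rule: a lone first observation that falls exactly on the origin is duplicated, the data are grouped by the requested time interval, and only windows with at least two observations are kept. A minimal sketch of that flow in plain pandas (the toy data, the `7D` frequency, and the direct use of `pd.Grouper` are illustrative assumptions; the package's own `date_grouper` helper may differ):

```python
# Sketch of the new compute_stats grouping logic using plain pandas.
import pandas as pd

df = pd.DataFrame({
    "date": pd.to_datetime([
        "2021-01-04", "2021-01-11", "2021-01-12", "2021-01-18", "2021-01-19",
    ]),
    "number of substitutions": [1, 2, 3, 3, 5],
})
origin = pd.Timestamp("2021-01-04")

# If the very first date equals the origin and occurs only once, duplicate it
# so the first window has at least two rows (its variance is then 0).
if df.iloc[0]["date"] == origin and (df["date"] == origin).sum() == 1:
    df = pd.concat([df, df.iloc[[0]]], ignore_index=True).sort_values("date")

# Group into 7-day windows anchored at the origin
grouper = pd.Grouper(key="date", freq="7D", origin=origin)

# Keep only windows with at least 2 observations, then regroup the survivors
filtered = df.groupby(grouper).filter(lambda g: len(g) >= 2)
if filtered.empty:
    raise ValueError("No groups with at least 2 observations. Consider widening the time interval.")

weekly = filtered.groupby(grouper)["number of substitutions"].agg(["mean", "var", "size"])
print(weekly)
```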
@@ -416,10 +431,10 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
 
     def analysis(self,
         length: int,
-        n_threshold: int | None = None,
         show: bool = False,
         mutation_kind: str = "all",
-        export_plots_filename: str | None = None
+        export_plots_filename: str | None = None,
+        confidence_level: float = 0.95
     ) -> tuple[pd.DataFrame, dict[str,dict[str,any]]]:
         """
         Perform the global analysis of the data.
@@ -428,13 +443,14 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
 
         :param length: The length to filter by.
         :type length: int
-        :param n_threshold: Minimum number of sequences required in a time interval to compute statistics.
         :param show: Whether to show the plots or not. Default is False.
         :type show: bool
         :param mutation_kind: The kind of mutation to compute the statistics for. Has to be one of ``all``, ``total``, ``substitutions`` or ``indels``. Default is ``all``.
         :type mutation_kind: str
-        :param
-        :type
+        :param export_plots_filename: Filename to export the plots. Default is None and does not export the plots.
+        :type export_plots_filename: str | None
+        :param confidence_level: Confidence level for parameter confidence intervals (default 0.95 for 95% CI).
+        :type confidence_level: float
         :return: The statistics and the regression models.
         :rtype: ``tuple[pd.DataFrame, dict[str, dict[str, any]]]``
         """
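The new `confidence_level` argument is propagated down to the fitting routines. As a rough orientation, the construction below is the standard way such a level is usually turned into parameter confidence intervals after a least-squares fit; the diff does not show how `linear_regression` computes its intervals internally, so this is only an assumption:

```python
# Sketch of turning a confidence_level into a parameter confidence interval
# from the fit covariance; PyEvoMotion's own implementation is not shown here.
import numpy as np
from scipy import optimize, stats as st

def fit_with_ci(x, y, confidence_level=0.95):
    linear = lambda x, m: m * x
    popt, pcov = optimize.curve_fit(linear, x, y)
    dof = max(len(x) - len(popt), 1)                  # degrees of freedom
    tval = st.t.ppf(0.5 + confidence_level / 2, dof)  # two-sided t quantile
    stderr = np.sqrt(np.diag(pcov))
    return {
        "parameters": {"m": popt[0]},
        "confidence_intervals": {"m": (popt[0] - tval * stderr[0], popt[0] + tval * stderr[0])},
    }

x = np.arange(10, dtype=float)
y = 0.8 * x + np.random.default_rng(0).normal(scale=0.2, size=10)
print(fit_with_ci(x, y, confidence_level=0.95))
```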
@@ -447,54 +463,92 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
         stats = self.compute_stats(
             self.dt,
             self.origin,
-            n_threshold,
             mutation_kind
         )
 
+        # Get weights for weighted fitting
+        weights = stats["size"]
 
         regs = {}
         # For each column in the statistics (except the date and the size), compute the corresponding regression model
         for col in stats.columns[1:-1]:
             if col.startswith("mean"):
                 _single_regression = {
-                    f"{col}
+                    f"{col} model": self.linear_regression(
                         *self._remove_nan(
                             stats.index, # Regression is given by the index, so in time, it is the same as multiplying by dt days
-                            stats[col]
-
+                            stats[col],
+                            weights
+                        ),
+                        confidence_level=confidence_level
                     )
                 }
             elif col.startswith("var"):
-
+                _adjust_result = self.adjust_model(
                     stats.index,
                     stats[col] - stats[col].min(),
-                    name=f"scaled {col}
+                    name=f"scaled {col} model",
+                    weights=weights.to_numpy().flatten(),
+                    confidence_level=confidence_level
                 )
+                # Extract the selected model for backward compatibility while preserving all model info
+                model_name = f"scaled {col} model"
+                full_result = _adjust_result[model_name]
+                selected_model = full_result["selected_model"]
+
+                # Store both the selected model (for backward compatibility) and full results
+                _single_regression = {
+                    model_name: selected_model,
+                    f"{model_name}_full_results": full_result
+                }
             # Save the regression model
             regs.update(_single_regression)
 
+        # Add scaling correction to the regression models
+        for k, v in regs.items():
+            # Skip full results entries - we'll handle them separately
+            if k.endswith("_full_results"):
+                continue
+
+            # Use the helper method for scaling correction
+            self._apply_scaling_correction_to_model(v)
+
+        # Apply scaling correction to all models in full results
+        for k, v in regs.items():
+            if k.endswith("_full_results"):
+                # Apply scaling to selected model
+                self._apply_scaling_correction_to_model(v["selected_model"])
+                # Apply scaling to linear model
+                self._apply_scaling_correction_to_model(v["linear_model"])
+                # Apply scaling to power law model
+                self._apply_scaling_correction_to_model(v["power_law_model"])
+
         # Sets of mutation types used in the analysis
         _sets = sorted({
             " ".join(x.split()[1:])
             for x in stats.columns[1:-1]
         })
 
+        stats["dt_idx"] = (stats["date"] - stats["date"].min()) / pd.Timedelta("7D")
+
         # Plot the results
         if show:
             # For each set of mutation types
             for _type in _sets:
                 self.plot_results(
-                    stats[["date", f"mean {_type}", f"var {_type}"]],
+                    stats[["date", "dt_idx", f"mean {_type}", f"var {_type}"]],
                     {
                         k: v
                         for k, v in regs.items()
                         if k in (
-                            f"mean {_type}
-                            f"scaled var {_type}
+                            f"mean {_type} model",
+                            f"scaled var {_type} model"
                         )
                     },
-
+                    "wk",
+                    self.dt_ratio
                 )
+
         # Export the plots
         if export_plots_filename:
             # Open pdf file pointer
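Two ideas drive the rewritten fitting loop: each time window is weighted by its number of sequences (`stats["size"]`), and for the variance both a linear and a power-law model are fitted, with the selected one stored alongside the full results. A hedged sketch of what such weighted fitting and model selection can look like (the `sigma = 1/sqrt(n)` weighting and the R²-based choice are assumptions for illustration, not necessarily PyEvoMotion's internal criteria):

```python
# Sketch of weighted fitting plus linear-vs-power-law selection.
import numpy as np
from scipy.optimize import curve_fit

def r_squared(y, y_hat):
    ss_res = np.sum((y - y_hat) ** 2)
    ss_tot = np.sum((y - np.mean(y)) ** 2)
    return 1 - ss_res / ss_tot

def fit_models(x, y, weights):
    sigma = 1 / np.sqrt(weights)  # more sequences in a window -> smaller assumed uncertainty
    linear = lambda x, m: m * x
    power = lambda x, d, alpha: d * x ** alpha

    (m,), _ = curve_fit(linear, x, y, sigma=sigma)
    (d, alpha), _ = curve_fit(power, x, y, sigma=sigma, p0=(1.0, 1.0), maxfev=10_000)

    linear_model = {"expression": "mx", "parameters": {"m": m},
                    "r2": r_squared(y, linear(x, m))}
    power_model = {"expression": "d*x^alpha", "parameters": {"d": d, "alpha": alpha},
                   "r2": r_squared(y, power(x, d, alpha))}
    selected = max((linear_model, power_model), key=lambda mdl: mdl["r2"])
    return {"selected_model": selected, "linear_model": linear_model, "power_law_model": power_model}

x = np.arange(1, 11, dtype=float)
y = 0.5 * x ** 1.4               # superlinear (anomalous) growth of the variance
weights = np.full_like(x, 20)    # e.g. 20 sequences per window
print(fit_models(x, y, weights)["selected_model"]["expression"])
```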
@@ -502,19 +556,58 @@ class PyEvoMotion(PyEvoMotionParser, PyEvoMotionBase):
             # For each set of mutation types save the plots
             for _type in _sets:
                 self.export_plot_results(
-                    stats[["date", f"mean {_type}", f"var {_type}"]],
+                    stats[["date", "dt_idx", f"mean {_type}", f"var {_type}"]],
                     {
                         k: v
                         for k, v in regs.items()
                         if k in (
-                            f"mean {_type}
-                            f"scaled var {_type}
+                            f"mean {_type} model",
+                            f"scaled var {_type} model"
                         )
                     },
-
+                    "wk",
+                    self.dt_ratio,
                     pdf
                 )
             # Close pdf file pointer
             pdf.close()
 
         return stats, regs
+
+    def _apply_scaling_correction_to_model(self, model: dict[str, any]) -> None:
+        """Apply scaling correction to a single model dictionary.
+
+        :param model: The model dictionary to apply scaling correction to
+        :type model: dict[str, any]
+        """
+        if model["expression"] == "mx + b":
+            m = model["parameters"]["m"]
+            b = model["parameters"]["b"]
+            model["parameters"]["m"] = m/self.dt_ratio
+            m = model["parameters"]["m"]
+            model["model"] = lambda x: m*x + b
+            # Update confidence intervals to match scaled parameters
+            if "confidence_intervals" in model:
+                m_ci_lower, m_ci_upper = model["confidence_intervals"]["m"]
+                model["confidence_intervals"]["m"] = (m_ci_lower/self.dt_ratio, m_ci_upper/self.dt_ratio)
+        elif model["expression"] == "mx":
+            m = model["parameters"]["m"]
+            model["parameters"]["m"] = m/self.dt_ratio
+            m = model["parameters"]["m"]
+            model["model"] = lambda x: m*x
+            # Update confidence intervals to match scaled parameters
+            if "confidence_intervals" in model:
+                m_ci_lower, m_ci_upper = model["confidence_intervals"]["m"]
+                model["confidence_intervals"]["m"] = (m_ci_lower/self.dt_ratio, m_ci_upper/self.dt_ratio)
+        elif model["expression"] == "d*x^alpha":
+            d = model["parameters"]["d"]
+            alpha = model["parameters"]["alpha"]
+            model["parameters"]["d"] = d/(self.dt_ratio**alpha)
+            d = model["parameters"]["d"]
+            model["model"] = lambda x: d*(x**alpha)
+            # Update confidence intervals to match scaled parameters
+            if "confidence_intervals" in model:
+                d_ci_lower, d_ci_upper = model["confidence_intervals"]["d"]
+                model["confidence_intervals"]["d"] = (d_ci_lower/(self.dt_ratio**alpha), d_ci_upper/(self.dt_ratio**alpha))
+
+
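The new `_apply_scaling_correction_to_model` helper rescales fitted parameters from per-window-index units to the plotted time axis: a linear slope is divided by `dt_ratio`, and the power-law prefactor `d` is divided by `dt_ratio**alpha`, which is exactly the substitution x -> x/dt_ratio. A small self-contained check of that algebra (the interpretation of `dt_ratio` as "plot-axis units per window index" is an assumption drawn from the shown code):

```python
# Check of the unit rescaling performed by _apply_scaling_correction_to_model.
# If the fit was done against the window index n and the plot axis uses
# t = dt_ratio * n, rewriting the model in terms of t gives:
#   m*n       ->  (m / dt_ratio) * t
#   d*n**a    ->  (d / dt_ratio**a) * t**a
dt_ratio = 2.0          # e.g. a 14-day window plotted on a weekly axis
d, alpha = 0.5, 1.4     # power-law parameters fitted against the index

d_scaled = d / dt_ratio ** alpha

n = 3                   # third window
t = dt_ratio * n        # same instant expressed on the plot axis
assert abs(d * n ** alpha - d_scaled * t ** alpha) < 1e-12
```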
PyEvoMotion/core/parser.py
CHANGED
@@ -153,11 +153,14 @@ class PyEvoMotionParser():
                 mod
                 for mod in x
                 if start - 1 < int(mod.split("_")[1]) < end
-            ]
+            ] if x else ["NO_MUTATION"]
         )
         self.data = self.data[
             self.data["mutation instructions"].apply(len) > 0
         ]
+        self.data["mutation instructions"] = self.data["mutation instructions"].apply(
+            lambda x: [] if x == ["NO_MUTATION"] else x
+        )
 
     def filter_columns(self, filters: dict[str, list[str] | str]) -> None:
         """
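The parser change keeps sequences that carry no mutations in the selected region: an empty instruction list is temporarily replaced by a `"NO_MUTATION"` placeholder so the length filter does not discard the row, and the placeholder is mapped back to an empty list afterwards. A toy illustration (the mutation string format is made up for the example):

```python
# Why the "NO_MUTATION" placeholder is needed: the length filter would
# otherwise silently drop sequences that simply have no mutations.
import pandas as pd

data = pd.DataFrame({
    "id": ["seq1", "seq2"],
    "mutation instructions": [["A_123_T"], []],   # seq2 has no mutations
})

# Old behavior: empty lists are removed by the length filter
old = data[data["mutation instructions"].apply(len) > 0]
assert list(old["id"]) == ["seq1"]

# New behavior: keep a placeholder through the filter, then restore []
data["mutation instructions"] = data["mutation instructions"].apply(
    lambda x: x if x else ["NO_MUTATION"]
)
new = data[data["mutation instructions"].apply(len) > 0].copy()
new["mutation instructions"] = new["mutation instructions"].apply(
    lambda x: [] if x == ["NO_MUTATION"] else x
)
assert list(new["id"]) == ["seq1", "seq2"]
```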
{pyevomotion-0.1.0.dist-info → pyevomotion-0.1.2.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: PyEvoMotion
-Version: 0.1.0
+Version: 0.1.2
 Summary: Evolutionary motion analysis tool
 Keywords: evolution,anomalous diffusion,bioinformatics
 Author: Lucas Goiriz
@@ -27,7 +27,7 @@ _(See [Goiriz L, et al.](http://doi.org/10.1073/pnas.2303578120))_
 ## Installation
 
 > **Note:**
-> `PyEvoMotion` uses [mafft](https://mafft.cbrc.jp/alignment/software/) to do the sequence alignment. If it
+> `PyEvoMotion` uses [mafft](https://mafft.cbrc.jp/alignment/software/) to do the sequence alignment. If it's not available in your system, on the first run of `PyEvoMotion`, it will ask to install it locally.
 >
 > If so, ensure to restart your shell session or run `source ~/.bashrc` to update the PATH environment variable, so that the `mafft` executable is available in your shell.
 >
@@ -74,8 +74,6 @@ options:
   -ep, --export_plots   Export the plots of the analysis.
   -l LENGTH_FILTER, --length_filter LENGTH_FILTER
                         Length filter for the sequences (removes sequences with length less than the specified value). Default is 0.
-  -n N_THRESHOLD, --n_threshold N_THRESHOLD
-                        Minimum number of sequences required in a time interval to compute statistics. Default is 2.
   -xj, --export_json    Export the run arguments to a json file.
   -ij IMPORT_JSON, --import_json IMPORT_JSON
                         Import the run arguments from a JSON file. If this argument is passed, the other arguments are ignored. The JSON file must contain the mandatory keys 'seqs', 'meta', and 'out'.
@@ -114,4 +112,74 @@ pytest
 > Given the size of the test data, this may take a while.
 
 
+## Docker
 
+A Docker image containing a virtual environment with `PyEvoMotion` pre-installed, its dependencies, and the test data is available at `ghcr.io/luksgrin/pyevomotion:latest`; the manuscript's original figure script is available at `ghcr.io/luksgrin/pyevomotion-fig:latest`.
+
+Pull the image by running:
+
+```bash
+docker pull ghcr.io/luksgrin/pyevomotion:latest
+```
+
+Alternatively, to build the main image, run:
+
+```bash
+docker build -t ghcr.io/luksgrin/pyevomotion:latest -f docker/Dockerfile
+```
+
+### Running the container
+
+To start an interactive container:
+
+```bash
+docker run -it ghcr.io/luksgrin/pyevomotion:latest
+```
+
+This will open a prompt that displays a welcome message and allows you to start using `PyEvoMotion` right away.
+
+### Included data
+
+The image includes (heavy) input files (FASTA and metadata) in:
+
+```bash
+/home/pyevomotion/pyevomotion-*/tests/data/test3
+```
+
+which are used by the test suite (they are automatically downloaded and extracted if not present, so using the containerized version is more convenient).
+
+Also, the source script for figure generation (along with the pre-generated results of running `PyEvoMotion`) is already available under:
+
+```bash
+/home/pyevomotion/pyevomotion-*/share
+```
+
+Do note that if all the contents within
+
+```bash
+/home/pyevomotion/pyevomotion-*/share
+```
+
+are deleted except for the `manuscript_figure.py` script, it is still possible to generate the figure (although it will take much longer, since the dataset's stats must be computed by `PyEvoMotion`).
+
+### Running tests
+
+Once inside the container, run:
+
+```bash
+cd pyevomotion-*
+pytest
+```
+
+This will execute the test suite included with the source.
+
+### Reproducing the Figure from the original manuscript
+
+To reproduce the figure from the original manuscript, run:
+
+```bash
+cd pyevomotion-*
+python share/manuscript_figure.py export
+```
+
+The figure will be saved in the `share` directory. Font warnings may appear; they are safe to ignore and do not affect the scientific content of the figure, only the styling.
pyevomotion-0.1.2.dist-info/RECORD
ADDED

@@ -0,0 +1,35 @@
+PyEvoMotion/__init__.py,sha256=NqFDD-EZBzouzTwXozZqhPC9sLr7GQaElRKtP0tkHoE,568
+PyEvoMotion/cli.py,sha256=GnTBJDlKjXEtvqhT9bZxEHl-tq4e0QZYqWticCXklo0,18885
+PyEvoMotion/core/__init__.py,sha256=1I-NkFFh6ljLgB_mqQVFLNvCrVKEHLVxa_5dsv3ihWQ,450
+PyEvoMotion/core/base.py,sha256=L_uabRqGgAQy3mXs4QfzE05RuCz-my8ZJcTglsMAg7E,27931
+PyEvoMotion/core/core.py,sha256=RHkIoIYIfteA_zrKrLF9-XemPcenl_BSbHcCwz6Sg-M,22737
+PyEvoMotion/core/parser.py,sha256=w23KzX0jl3NLS0WYjAY1s_2VFEqfn6EoTrQXmGRRXfg,17323
+PyEvoMotion/utils.py,sha256=Ye3eL1RXZOZzzs2KZy0R45u06DOtLYo-zqE45tN2t7g,2859
+share/analyze_model_selection_accuracy.py,sha256=OnGKbmI515bIRdpYMNHGQ9SlZGmVQZi_tFnAX4g2Iyw,12846
+share/analyze_test_runs.py,sha256=AXdz-TdyK7DO1iT_FWrYsONYMs-2HYst_7fofcZ8wxQ,15534
+share/anomalous_diffusion.pdf,sha256=fWUvoxB2J9JRCRRjYEXtPNfJpR3ajbGfmCBiA5_-nzs,19384
+share/confusion_matrix_heatmap.pdf,sha256=GHfupvVgHF4msQjVohc_5KWXmVsPZDueml5zb4sL4Zo,23108
+share/figUK.tsv,sha256=DnPkVfbMGfsOq9x8IAkbPzIWsmQBB1hOXChSNkNlHGo,6711825
+share/figUK_plots.pdf,sha256=6PQw3ujMN0bfj8s8DMstadyoIYna3a2Upig-W1yHWOc,22664
+share/figUK_regression_results.json,sha256=Q-WpaqxNv7O7ZNKtnt9qmqEAifpmmRnarpT0zLCKAR4,1923
+share/figUK_run_args.json,sha256=k5NFbR0YNFBQ7M6TNpLT6G2vLml27iu8DDwGmwVWKqM,344
+share/figUK_stats.tsv,sha256=_gRc1-GYlGp4P7iUWtuiTeDy6cvYVbTX6SrXYUaNF-g,2252
+share/figUSA.tsv,sha256=hSPmKjCFk0a5B0XN75JhQOuhZgfcz9ZCxNrfqMrYcF0,6577552
+share/figUSA_plots.pdf,sha256=OsAseOotQGmFljxvgyqr4NGa4PXDPWo9UFAT47cjc-k,22066
+share/figUSA_regression_results.json,sha256=xgXn3paIYmZ2mEAunx-VKqAaMKotIz2CT8zDiHWlJt4,1918
+share/figUSA_run_args.json,sha256=N5ZBL9W0OcJEyMqcBq2dxSPr9vQUJsnQfflQMdTzmRw,347
+share/figUSA_stats.tsv,sha256=-lC1Gk_t4nB5vQXTjvhAfFS34ILjtkVEILarcDGWHZA,1913
+share/figdataUK.tsv,sha256=HMF07FNT7d3Tb2OMHuFYkRzc6vb5EQ6vj2nJBpXlXJ8,939837
+share/figdataUSA.tsv,sha256=z5yaIwcyfLo7Wr5ioE-x6_qXg9IhT_CmAJxcLTfP4jA,827811
+share/generate_sequences_from_synthdata.py,sha256=_2IdMgcOB7SxAq30iypA-VypSmZyZmMhA8otKQnkfAw,3443
+share/generate_sequences_from_test5_data.py,sha256=H1J4FQgndTSrRbXqEzaHMFI2JGX9oWDhwNnU7uwu534,4127
+share/mafft_install.sh,sha256=pCw70UsKkkNXUsZMwQlQ2b4zSXFrBA7jAj9iOfGLzUw,1007
+share/manuscript_figure.py,sha256=JVwFc236-KtZnmkve0PrPcmMPIeg94jUOaZGq7Z6_QM,41109
+share/run_parallel_analysis.py,sha256=D67RG0ze8xikOkOVeWm716ZNXUwaHR3O7flBioubgRg,6750
+share/synth_figure.pdf,sha256=cqjXjnd0Q14p6NQHLr58vprTot8edKvaWDoq6ZNTkis,26350
+share/uk_time_windows.pdf,sha256=JioGh4bHv8VROvGnySuJ0r7VAtr9ykWeH2M9eAinpSk,35022
+share/weekly_size.pdf,sha256=_YKsCKln8wpgNNwuu9_mFRQfkv4r0g3gLVaVK76MIj8,14602
+pyevomotion-0.1.2.dist-info/METADATA,sha256=Da84PseCS2zeTtQHMJArt584_k_jZLbNh7cCSISh7PI,7833
+pyevomotion-0.1.2.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
+pyevomotion-0.1.2.dist-info/entry_points.txt,sha256=UMzoojYwQi-713hRggkQXUIfGNygUARhTdGs77Usp7s,53
+pyevomotion-0.1.2.dist-info/RECORD,,