PyEvoMotion 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyEvoMotion/__init__.py +11 -0
- PyEvoMotion/cli.py +440 -0
- PyEvoMotion/core/__init__.py +7 -0
- PyEvoMotion/core/base.py +406 -0
- PyEvoMotion/core/core.py +520 -0
- PyEvoMotion/core/parser.py +467 -0
- PyEvoMotion/utils.py +87 -0
- pyevomotion-0.1.0.dist-info/METADATA +117 -0
- pyevomotion-0.1.0.dist-info/RECORD +13 -0
- pyevomotion-0.1.0.dist-info/WHEEL +4 -0
- pyevomotion-0.1.0.dist-info/entry_points.txt +3 -0
- share/mafft_install.sh +44 -0
- share/manuscript_figure.py +316 -0
@@ -0,0 +1,467 @@
|
|
1
|
+
from Bio import SeqIO, AlignIO
|
2
|
+
from Bio.SeqRecord import SeqRecord
|
3
|
+
from Bio.Align import MultipleSeqAlignment
|
4
|
+
|
5
|
+
import numpy as np
|
6
|
+
import pandas as pd
|
7
|
+
from io import StringIO
|
8
|
+
from itertools import groupby
|
9
|
+
from datetime import datetime
|
10
|
+
from operator import itemgetter
|
11
|
+
from subprocess import Popen, PIPE
|
12
|
+
|
13
|
+
|
14
|
+
class PyEvoMotionParser():
|
15
|
+
"""
|
16
|
+
This class is responsible for parsing the input fasta and metadata files. It is inherited by the :class:`PyEvoMotion` class.
|
17
|
+
|
18
|
+
:param input_fasta: The path to the input ``.fasta`` file.
|
19
|
+
:type input_fasta: str
|
20
|
+
:param input_meta: The path to the input metadata file. The metadata file must contain a ``date`` column and can be in either ``.csv`` or ``.tsv`` format.
|
21
|
+
:type input_meta: str
|
22
|
+
:param filters: The filters to be applied to the data. The keys are the column names and the values are the values to be filtered.
|
23
|
+
:type filters: dict[str, list[str] | str]
|
24
|
+
:param positions: The start and end positions to filter the data by.
|
25
|
+
:type positions: tuple[int]
|
26
|
+
:param date_range: The start and end dates to filter the data by. If ``None``, the date range is not filtered.
|
27
|
+
:type date_range: tuple[datetime] | None
|
28
|
+
|
29
|
+
On construction, it invokes the following methods:
|
30
|
+
- :meth:`parse_metadata`: Parses the metadata file.
|
31
|
+
- :meth:`parse_sequence_by_id`: Parses the sequence with the ID of the first entry in the metadata file.
|
32
|
+
- :meth:`filter_columns`: Filters the metadata based on the input filters.
|
33
|
+
- :meth:`filter_by_daterange`: Filters the metadata based on the input date range.
|
34
|
+
- :meth:`parse_data`: Parses the input fasta file and appends the mutations that differ between the reference sequence and the input sequences to the metadata.
|
35
|
+
- :meth:`filter_by_position`: Filters the metadata based on the input start and end positions.
|
36
|
+
|
37
|
+
Attributes:
|
38
|
+
-----------
|
39
|
+
data: ``pd.DataFrame``
|
40
|
+
DataFrame containing metadata.
|
41
|
+
reference: ``SeqRecord``
|
42
|
+
The reference sequence parsed from the fasta file.
|
43
|
+
"""
|
44
|
+
|
45
|
+
def __init__(self,
    input_fasta: str,
    input_meta: str,
    filters: dict[str, list[str] | str],
    positions: tuple[int],
    date_range: tuple[datetime] | None
) -> None:
    """
    Builds the parser state from a FASTA file and its metadata table.

    The steps run in a fixed order: the metadata is loaded, the sequence whose ID appears in the first metadata row is taken as reference, the column and date filters are applied, the mutations of the remaining sequences are computed, and finally the position filter is applied.

    :param input_fasta: The path to the input FASTA file.
    :type input_fasta: str
    :param input_meta: The path to the input metadata file.
    :type input_meta: str
    :param filters: Column filters; keys are column names and values are the accepted value(s).
    :type filters: dict[str, list[str] | str]
    :param positions: The start and end positions forwarded to :meth:`filter_by_position`.
    :type positions: tuple[int]
    :param date_range: The start and end dates forwarded to :meth:`filter_by_daterange`. A falsy value skips the date filter.
    :type date_range: tuple[datetime] | None
    """

    metadata = self.parse_metadata(input_meta)
    self.data = metadata

    # The reference is whichever record matches the "id" of the first metadata row
    reference_id = metadata.iloc[0]["id"]
    self.reference = self.parse_sequence_by_id(input_fasta, reference_id)

    # Row filters run before alignment so only the surviving ids are aligned
    self.filter_columns(filters)
    if date_range:
        self.filter_by_daterange(*date_range)

    # Attach the per-sequence mutation instructions to the metadata
    remaining_ids = self.data["id"]
    self.parse_data(input_fasta, remaining_ids)

    # Restrict the mutation instructions to the requested window
    self.filter_by_position(*positions)
|
80
|
+
|
81
|
+
def parse_data(self, input_fasta: str, selection: pd.Series) -> None:
    """
    Computes the mutations of the selected sequences and joins them onto the ``data`` attribute.

    The join is a left merge on the ``id`` column, after which the index is reset.

    :param input_fasta: The path to the input ``.fasta`` file.
    :type input_fasta: str
    :param selection: The sequence ids to be parsed.
    :type selection: pd.Series
    """

    mutations = self.get_differing_mutations(input_fasta, selection)

    # how="left" keeps every metadata row, even those without a computed mutation entry
    merged = self.data.merge(mutations, on="id", how="left")

    self.data = merged.reset_index(drop=True)
|
100
|
+
|
101
|
+
def filter_by_daterange(self, start: datetime, end: datetime) -> None:
|
102
|
+
"""
|
103
|
+
Filter the data based on a date range.
|
104
|
+
|
105
|
+
The data is filtered to only include entries with dates between the start and end dates. This method modifies the ``data`` attribute in place.
|
106
|
+
|
107
|
+
:param start: The start date.
|
108
|
+
:type start: datetime
|
109
|
+
:param end: The end date.
|
110
|
+
:type end: datetime
|
111
|
+
:raises ValueError: If the start date is greater than the end date.
|
112
|
+
"""
|
113
|
+
|
114
|
+
start = (
|
115
|
+
max(self.data["date"].min(), start) if start
|
116
|
+
else self.data["date"].min()
|
117
|
+
)
|
118
|
+
end = (
|
119
|
+
min(self.data["date"].max(), end)
|
120
|
+
if end else self.data["date"].max()
|
121
|
+
)
|
122
|
+
|
123
|
+
if start > end:
|
124
|
+
raise ValueError("Start date must be smaller than end date")
|
125
|
+
|
126
|
+
self.data = self.data[
|
127
|
+
(self.data["date"] >= start) & (self.data["date"] <= end)
|
128
|
+
]
|
129
|
+
|
130
|
+
def filter_by_position(self, start: int, end: int) -> None:
|
131
|
+
"""
|
132
|
+
Filter the data based on some start and end positions in the reference sequence.
|
133
|
+
|
134
|
+
*Note that the positions are 1-indexed, and that the end position is inclusive.*
|
135
|
+
|
136
|
+
:param start: The start position index.
|
137
|
+
:type start: int
|
138
|
+
:param end: The end position index.
|
139
|
+
:type end: int
|
140
|
+
:raises ValueError: If the start position is greater than the end position., or if the start position is greater than the length of the reference sequence.
|
141
|
+
"""
|
142
|
+
|
143
|
+
start = max(1, start) # Ensure start is at least 1
|
144
|
+
end = end if end > 0 else len(self.reference.seq) + 1 # Set end if not provided
|
145
|
+
|
146
|
+
if start >= end:
|
147
|
+
raise ValueError("Start position must be smaller than end position")
|
148
|
+
elif start > len(self.reference.seq):
|
149
|
+
raise ValueError("Start position is out of range")
|
150
|
+
|
151
|
+
self.data["mutation instructions"] = self.data["mutation instructions"].apply(
|
152
|
+
lambda x: [
|
153
|
+
mod
|
154
|
+
for mod in x
|
155
|
+
if start - 1 < int(mod.split("_")[1]) < end
|
156
|
+
]
|
157
|
+
)
|
158
|
+
self.data = self.data[
|
159
|
+
self.data["mutation instructions"].apply(len) > 0
|
160
|
+
]
|
161
|
+
|
162
|
+
def filter_columns(self, filters: dict[str, list[str] | str]) -> None:
|
163
|
+
"""
|
164
|
+
Filter the data based on the input filters provided.
|
165
|
+
|
166
|
+
:param filters: The filters to be applied to the data. The keys are the column names and the values are the values to be filtered from the provided metadata.
|
167
|
+
:type filters: dict[str, list[str] | str]
|
168
|
+
"""
|
169
|
+
|
170
|
+
# Only keep filters that are columns in the data
|
171
|
+
_filters = {
|
172
|
+
k: v
|
173
|
+
for k,v in filters.items()
|
174
|
+
if k in self.data.columns
|
175
|
+
}
|
176
|
+
|
177
|
+
for col, vals in _filters.items():
|
178
|
+
if isinstance(vals, str):
|
179
|
+
vals = [vals]
|
180
|
+
regex_pattern = "|".join(
|
181
|
+
val.replace('*', '.*')
|
182
|
+
for val in vals
|
183
|
+
)
|
184
|
+
self.data = self.data[
|
185
|
+
self.data[col]
|
186
|
+
.str.contains(
|
187
|
+
regex_pattern,
|
188
|
+
regex=True
|
189
|
+
)
|
190
|
+
]
|
191
|
+
|
192
|
+
@staticmethod
|
193
|
+
def _get_consecutives(data: list[int]) -> list[list[int]]:
|
194
|
+
"""
|
195
|
+
Groups list of ordered integers into list of groups of integers
|
196
|
+
|
197
|
+
:param data: a list of ordered integers.
|
198
|
+
:type data: list[int]
|
199
|
+
:return idxs: a list of lists of consecutive integers.
|
200
|
+
:rtype: list[list[int]]
|
201
|
+
"""
|
202
|
+
idxs = []
|
203
|
+
|
204
|
+
for _, g in groupby(
|
205
|
+
enumerate(data),
|
206
|
+
lambda x: x[0] - x[1]
|
207
|
+
):
|
208
|
+
idxs.append(list(map(itemgetter(1), g)))
|
209
|
+
|
210
|
+
return idxs
|
211
|
+
|
212
|
+
@staticmethod
|
213
|
+
def _column_decision(col:np.array) -> int:
|
214
|
+
"""
|
215
|
+
Classifies bases in array column
|
216
|
+
|
217
|
+
:param col: column with two rows, containing one of these symbols: A, G, T, C, N, -
|
218
|
+
:type col: np.array
|
219
|
+
:return: returns an integer indicating if match (0), mismatch (1), insertion (2) or deletion (3).
|
220
|
+
:rtype: int
|
221
|
+
"""
|
222
|
+
|
223
|
+
# If there's a match, return 0. In the case that there's an insertion of an N, we also return 0 as it's probably a sequencing error
|
224
|
+
if ((col[0] == col[1]) or ("N" in col)): return 0
|
225
|
+
|
226
|
+
# If there's an insertion, return 2
|
227
|
+
elif col[0] == "-": return 2
|
228
|
+
|
229
|
+
# If there's a deletion, return 3
|
230
|
+
elif col[1] == "-": return 3
|
231
|
+
|
232
|
+
# Else, it has to be a mismatch. Return 1
|
233
|
+
else: return 1
|
234
|
+
|
235
|
+
@classmethod
|
236
|
+
def create_modifs(cls, alignment: MultipleSeqAlignment) -> list[str]:
|
237
|
+
"""
|
238
|
+
Creates a guide on how to modify the root sequence to get the appropriate mutant sequence.
|
239
|
+
|
240
|
+
:param alignment: ``Bio.Align.MultipleSeqAlignment`` object containing the alignment.
|
241
|
+
:type alignment: MultipleSeqAlignment
|
242
|
+
:return mods: List of modifications encoded as strings that have the format ``<type>_<position>_<bases>``, where ``<type>`` is one of ``i``, ``d`` and ``s`` (insertion, deletion and substitution), ``<position>`` is the position in the sequence where the modification should be made, and ``<bases>`` are the bases to be inserted, deleted or substituted.
|
243
|
+
:rtype: ``list[str]``
|
244
|
+
"""
|
245
|
+
|
246
|
+
# Turn alignment into np.array
|
247
|
+
coding = np.array([
|
248
|
+
list(alignment[0].upper()),
|
249
|
+
list(alignment[1].upper())
|
250
|
+
])
|
251
|
+
|
252
|
+
# Get classification
|
253
|
+
clsMut = np.apply_along_axis(cls._column_decision, 0, coding)
|
254
|
+
|
255
|
+
# Encode substitutions
|
256
|
+
subst = list(map(
|
257
|
+
lambda x: f"s_{x + 1}_{coding[1, x]}",
|
258
|
+
list(np.where(clsMut == 1)[0])
|
259
|
+
))
|
260
|
+
|
261
|
+
# Encode insertions
|
262
|
+
insertions = list(map(
|
263
|
+
lambda x: f"i_{x[0]}_{''.join(coding[1, x])}",
|
264
|
+
cls._get_consecutives(
|
265
|
+
list(np.where(clsMut == 2)[0])
|
266
|
+
)
|
267
|
+
))
|
268
|
+
|
269
|
+
# Encode deletions
|
270
|
+
deletions = list(map(
|
271
|
+
lambda x: f"d_{x[0]}_{''.join(coding[0, x])}",
|
272
|
+
cls._get_consecutives(
|
273
|
+
list(np.where(clsMut == 3)[0])
|
274
|
+
)
|
275
|
+
))
|
276
|
+
|
277
|
+
# Blend modifications and order them
|
278
|
+
mods = sorted(
|
279
|
+
insertions + deletions + subst,
|
280
|
+
key=lambda x: int(x.split("_")[1])
|
281
|
+
)
|
282
|
+
|
283
|
+
reindex = [
|
284
|
+
(mods.index(el),len(el.split("_")[-1]))
|
285
|
+
for el in mods
|
286
|
+
if el.startswith("i")
|
287
|
+
]
|
288
|
+
|
289
|
+
for idx,v in reindex:
|
290
|
+
mods[idx + 1:] = list(map(
|
291
|
+
lambda x: "_".join((
|
292
|
+
x.split("_")[0],
|
293
|
+
str(int(x.split("_")[1]) - v),
|
294
|
+
x.split("_")[-1]
|
295
|
+
)),
|
296
|
+
mods[idx + 1:]
|
297
|
+
))
|
298
|
+
|
299
|
+
return mods
|
300
|
+
|
301
|
+
def get_differing_mutations(self, input_fasta: str, selection: pd.Series) -> pd.DataFrame:
    """
    Return the mutations that differ between the reference sequence and the input sequence.

    Also, for the sake of sequence selection, it outputs the number of ``N`` found in the sequence.

    Every record of the FASTA file whose id is in ``selection`` is aligned against ``self.reference``. If an id occurs more than once in the file, the last occurrence wins.

    :param input_fasta: The path to the input ``.fasta`` file.
    :type input_fasta: str
    :param selection: The selection of sequence ids to be compared with the reference sequence.
    :type selection: pd.Series
    :return: The mutations that differ between the reference sequence and the input sequence. It contains the columns ``id``, ``mutation instructions`` and ``N count`` (the number of ``N`` in the sequence).
    :rtype: ``pd.DataFrame``
    """

    # Build the lookup once: a set gives constant-time membership tests instead
    # of scanning the whole selection for every FASTA record
    wanted_ids = set(selection)

    alignments = {}

    with open(input_fasta) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            if record.id not in wanted_ids:
                continue
            alignment = self.generate_alignment(
                self.reference,
                record
            )
            alignments[record.id] = (
                self.create_modifs(alignment),
                # The original comment notes that aligned bases come back in
                # lowercase; counting case-insensitively also covers uppercase output
                str(alignment[1].seq).upper().count("N")
            )

    return pd.DataFrame(
        [(seq_id, mods, n_count) for seq_id, (mods, n_count) in alignments.items()],
        columns=["id", "mutation instructions", "N count"]
    )
|
334
|
+
|
335
|
+
@classmethod
def generate_alignment(cls, seq1: SeqRecord, seq2: SeqRecord) -> MultipleSeqAlignment:
    """
    Generate a multiple sequence alignment of the input sequences using ``MAFFT``.

    Both arguments must expose ``.id`` and ``.seq`` (the body reads both), which is why they are annotated as ``SeqRecord`` rather than plain strings.

    :param seq1: The first record to be aligned. When both records share the same id, ``_ref`` is appended to this one's name so the two entries stay distinct.
    :type seq1: SeqRecord
    :param seq2: The second record to be aligned.
    :type seq2: SeqRecord
    :return: The aligned sequences, parsed from the FASTA text produced by :meth:`_run_mafft`.
    :rtype: ``MultipleSeqAlignment``
    """

    id_1 = seq1.id
    id_2 = seq2.id

    # The sequences are passed as a dict keyed by name: identical ids would
    # collapse into a single entry, so the first one is renamed
    if seq1.id == seq2.id:
        id_1 += "_ref"

    return AlignIO.read(
        StringIO(cls._run_mafft({
            id_1: seq1.seq,
            id_2: seq2.seq
        })),
        "fasta"
    )
|
361
|
+
|
362
|
+
@staticmethod
def parse_sequence_by_id(input_fasta: str, _id: str) -> SeqRecord | None:
    """
    Looks up a single record in a ``.fasta`` file by its ID.

    The file is scanned from the top and the first record whose ``id`` equals ``_id`` is returned.

    :param input_fasta: The path to the input ``.fasta`` file.
    :type input_fasta: str
    :param _id: The ID of the sequence to be returned.
    :type _id: str
    :return: The sequence record with the given ``_id``. ``None`` if the ``_id`` is not found.
    :rtype: SeqRecord | None
    """

    with open(input_fasta) as handle:
        matches = (
            entry
            for entry in SeqIO.parse(handle, "fasta")
            if entry.id == _id
        )
        # next() stops at the first hit; the default covers "no such id"
        return next(matches, None)
|
380
|
+
|
381
|
+
@staticmethod
def _run_mafft(seqs_dict: dict[str,str], outformat: str = "fasta") -> str:
    """
    This function runs the MAFFT multiple sequence alignment tool on the input sequences.

    The sequences are written to MAFFT's standard input in FASTA format and the alignment is read back from its standard output.

    :param seqs_dict: A dictionary containing the sequences to be aligned. The keys are the sequence names and the values are the sequences.
    :type seqs_dict: dict[str,str]
    :param outformat: The output format of the alignment. Default is fasta. Any value other than ``clustal`` or ``fasta`` prints a notice and falls back to fasta.
    :type outformat: str
    :return: The aligned sequences as parsed from stdout. If the output format is clustal, it returns the alignment in clustal format; otherwise, it returns the alignment in fasta format.
    :rtype: str
    :raises RuntimeError: If MAFFT exits with a return code other than 0.
    """

    cmd = ["mafft"]

    if outformat == "clustal":
        cmd.append("--clustalout")

    elif outformat != "fasta":
        print(f"Unknown output format: {outformat}. Defaulting to fasta.")

    # A single "-" tells MAFFT to read its input from stdin
    cmd.append("-")

    input_data = "".join(
        f">{name}\n{seq}\n"
        for name, seq in seqs_dict.items()
    ).encode("utf-8")

    ps = Popen(
        cmd,
        stdin=PIPE,
        stdout=PIPE,
        stderr=PIPE,
        shell=False
    )

    # communicate() feeds stdin, drains stdout and stderr together and waits
    # for the process to finish, so neither pipe can fill up and stall MAFFT,
    # and returncode is actually set by the time it is checked
    raw_out, raw_err = ps.communicate(input_data)

    out = raw_out.decode("utf-8")
    err = raw_err.decode("utf-8")

    if ps.returncode != 0:
        raise RuntimeError(
            f"Error running MAFFT:\nStdout:\n{out}\n\nStderr:\n{err}\nReturn code: {ps.returncode}"
        )

    return out
|
434
|
+
|
435
|
+
@staticmethod
|
436
|
+
def parse_metadata(input_meta: str) -> pd.DataFrame:
|
437
|
+
"""
|
438
|
+
Parse the metadata file into a ``pandas.DataFrame``.
|
439
|
+
|
440
|
+
:param input_meta: The path to the metadata file, in either ``.csv`` or ``.tsv`` format.
|
441
|
+
:type input_meta: str
|
442
|
+
:return: The metadata as a ``pd.DataFrame``. It must contain a ``date`` column. Other columns are optional.
|
443
|
+
:rtype: ``pd.DataFrame``
|
444
|
+
:raises ValueError: If the metadata file does not contain a ``date`` column.
|
445
|
+
"""
|
446
|
+
|
447
|
+
seps = {
|
448
|
+
"csv": ",",
|
449
|
+
"tsv": "\t"
|
450
|
+
}
|
451
|
+
|
452
|
+
try:
|
453
|
+
if input_meta.endswith(".csv"):
|
454
|
+
df = pd.read_csv(input_meta, sep=seps["csv"])
|
455
|
+
elif input_meta.endswith(".tsv"):
|
456
|
+
df = pd.read_csv(input_meta, sep=seps["tsv"])
|
457
|
+
except Exception as e:
|
458
|
+
print(f"Error reading metadata file: {e}")
|
459
|
+
return
|
460
|
+
|
461
|
+
if not "date" in df.columns:
|
462
|
+
raise ValueError("Metadata file must contain a \"date\" column")
|
463
|
+
|
464
|
+
df["date"] = pd.to_datetime(df["date"])
|
465
|
+
|
466
|
+
return df.sort_values(by="date")
|
467
|
+
|
PyEvoMotion/utils.py
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
import os
|
2
|
+
import sys
|
3
|
+
import shutil
|
4
|
+
import subprocess
|
5
|
+
|
6
|
+
|
7
|
+
def get_mafft_script_path() -> str:
    """
    Returns the absolute path of the bundled ``mafft_install.sh`` script.

    The script is looked up in a ``share`` directory that sits next to the directory of the imported ``PyEvoMotion`` package.

    :raises FileNotFoundError: If the script does not exist at that location.
    """
    # Directory of the already-imported PyEvoMotion package
    package_dir = os.path.dirname(sys.modules["PyEvoMotion"].__file__)

    candidate = os.path.abspath(
        os.path.join(package_dir, "..", "share", "mafft_install.sh")
    )

    if os.path.exists(candidate):
        return candidate

    raise FileNotFoundError(f"mafft_install.sh not found at {candidate}")
|
17
|
+
|
18
|
+
def ensure_local_bin_in_path() -> bool:
    """
    Checks if ~/.local/bin is in the PATH environment variable.

    PATH is split on ``os.pathsep`` and each entry is compared as a whole (after ``os.path.normpath``), so a longer directory that merely starts with the same text, such as ``~/.local/bin-extra``, does not count as a match.

    :return: ``True`` if the directory is one of the PATH entries. Otherwise instructions for adding it are printed and ``False`` is returned.
    :rtype: bool
    """
    local_bin = os.path.expanduser("~/.local/bin")
    wanted = os.path.normpath(local_bin)

    entries = os.environ.get("PATH", "").split(os.pathsep)

    # Empty entries (e.g. from a leading or doubled separator) are skipped
    if any(entry and os.path.normpath(entry) == wanted for entry in entries):
        return True

    print(f"\n⚠️ {local_bin} is not in your PATH.")
    print("You may not be able to run 'mafft' from the terminal.")
    print("To fix this, add the following line to your shell config (e.g., ~/.bashrc, ~/.zshrc):\n")
    print(f' export PATH="$PATH:{local_bin}"\n')
    print("Then restart your shell or run `source ~/.bashrc`.\n")
    return False
|
29
|
+
|
30
|
+
def get_mafft_path() -> str:
|
31
|
+
"""Returns the path to the mafft binary if found, else None."""
|
32
|
+
return shutil.which("mafft") or os.path.expandvars("$HOME/.local/bin/mafft")
|
33
|
+
|
34
|
+
def is_mafft_installed() -> bool:
    """
    Returns True if mafft is available.

    "Available" means either found on PATH or present at ``$HOME/.local/bin/mafft``.
    """
    if shutil.which("mafft") is not None:
        return True

    # Not on PATH: fall back to the per-user install location
    local_binary = os.path.expandvars("$HOME/.local/bin/mafft")
    return os.path.exists(local_binary)
|
37
|
+
|
38
|
+
def install_mafft():
    """
    Installs mafft locally using the bundled script, after asking for confirmation.

    The user is prompted on standard input. Any answer other than ``y``/``yes`` ends the program with exit code 0. If the install script fails, the error is printed and the program ends with exit code 1.
    """

    response = input(
        "mafft is not installed. Would you like to install it locally? (y/n): "
    ).strip().lower()
    if response not in ["y", "yes"]:
        print("mafft installation aborted.")
        # sys.exit rather than the builtin exit(), which is only injected by
        # the site module and is not guaranteed to exist
        sys.exit(0)

    print("Installing mafft locally...")

    try:
        subprocess.run(["bash", get_mafft_script_path()], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Failed to install mafft: {e}")
        sys.exit(1)

    print("mafft installation complete.")
|
56
|
+
|
57
|
+
def verify_mafft() -> bool:
    """
    Runs `mafft --version` to confirm installation.

    When ``mafft`` cannot be found on PATH, ``~/.local/bin`` is checked for being on PATH; if it is not, instructions are printed and the program exits with code 0 so the user can fix the environment. When ``mafft`` is already reachable on PATH, that check is skipped.

    :return: ``True`` if the command could be launched, ``False`` if no binary was found or launching it raised an error.
    :rtype: bool
    """

    # The PATH hint only matters when mafft is not already reachable: a
    # system-wide install must not be blocked by a missing ~/.local/bin entry
    if shutil.which("mafft") is None and not ensure_local_bin_in_path():
        sys.exit(0)

    mafft_path = get_mafft_path()
    if not mafft_path:
        print("mafft not found.")
        return False

    try:
        result = subprocess.run(
            [mafft_path, "--version"],
            capture_output=True,
            text=True
        )
        # Whichever stream carries the version text is shown; stderr is preferred
        print(f"mafft version: {result.stderr.strip() or result.stdout.strip()}")
        return True
    except Exception as e:
        print(f"Error running mafft: {e}")
        return False
|
79
|
+
|
80
|
+
def check_and_install_mafft():
    """
    Makes sure mafft is usable, offering to install it when it is missing.

    If verification fails, a message is printed and the program exits with code 1.
    """
    mafft_missing = not is_mafft_installed()

    if mafft_missing:
        print("mafft not found.")
        install_mafft()

    # Verification runs whether or not an installation just happened
    if verify_mafft():
        return

    print("mafft verification failed after installation.")
    sys.exit(1)
|
@@ -0,0 +1,117 @@
|
|
1
|
+
Metadata-Version: 2.3
|
2
|
+
Name: PyEvoMotion
|
3
|
+
Version: 0.1.0
|
4
|
+
Summary: Evolutionary motion analysis tool
|
5
|
+
Keywords: evolution,anomalous diffusion,bioinformatics
|
6
|
+
Author: Lucas Goiriz
|
7
|
+
Author-email: lucas.goiriz@csic.es
|
8
|
+
Requires-Python: >=3.12,<3.13
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
10
|
+
Classifier: Programming Language :: Python :: 3.12
|
11
|
+
Requires-Dist: bio (>=1.7.1,<2.0.0)
|
12
|
+
Requires-Dist: matplotlib (>=3.9.1,<4.0.0)
|
13
|
+
Requires-Dist: pandas (>=2.2.2,<3.0.0)
|
14
|
+
Requires-Dist: pytest (>=8.2.2,<9.0.0)
|
15
|
+
Requires-Dist: scikit-learn (>=1.5.1,<2.0.0)
|
16
|
+
Requires-Dist: twine (>=6.1.0,<7.0.0)
|
17
|
+
Project-URL: Homepage, https://luksgrin.github.io/PyEvoMotion/
|
18
|
+
Project-URL: Repository, https://github.com/luksgrin/PyEvoMotion/
|
19
|
+
Description-Content-Type: text/markdown
|
20
|
+
|
21
|
+
# PyEvoMotion
|
22
|
+
|
23
|
+
A software to assess the evolution dynamics of a set of related DNA sequences.
|
24
|
+
|
25
|
+
_(See [Goiriz L, et al.](http://doi.org/10.1073/pnas.2303578120))_
|
26
|
+
|
27
|
+
## Installation
|
28
|
+
|
29
|
+
> **Note:**
|
30
|
+
> `PyEvoMotion` uses [mafft](https://mafft.cbrc.jp/alignment/software/) to do the sequence alignment. If it’s not available in your system, on the first run of `PyEvoMotion`, it will ask to install it locally.
|
31
|
+
>
|
32
|
+
> If so, make sure to restart your shell session or run `source ~/.bashrc` to update the PATH environment variable, so that the `mafft` executable is available in your shell.
|
33
|
+
>
|
34
|
+
> To install `PyEvoMotion` you may clone the repository and run `pip install`, or install it from PyPI:
|
35
|
+
|
36
|
+
```bash
|
37
|
+
pip install PyEvoMotion
|
38
|
+
```
|
39
|
+
|
40
|
+
This will install the package and its dependencies _(but not the tests nor the test data)_. To check if the installation was successful, you can run the following command:
|
41
|
+
|
42
|
+
```bash
|
43
|
+
PyEvoMotion
|
44
|
+
```
|
45
|
+
|
46
|
+
If the installation was successful, you should see the following output:
|
47
|
+
|
48
|
+
```bash
|
49
|
+
Welcome to Rodrigolab's
|
50
|
+
_____ ______ __ __ _ _
|
51
|
+
| __ \ | ____| | \/ | | | (_)
|
52
|
+
| |__) | _| |____ _____ | \ / | ___ | |_ _ ___ _ __
|
53
|
+
| ___/ | | | __\ \ / / _ \| |\/| |/ _ \| __| |/ _ \| '_ \
|
54
|
+
| | | |_| | |___\ V / (_) | | | | (_) | |_| | (_) | | | |
|
55
|
+
|_| \__, |______\_/ \___/|_| |_|\___/ \__|_|\___/|_| |_|
|
56
|
+
__/ |
|
57
|
+
|___/
|
58
|
+
|
59
|
+
usage: PyEvoMotion [-h] [-dt DELTA_T] [-sh] [-ep] [-l LENGTH_FILTER] [-xj] [-ij IMPORT_JSON] [-k {all,total,substitutions,insertions,deletions,indels}] [-f FILTER [FILTER ...]] [-gp GENOME_POSITIONS] [-dr DATE_RANGE]
|
60
|
+
seqs meta out
|
61
|
+
|
62
|
+
PyEvoMotion
|
63
|
+
|
64
|
+
positional arguments:
|
65
|
+
seqs Path to the input fasta file containing the sequences.
|
66
|
+
meta Path to the corresponding metadata file for the sequences.
|
67
|
+
out Path to the output filename prefix used to save the different results.
|
68
|
+
|
69
|
+
options:
|
70
|
+
-h, --help show this help message and exit
|
71
|
+
-dt DELTA_T, --delta_t DELTA_T
|
72
|
+
Time interval to calculate the statistics. Default is 7 days (7D).
|
73
|
+
-sh, --show Show the plots of the analysis.
|
74
|
+
-ep, --export_plots Export the plots of the analysis.
|
75
|
+
-l LENGTH_FILTER, --length_filter LENGTH_FILTER
|
76
|
+
Length filter for the sequences (removes sequences with length less than the specified value). Default is 0.
|
77
|
+
-n N_THRESHOLD, --n_threshold N_THRESHOLD
|
78
|
+
Minimum number of sequences required in a time interval to compute statistics. Default is 2.
|
79
|
+
-xj, --export_json Export the run arguments to a json file.
|
80
|
+
-ij IMPORT_JSON, --import_json IMPORT_JSON
|
81
|
+
Import the run arguments from a JSON file. If this argument is passed, the other arguments are ignored. The JSON file must contain the mandatory keys 'seqs', 'meta', and 'out'.
|
82
|
+
-k {all,total,substitutions,insertions,deletions,indels}, --kind {all,total,substitutions,insertions,deletions,indels}
|
83
|
+
Kind of mutations to consider for the analysis. Default is 'all'.
|
84
|
+
-f FILTER [FILTER ...], --filter FILTER [FILTER ...]
|
85
|
+
Specify filters to be applied on the data with keys followed by values. If the values are multiple, they must be enclosed in square brackets. Example: --filter key1 value1 key2 [value2 value3]
|
86
|
+
key3 value4. If either the keys or values contain spaces, they must be enclosed in quotes. keys must be present in the metadata file as columns for the filter to be applied. Use '*' as a
|
87
|
+
wildcard, for example Bio* to filter all columns starting with 'Bio'.
|
88
|
+
-gp GENOME_POSITIONS, --genome_positions GENOME_POSITIONS
|
89
|
+
Genome positions to restrict the analysis. The positions must be separated by two dots. Example: 1..1000. Open start or end positions are allowed by omitting the first or last position,
|
90
|
+
respectively. If not specified, the whole reference genome is considered.
|
91
|
+
-dr DATE_RANGE, --date_range DATE_RANGE
|
92
|
+
Date range to filter the data. The date range must be separated by two dots and the format must be YYYY-MM-DD. Example: 2020-01-01..2020-12-31. If not specified, the whole dataset is
|
93
|
+
considered. Note that if the origin is specified, the most restrictive date range is considered.
|
94
|
+
|
95
|
+
Error: the following arguments are required: seqs, meta, out
|
96
|
+
```
|
97
|
+
|
98
|
+
## Tests
|
99
|
+
|
100
|
+
This package has been developed using `pytest` for testing. To run the tests, you may install PyEvoMotion from the `sdist` archive, decompress it, install it and run the tests:
|
101
|
+
|
102
|
+
```bash
|
103
|
+
pip download --no-deps --no-binary :all: PyEvoMotion
|
104
|
+
tar -xvzf pyevomotion-*.tar.gz
|
105
|
+
cd pyevomotion-*/
|
106
|
+
pip install .
|
107
|
+
PyEvoMotion # To trigger mafft installation. Ensure afterwards that mafft is available in your PATH.
|
108
|
+
pytest
|
109
|
+
```
|
110
|
+
|
111
|
+
> [!WARNING]
|
112
|
+
> The first time the tests are run, they will automatically download the test data from `https://sourceforge.net/projects/pyevomotion/files/test_data.zip/download` and extract it in the appropriate directory.
|
113
|
+
>
|
114
|
+
> Given the size of the test data, this may take a while.
|
115
|
+
|
116
|
+
|
117
|
+
|
@@ -0,0 +1,13 @@
|
|
1
|
+
PyEvoMotion/__init__.py,sha256=NqFDD-EZBzouzTwXozZqhPC9sLr7GQaElRKtP0tkHoE,568
|
2
|
+
PyEvoMotion/cli.py,sha256=424ATWV3P89sMYhXT2-i3eHN-VwmujWjcue0coxY-lQ,15498
|
3
|
+
PyEvoMotion/core/__init__.py,sha256=1I-NkFFh6ljLgB_mqQVFLNvCrVKEHLVxa_5dsv3ihWQ,450
|
4
|
+
PyEvoMotion/core/base.py,sha256=SvvJAuYx__NAI8Mzye9lZJM6mPsEuKnulcJuztiPU_E,13226
|
5
|
+
PyEvoMotion/core/core.py,sha256=xCVGZxNvIKge0KINHO-tHK6aD4tdL7Zgua10eONAbi0,18311
|
6
|
+
PyEvoMotion/core/parser.py,sha256=xbjTbIvNy6ta-8WBWwDdnUoTjARPE9eZyaHOXfQKW4U,17144
|
7
|
+
PyEvoMotion/utils.py,sha256=Ye3eL1RXZOZzzs2KZy0R45u06DOtLYo-zqE45tN2t7g,2859
|
8
|
+
share/mafft_install.sh,sha256=pCw70UsKkkNXUsZMwQlQ2b4zSXFrBA7jAj9iOfGLzUw,1007
|
9
|
+
share/manuscript_figure.py,sha256=czznZchVsb7qsCXEGJo-NFG7DJuAE2XxydLmqsAQ66g,9704
|
10
|
+
pyevomotion-0.1.0.dist-info/METADATA,sha256=tEv8So175_nhi-FAr_aCM3lo052xgIh-_HPJBGWCc6k,5950
|
11
|
+
pyevomotion-0.1.0.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
|
12
|
+
pyevomotion-0.1.0.dist-info/entry_points.txt,sha256=UMzoojYwQi-713hRggkQXUIfGNygUARhTdGs77Usp7s,53
|
13
|
+
pyevomotion-0.1.0.dist-info/RECORD,,
|