PyEvoMotion 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,467 @@
1
+ from Bio import SeqIO, AlignIO
2
+ from Bio.SeqRecord import SeqRecord
3
+ from Bio.Align import MultipleSeqAlignment
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from io import StringIO
8
+ from itertools import groupby
9
+ from datetime import datetime
10
+ from operator import itemgetter
11
+ from subprocess import Popen, PIPE
12
+
13
+
14
class PyEvoMotionParser():
    """
    This class is responsible for parsing the input fasta and metadata files. It is inherited by the :class:`PyEvoMotion` class.

    :param input_fasta: The path to the input ``.fasta`` file.
    :type input_fasta: str
    :param input_meta: The path to the input metadata file. The metadata file must contain a ``date`` column and can be in either ``.csv`` or ``.tsv`` format.
    :type input_meta: str
    :param filters: The filters to be applied to the data. The keys are the column names and the values are the values to be filtered.
    :type filters: dict[str, list[str] | str]
    :param positions: The start and end positions to filter the data by.
    :type positions: tuple[int]
    :param date_range: The start and end dates to filter the data by. If ``None``, the date range is not filtered.
    :type date_range: tuple[datetime] | None

    On construction, it invokes the following methods:
        - :meth:`parse_metadata`: Parses the metadata file.
        - :meth:`parse_sequence_by_id`: Parses the sequence with the ID of the first entry in the metadata file.
        - :meth:`filter_columns`: Filters the metadata based on the input filters.
        - :meth:`filter_by_daterange`: Filters the metadata based on the input date range.
        - :meth:`parse_data`: Parses the input fasta file and appends the mutations that differ between the reference sequence and the input sequences to the metadata.
        - :meth:`filter_by_position`: Filters the metadata based on the input start and end positions.

    Attributes:
    -----------
    data: ``pd.DataFrame``
        DataFrame containing metadata.
    reference: ``SeqRecord``
        The reference sequence parsed from the fasta file.
    """

    def __init__(self,
        input_fasta: str,
        input_meta: str,
        filters: dict[str, list[str] | str],
        positions: tuple[int],
        date_range: tuple[datetime] | None
    ) -> None:
        """
        Initializes the class with input FASTA and metadata files.

        The reference sequence is the FASTA record whose id matches the ``id`` of the first row of the metadata once it has been sorted by date (see :meth:`parse_metadata`).

        :param input_fasta: The path to the input FASTA file.
        :type input_fasta: str
        :param input_meta: The path to the input metadata file.
        :type input_meta: str
        :param filters: The filters to be applied to the data. The keys are the column names and the values are the values to be filtered.
        :type filters: dict[str, list[str] | str]
        :param positions: The start and end positions to filter the data by.
        :type positions: tuple[int]
        :param date_range: The start and end dates to filter the data by. If None, the date range is not filtered.
        :type date_range: tuple[datetime] | None
        :raises ValueError: If the metadata could not be read, or if the reference sequence is not present in the FASTA file.
        """

        self.data = self.parse_metadata(input_meta)
        # parse_metadata reports read failures by returning None; stop here
        # with a clear message instead of failing on the next attribute access.
        if self.data is None:
            raise ValueError(f"Metadata file could not be read: {input_meta}")

        reference_id = self.data.iloc[0]["id"]
        self.reference = self.parse_sequence_by_id(input_fasta, reference_id)
        # Without a reference nothing can be aligned, so fail early.
        if self.reference is None:
            raise ValueError(
                f"Reference sequence with id {reference_id!r} not found in {input_fasta}"
            )

        # Implicitly filters the data
        self.filter_columns(filters)
        if date_range:
            self.filter_by_daterange(*date_range)
        # Appends the mutations that differ between the reference sequence and the input sequences to the data
        self.parse_data(input_fasta, self.data["id"])
        # Applies the position filter if provided
        self.filter_by_position(*positions)

    def parse_data(self, input_fasta: str, selection: pd.Series) -> None:
        """
        Parse the input fasta file and store the resulting data in the ``data`` attribute.

        The mutation table returned by :meth:`get_differing_mutations` is left-merged on ``id``, so metadata rows whose id has no record in the FASTA file are kept with missing values.

        :param input_fasta: The path to the input ``.fasta`` file.
        :type input_fasta: str
        :param selection: The selection of sequence ids to be parsed.
        :type selection: pd.Series
        """

        mutations = self.get_differing_mutations(input_fasta, selection)
        self.data = (
            self.data
            .merge(mutations, on="id", how="left")
            .reset_index(drop=True)
        )

    def filter_by_daterange(self, start: datetime, end: datetime) -> None:
        """
        Filter the data based on a date range.

        The data is filtered to only include entries with dates between the start and end dates (both inclusive). The requested bounds are clamped to the dates present in the data, and a falsy bound means "no bound on that side". This method replaces the ``data`` attribute.

        :param start: The start date.
        :type start: datetime
        :param end: The end date.
        :type end: datetime
        :raises ValueError: If the start date is greater than the end date.
        """

        earliest = self.data["date"].min()
        latest = self.data["date"].max()

        start = max(earliest, start) if start else earliest
        end = min(latest, end) if end else latest

        if start > end:
            raise ValueError("Start date must be smaller than end date")

        self.data = self.data[
            (self.data["date"] >= start) & (self.data["date"] <= end)
        ]

    def filter_by_position(self, start: int, end: int) -> None:
        """
        Filter the data based on some start and end positions in the reference sequence.

        Positions are 1-indexed. ``start`` is clamped to at least 1 and a non-positive ``end`` means "up to the end of the reference". A mutation is kept when ``start <= position < end``; rows left without any mutation are dropped. Rows that carry no mutation list at all (for instance metadata entries without a FASTA record) are treated as having no mutations.

        NOTE(review): an explicitly given ``end`` is therefore exclusive, whereas the previous documentation described it as inclusive. Confirm which convention callers rely on before changing the comparison.

        :param start: The start position index.
        :type start: int
        :param end: The end position index.
        :type end: int
        :raises ValueError: If the start position is not smaller than the end position, or if the start position is greater than the length of the reference sequence.
        """

        start = max(1, start)  # Ensure start is at least 1
        end = end if end > 0 else len(self.reference.seq) + 1  # Set end if not provided

        if start >= end:
            raise ValueError("Start position must be smaller than end position")
        elif start > len(self.reference.seq):
            raise ValueError("Start position is out of range")

        def _within_range(mods):
            # Missing values (NaN after the left merge) are not iterable.
            if not isinstance(mods, (list, tuple)):
                return []
            return [
                mod
                for mod in mods
                if start - 1 < int(mod.split("_")[1]) < end
            ]

        filtered = self.data["mutation instructions"].apply(_within_range)
        self.data = self.data.assign(**{"mutation instructions": filtered})
        self.data = self.data[
            self.data["mutation instructions"].apply(len) > 0
        ]

    def filter_columns(self, filters: dict[str, list[str] | str]) -> None:
        """
        Filter the data based on the input filters provided.

        Filters whose key is not a column of the data are ignored. Each value is used as a regular expression in which ``*`` is rewritten to ``.*``; a row is kept when any of the patterns for a column is found in that column's value. Rows whose value is missing never match.

        :param filters: The filters to be applied to the data. The keys are the column names and the values are the values to be filtered from the provided metadata.
        :type filters: dict[str, list[str] | str]
        """

        for col, vals in filters.items():
            # Only apply filters that are columns in the data
            if col not in self.data.columns:
                continue
            if isinstance(vals, str):
                vals = [vals]
            regex_pattern = "|".join(
                val.replace('*', '.*')
                for val in vals
            )
            # na=False keeps the mask strictly boolean when the column has
            # missing values; otherwise pandas refuses to index with it.
            mask = self.data[col].str.contains(
                regex_pattern,
                regex=True,
                na=False
            )
            self.data = self.data[mask]

    @staticmethod
    def _get_consecutives(data: list[int]) -> list[list[int]]:
        """
        Groups list of ordered integers into list of groups of integers

        :param data: a list of ordered integers.
        :type data: list[int]
        :return idxs: a list of lists of consecutive integers.
        :rtype: list[list[int]]
        """

        # Within a run of consecutive integers, index - value is constant.
        return [
            [value for _, value in run]
            for _, run in groupby(enumerate(data), lambda x: x[0] - x[1])
        ]

    @staticmethod
    def _column_decision(col: np.array) -> int:
        """
        Classifies bases in array column

        :param col: column with two rows, containing one of these symbols: A, G, T, C, N, -
        :type col: np.array
        :return: returns an integer indicating if match (0), mismatch (1), insertion (2) or deletion (3).
        :rtype: int
        """

        # If there's a match, return 0. In the case that there's an insertion of an N, we also return 0 as it's probably a sequencing error
        if (col[0] == col[1]) or ("N" in col):
            return 0

        # If there's an insertion, return 2
        if col[0] == "-":
            return 2

        # If there's a deletion, return 3
        if col[1] == "-":
            return 3

        # Else, it has to be a mismatch. Return 1
        return 1

    @classmethod
    def create_modifs(cls, alignment: MultipleSeqAlignment) -> list[str]:
        """
        Creates a guide on how to modify the root sequence to get the appropriate mutant sequence.

        Row 0 of the alignment is taken as the root sequence and row 1 as the mutant. After sorting by position, every modification is shifted back by the total length of the insertions that precede it in the list.

        :param alignment: ``Bio.Align.MultipleSeqAlignment`` object containing the alignment.
        :type alignment: MultipleSeqAlignment
        :return mods: List of modifications encoded as strings that have the format ``<type>_<position>_<bases>``, where ``<type>`` is one of ``i``, ``d`` and ``s`` (insertion, deletion and substitution), ``<position>`` is the position in the sequence where the modification should be made, and ``<bases>`` are the bases to be inserted, deleted or substituted.
        :rtype: ``list[str]``
        """

        # Turn alignment into np.array
        coding = np.array([
            list(alignment[0].upper()),
            list(alignment[1].upper())
        ])

        # Get classification
        clsMut = np.apply_along_axis(cls._column_decision, 0, coding)

        # Encode substitutions
        subst = [
            f"s_{x + 1}_{coding[1, x]}"
            for x in np.where(clsMut == 1)[0]
        ]

        # Encode insertions
        insertions = [
            f"i_{x[0]}_{''.join(coding[1, x])}"
            for x in cls._get_consecutives(list(np.where(clsMut == 2)[0]))
        ]

        # Encode deletions
        deletions = [
            f"d_{x[0]}_{''.join(coding[0, x])}"
            for x in cls._get_consecutives(list(np.where(clsMut == 3)[0]))
        ]

        # Blend modifications and order them
        mods = sorted(
            insertions + deletions + subst,
            key=lambda x: int(x.split("_")[1])
        )

        # Single pass equivalent of shifting everything after each insertion:
        # the running offset is the length of all insertions seen so far.
        shift = 0
        reindexed = []
        for mod in mods:
            kind, position, bases = mod.split("_")
            reindexed.append(f"{kind}_{int(position) - shift}_{bases}")
            if kind == "i":
                shift += len(bases)

        return reindexed

    def get_differing_mutations(self, input_fasta: str, selection: pd.Series) -> pd.DataFrame:
        """
        Return the mutations that differ between the reference sequence and the input sequence.

        Also, for the sake of sequence selection, it outputs the number of ``N`` found in the sequence.

        :param input_fasta: The path to the input ``.fasta`` file.
        :type input_fasta: str
        :param selection: The selection of sequence ids to be compared with the reference sequence.
        :type selection: pd.Series
        :return: The mutations that differ between the reference sequence and the input sequence. It contains the columns ``id``, ``mutation instructions`` and ``N count`` (the number of ``N`` in the sequence).
        :rtype: ``pd.DataFrame``
        """

        # Build the lookup once; testing against the array is linear per record.
        wanted_ids = set(selection.values)
        alignments = {}

        with open(input_fasta) as handle:
            for record in SeqIO.parse(handle, "fasta"):
                if record.id not in wanted_ids:
                    continue
                alignment = self.generate_alignment(
                    self.reference,
                    record
                )
                alignments[record.id] = (
                    self.create_modifs(alignment),
                    alignment[1].seq.count("n")  # In alignment fasta files, bases are lowercase
                )

        return pd.DataFrame(
            [
                (seq_id, mods, n_count)
                for seq_id, (mods, n_count) in alignments.items()
            ],
            columns=["id", "mutation instructions", "N count"]
        )

    @classmethod
    def generate_alignment(cls, seq1: SeqRecord, seq2: SeqRecord) -> MultipleSeqAlignment:
        """
        Generate a multiple sequence alignment of the input sequences using ``MAFFT``.

        When both records share the same id, the first one is submitted as ``<id>_ref`` so that the two entries stay distinct.

        :param seq1: The first sequence record to be aligned.
        :type seq1: SeqRecord
        :param seq2: The second sequence record to be aligned.
        :type seq2: SeqRecord
        :return: The aligned sequences.
        :rtype: ``MultipleSeqAlignment``
        """

        id_1 = seq1.id
        id_2 = seq2.id

        if id_1 == id_2:
            id_1 += "_ref"

        aligned_fasta = cls._run_mafft({
            id_1: seq1.seq,
            id_2: seq2.seq
        })

        return AlignIO.read(StringIO(aligned_fasta), "fasta")

    @staticmethod
    def parse_sequence_by_id(input_fasta: str, _id: str) -> SeqRecord | None:
        """
        Parse the input ``.fasta`` file and return the ``Bio.SeqRecord`` with the given ``_id``. Returns ``None`` if the ``_id`` is not found.

        :param input_fasta: The path to the input ``.fasta`` file.
        :type input_fasta: str
        :param _id: The ID of the sequence to be returned.
        :type _id: str
        :return: The sequence record with the given ``_id``. ``None`` if the ``_id`` is not found.
        :rtype: SeqRecord | None
        """

        with open(input_fasta) as handle:
            for record in SeqIO.parse(handle, "fasta"):
                if record.id == _id:
                    return record

        return None

    @staticmethod
    def _run_mafft(seqs_dict: dict[str,str], outformat: str = "fasta") -> str:
        """
        This function runs the MAFFT multiple sequence alignment tool on the input sequences.

        The sequences are sent to ``mafft`` through stdin in FASTA format. It raises an exception if the return code is not 0 (i.e. there was an error running MAFFT).

        :param seqs_dict: A dictionary containing the sequences to be aligned. The keys are the sequence names and the values are the sequences.
        :type seqs_dict: dict[str,str]
        :param outformat: The output format of the alignment. Default is fasta.
        :type outformat: str
        :return: The aligned sequences as parsed from stdout. If the output format is clustal, it returns the alignment in clustal format; otherwise, it returns the alignment in fasta format.
        :rtype: str
        :raises RuntimeError: If MAFFT exits with a non-zero return code.
        """

        cmd = ["mafft"]

        if outformat == "clustal":
            cmd.append("--clustalout")

        elif outformat != "fasta":
            print(f"Unknown output format: {outformat}. Defaulting to fasta.")

        # A single "-" tells mafft to read the sequences from stdin.
        cmd.append("-")

        input_data = "".join(
            f">{name}\n{seq}\n"
            for name, seq in seqs_dict.items()
        ).encode("utf-8")

        # communicate() feeds stdin and drains stdout and stderr together, so
        # a full pipe cannot stall the child, and it waits for the process so
        # that returncode is actually populated.
        with Popen(
            cmd,
            stdin=PIPE,
            stdout=PIPE,
            stderr=PIPE,
            shell=False
        ) as ps:
            raw_out, raw_err = ps.communicate(input_data)

        out = raw_out.decode("utf-8")
        err = raw_err.decode("utf-8")

        if ps.returncode != 0:
            raise RuntimeError(
                f"Error running MAFFT:\nStdout:\n{out}\n\nStderr:\n{err}\nReturn code: {ps.returncode}"
            )

        return out

    @staticmethod
    def parse_metadata(input_meta: str) -> pd.DataFrame | None:
        """
        Parse the metadata file into a ``pandas.DataFrame``.

        The ``date`` column is converted to datetimes and the rows are returned sorted by it.

        :param input_meta: The path to the metadata file, in either ``.csv`` or ``.tsv`` format.
        :type input_meta: str
        :return: The metadata as a ``pd.DataFrame``. It must contain a ``date`` column. Other columns are optional. ``None`` is returned (after printing the error) when the file cannot be read.
        :rtype: ``pd.DataFrame``
        :raises ValueError: If the file extension is neither ``.csv`` nor ``.tsv``, or if the metadata file does not contain a ``date`` column.
        """

        if input_meta.endswith(".csv"):
            sep = ","
        elif input_meta.endswith(".tsv"):
            sep = "\t"
        else:
            raise ValueError(
                f"Metadata file must be in .csv or .tsv format: {input_meta}"
            )

        try:
            df = pd.read_csv(input_meta, sep=sep)
        except Exception as e:
            print(f"Error reading metadata file: {e}")
            return None

        if "date" not in df.columns:
            raise ValueError("Metadata file must contain a \"date\" column")

        df["date"] = pd.to_datetime(df["date"])

        return df.sort_values(by="date")
467
+
PyEvoMotion/utils.py ADDED
@@ -0,0 +1,87 @@
1
+ import os
2
+ import sys
3
+ import shutil
4
+ import subprocess
5
+
6
+
7
def get_mafft_script_path() -> str:
    """Return the absolute path of the bundled ``mafft_install.sh`` script.

    The script is looked up in the ``share`` directory that sits next to the
    directory of the imported ``PyEvoMotion`` package.

    :raises FileNotFoundError: If the script does not exist at that location.
    """
    # Directory of the installed package, taken from the loaded module
    package_dir = os.path.dirname(sys.modules["PyEvoMotion"].__file__)
    script_path = os.path.abspath(
        os.path.join(package_dir, "..", "share", "mafft_install.sh")
    )

    if os.path.exists(script_path):
        return script_path

    raise FileNotFoundError(f"mafft_install.sh not found at {script_path}")
17
+
18
def ensure_local_bin_in_path() -> bool:
    """Check whether ``~/.local/bin`` is one of the PATH entries.

    PATH is split on ``os.pathsep`` and each entry is compared after
    normalisation, so a directory that merely contains the text (for example
    ``~/.local/binaries``) is not mistaken for it, while a trailing slash on
    the real entry is still accepted. When the directory is missing, a hint
    explaining how to add it is printed.

    :return: ``True`` if ``~/.local/bin`` is in PATH, ``False`` otherwise.
    """
    local_bin = os.path.expanduser("~/.local/bin")
    wanted = os.path.normpath(local_bin)
    path_entries = {
        os.path.normpath(entry)
        for entry in os.environ.get("PATH", "").split(os.pathsep)
        if entry
    }

    if wanted in path_entries:
        return True

    print(f"\n⚠️ {local_bin} is not in your PATH.")
    print("You may not be able to run 'mafft' from the terminal.")
    print("To fix this, add the following line to your shell config (e.g., ~/.bashrc, ~/.zshrc):\n")
    print(f' export PATH="$PATH:{local_bin}"\n')
    print("Then restart your shell or run `source ~/.bashrc`.\n")
    return False
29
+
30
+ def get_mafft_path() -> str:
31
+ """Returns the path to the mafft binary if found, else None."""
32
+ return shutil.which("mafft") or os.path.expandvars("$HOME/.local/bin/mafft")
33
+
34
def is_mafft_installed() -> bool:
    """Tell whether mafft is available.

    mafft counts as available when it is found on PATH or when
    ``$HOME/.local/bin/mafft`` exists.
    """
    if shutil.which("mafft") is not None:
        return True

    local_install = os.path.expandvars("$HOME/.local/bin/mafft")
    return os.path.exists(local_install)
37
+
38
def install_mafft():
    """Install mafft locally using the bundled script, after asking the user.

    The user is prompted for confirmation. Any answer other than ``y`` or
    ``yes`` aborts and exits the interpreter with status 0. Otherwise the
    script returned by ``get_mafft_script_path`` is run with ``bash``; if it
    fails, the error is printed and the interpreter exits with status 1.
    """
    response = input(
        "mafft is not installed. Would you like to install it locally? (y/n): "
    ).strip().lower()
    if response not in ["y", "yes"]:
        print("mafft installation aborted.")
        # sys.exit rather than the site-provided exit(), which is not
        # guaranteed to exist in every interpreter setup.
        sys.exit(0)

    print("Installing mafft locally...")

    try:
        subprocess.run(["bash", get_mafft_script_path()], check=True)
        print("mafft installation complete.")
    except subprocess.CalledProcessError as e:
        print(f"Failed to install mafft: {e}")
        sys.exit(1)
56
+
57
def verify_mafft():
    """Run ``mafft --version`` to confirm the installation.

    When ``mafft`` cannot be resolved on PATH, ``~/.local/bin`` must be in
    PATH; if it is not, the hint printed by ``ensure_local_bin_in_path`` is
    shown and the interpreter exits with status 0. When ``mafft`` is already
    resolvable on PATH that check is skipped, so an installation elsewhere
    on PATH is not rejected.

    :return: ``True`` if ``mafft --version`` could be executed, ``False`` if
        mafft was not found or could not be run.
    """
    if shutil.which("mafft") is None and not ensure_local_bin_in_path():
        sys.exit(0)

    mafft_path = get_mafft_path()
    if not mafft_path:
        print("mafft not found.")
        return False

    try:
        result = subprocess.run(
            [mafft_path, "--version"],
            capture_output=True,
            text=True
        )
    except Exception as e:
        print(f"Error running mafft: {e}")
        return False

    # The version is printed from stderr when present, stdout otherwise.
    print(f"mafft version: {result.stderr.strip() or result.stdout.strip()}")
    return True
79
+
80
def check_and_install_mafft():
    """Make sure mafft is usable, offering to install it when it is missing.

    If mafft is not installed, the interactive installer is started. The
    installation is then verified, and the interpreter exits with status 1
    when that verification fails.
    """
    mafft_missing = not is_mafft_installed()
    if mafft_missing:
        print("mafft not found.")
        install_mafft()

    if verify_mafft():
        return

    print("mafft verification failed after installation.")
    sys.exit(1)
@@ -0,0 +1,117 @@
1
+ Metadata-Version: 2.3
2
+ Name: PyEvoMotion
3
+ Version: 0.1.0
4
+ Summary: Evolutionary motion analysis tool
5
+ Keywords: evolution,anomalous diffusion,bioinformatics
6
+ Author: Lucas Goiriz
7
+ Author-email: lucas.goiriz@csic.es
8
+ Requires-Python: >=3.12,<3.13
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Requires-Dist: bio (>=1.7.1,<2.0.0)
12
+ Requires-Dist: matplotlib (>=3.9.1,<4.0.0)
13
+ Requires-Dist: pandas (>=2.2.2,<3.0.0)
14
+ Requires-Dist: pytest (>=8.2.2,<9.0.0)
15
+ Requires-Dist: scikit-learn (>=1.5.1,<2.0.0)
16
+ Requires-Dist: twine (>=6.1.0,<7.0.0)
17
+ Project-URL: Homepage, https://luksgrin.github.io/PyEvoMotion/
18
+ Project-URL: Repository, https://github.com/luksgrin/PyEvoMotion/
19
+ Description-Content-Type: text/markdown
20
+
21
+ # PyEvoMotion
22
+
23
+ A software to assess the evolution dynamics of a set of related DNA sequences.
24
+
25
+ _(See [Goiriz L, et al.](http://doi.org/10.1073/pnas.2303578120))_
26
+
27
+ ## Installation
28
+
29
+ > **Note:**
30
+ > `PyEvoMotion` uses [mafft](https://mafft.cbrc.jp/alignment/software/) to do the sequence alignment. If it’s not available on your system, on the first run of `PyEvoMotion`, it will ask to install it locally.
31
+ >
32
+ > If so, make sure to restart your shell session or run `source ~/.bashrc` to update the PATH environment variable, so that the `mafft` executable is available in your shell.
33
+ >
34
+ > To install `PyEvoMotion` you may clone the repository and run `pip install`, or install it from PyPI:
35
+
36
+ ```bash
37
+ pip install PyEvoMotion
38
+ ```
39
+
40
+ This will install the package and its dependencies _(but not the tests nor the test data)_. To check if the installation was successful, you can run the following command:
41
+
42
+ ```bash
43
+ PyEvoMotion
44
+ ```
45
+
46
+ If the installation was successful, you should see the following output:
47
+
48
+ ```bash
49
+ Welcome to Rodrigolab's
50
+ _____ ______ __ __ _ _
51
+ | __ \ | ____| | \/ | | | (_)
52
+ | |__) | _| |____ _____ | \ / | ___ | |_ _ ___ _ __
53
+ | ___/ | | | __\ \ / / _ \| |\/| |/ _ \| __| |/ _ \| '_ \
54
+ | | | |_| | |___\ V / (_) | | | | (_) | |_| | (_) | | | |
55
+ |_| \__, |______\_/ \___/|_| |_|\___/ \__|_|\___/|_| |_|
56
+ __/ |
57
+ |___/
58
+
59
+ usage: PyEvoMotion [-h] [-dt DELTA_T] [-sh] [-ep] [-l LENGTH_FILTER] [-xj] [-ij IMPORT_JSON] [-k {all,total,substitutions,insertions,deletions,indels}] [-f FILTER [FILTER ...]] [-gp GENOME_POSITIONS] [-dr DATE_RANGE]
60
+ seqs meta out
61
+
62
+ PyEvoMotion
63
+
64
+ positional arguments:
65
+ seqs Path to the input fasta file containing the sequences.
66
+ meta Path to the corresponding metadata file for the sequences.
67
+ out Path to the output filename prefix used to save the different results.
68
+
69
+ options:
70
+ -h, --help show this help message and exit
71
+ -dt DELTA_T, --delta_t DELTA_T
72
+ Time interval to calculate the statistics. Default is 7 days (7D).
73
+ -sh, --show Show the plots of the analysis.
74
+ -ep, --export_plots Export the plots of the analysis.
75
+ -l LENGTH_FILTER, --length_filter LENGTH_FILTER
76
+ Length filter for the sequences (removes sequences with length less than the specified value). Default is 0.
77
+ -n N_THRESHOLD, --n_threshold N_THRESHOLD
78
+ Minimum number of sequences required in a time interval to compute statistics. Default is 2.
79
+ -xj, --export_json Export the run arguments to a json file.
80
+ -ij IMPORT_JSON, --import_json IMPORT_JSON
81
+ Import the run arguments from a JSON file. If this argument is passed, the other arguments are ignored. The JSON file must contain the mandatory keys 'seqs', 'meta', and 'out'.
82
+ -k {all,total,substitutions,insertions,deletions,indels}, --kind {all,total,substitutions,insertions,deletions,indels}
83
+ Kind of mutations to consider for the analysis. Default is 'all'.
84
+ -f FILTER [FILTER ...], --filter FILTER [FILTER ...]
85
+ Specify filters to be applied on the data with keys followed by values. If the values are multiple, they must be enclosed in square brackets. Example: --filter key1 value1 key2 [value2 value3]
86
+ key3 value4. If either the keys or values contain spaces, they must be enclosed in quotes. keys must be present in the metadata file as columns for the filter to be applied. Use '*' as a
87
+ wildcard, for example Bio* to filter all columns starting with 'Bio'.
88
+ -gp GENOME_POSITIONS, --genome_positions GENOME_POSITIONS
89
+ Genome positions to restrict the analysis. The positions must be separated by two dots. Example: 1..1000. Open start or end positions are allowed by omitting the first or last position,
90
+ respectively. If not specified, the whole reference genome is considered.
91
+ -dr DATE_RANGE, --date_range DATE_RANGE
92
+ Date range to filter the data. The date range must be separated by two dots and the format must be YYYY-MM-DD. Example: 2020-01-01..2020-12-31. If not specified, the whole dataset is
93
+ considered. Note that if the origin is specified, the most restrictive date range is considered.
94
+
95
+ Error: the following arguments are required: seqs, meta, out
96
+ ```
97
+
98
+ ## Tests
99
+
100
+ This package has been developed using `pytest` for testing. To run the tests, download the `sdist` archive of PyEvoMotion, decompress it, install the package from it and run `pytest`:
101
+
102
+ ```bash
103
+ pip download --no-deps --no-binary :all: PyEvoMotion
104
+ tar -xvzf pyevomotion-*.tar.gz
105
+ cd pyevomotion-*/
106
+ pip install .
107
+ PyEvoMotion # To trigger mafft installation. Ensure afterwards that mafft is available in your PATH.
108
+ pytest
109
+ ```
110
+
111
+ > [!WARNING]
112
+ > The first time the tests are run, they will automatically download the test data from `https://sourceforge.net/projects/pyevomotion/files/test_data.zip/download` and extract it in the appropriate directory.
113
+ >
114
+ > Given the size of the test data, this may take a while.
115
+
116
+
117
+
@@ -0,0 +1,13 @@
1
+ PyEvoMotion/__init__.py,sha256=NqFDD-EZBzouzTwXozZqhPC9sLr7GQaElRKtP0tkHoE,568
2
+ PyEvoMotion/cli.py,sha256=424ATWV3P89sMYhXT2-i3eHN-VwmujWjcue0coxY-lQ,15498
3
+ PyEvoMotion/core/__init__.py,sha256=1I-NkFFh6ljLgB_mqQVFLNvCrVKEHLVxa_5dsv3ihWQ,450
4
+ PyEvoMotion/core/base.py,sha256=SvvJAuYx__NAI8Mzye9lZJM6mPsEuKnulcJuztiPU_E,13226
5
+ PyEvoMotion/core/core.py,sha256=xCVGZxNvIKge0KINHO-tHK6aD4tdL7Zgua10eONAbi0,18311
6
+ PyEvoMotion/core/parser.py,sha256=xbjTbIvNy6ta-8WBWwDdnUoTjARPE9eZyaHOXfQKW4U,17144
7
+ PyEvoMotion/utils.py,sha256=Ye3eL1RXZOZzzs2KZy0R45u06DOtLYo-zqE45tN2t7g,2859
8
+ share/mafft_install.sh,sha256=pCw70UsKkkNXUsZMwQlQ2b4zSXFrBA7jAj9iOfGLzUw,1007
9
+ share/manuscript_figure.py,sha256=czznZchVsb7qsCXEGJo-NFG7DJuAE2XxydLmqsAQ66g,9704
10
+ pyevomotion-0.1.0.dist-info/METADATA,sha256=tEv8So175_nhi-FAr_aCM3lo052xgIh-_HPJBGWCc6k,5950
11
+ pyevomotion-0.1.0.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
12
+ pyevomotion-0.1.0.dist-info/entry_points.txt,sha256=UMzoojYwQi-713hRggkQXUIfGNygUARhTdGs77Usp7s,53
13
+ pyevomotion-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: poetry-core 2.1.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any