PyEvoMotion 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyEvoMotion/__init__.py +11 -0
- PyEvoMotion/cli.py +440 -0
- PyEvoMotion/core/__init__.py +7 -0
- PyEvoMotion/core/base.py +406 -0
- PyEvoMotion/core/core.py +520 -0
- PyEvoMotion/core/parser.py +467 -0
- PyEvoMotion/utils.py +87 -0
- pyevomotion-0.1.0.dist-info/METADATA +117 -0
- pyevomotion-0.1.0.dist-info/RECORD +13 -0
- pyevomotion-0.1.0.dist-info/WHEEL +4 -0
- pyevomotion-0.1.0.dist-info/entry_points.txt +3 -0
- share/mafft_install.sh +44 -0
- share/manuscript_figure.py +316 -0
PyEvoMotion/__init__.py
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
"""
|
2
|
+
The main functionality of the ``PyEvoMotion`` project is abstracted into the following classes:
|
3
|
+
|
4
|
+
* :class:`PyEvoMotion` - The main class that encapsulates the entire analysis.
|
5
|
+
* :class:`PyEvoMotionBase` - The base class that provides basic utility functions inherited by :class:`PyEvoMotion`.
|
6
|
+
* :class:`PyEvoMotionParser` - The class that provides the functionality to parse the input data for the analysis, inherited by :class:`PyEvoMotion`.
|
7
|
+
"""
|
8
|
+
|
9
|
+
from .core.core import PyEvoMotion
|
10
|
+
from .core.base import PyEvoMotionBase
|
11
|
+
from .core.parser import PyEvoMotionParser
|
PyEvoMotion/cli.py
ADDED
@@ -0,0 +1,440 @@
|
|
1
|
+
"""
|
2
|
+
Command line interface for :class:`PyEvoMotion`.
|
3
|
+
|
4
|
+
It parses the arguments from the command line and runs the analysis with the specified parameters.
|
5
|
+
|
6
|
+
This module is not meant to be imported from other code, but to be run as a standalone script from the command line.
|
7
|
+
"""
|
8
|
+
|
9
|
+
import json
|
10
|
+
import argparse
|
11
|
+
from datetime import datetime
|
12
|
+
|
13
|
+
from .core.core import PyEvoMotion
|
14
|
+
from .utils import check_and_install_mafft
|
15
|
+
|
16
|
+
# Description string passed to the argument parser built in _parse_arguments().
PACKAGE_DESCRIPTION = "PyEvoMotion"
|
17
|
+
# ASCII-art banner printed by _main() at startup; a raw string so the backslashes in the art are kept literally.
BANNER = r"""
|
18
|
+
Welcome to Rodrigolab's
|
19
|
+
_____ ______ __ __ _ _
|
20
|
+
| __ \ | ____| | \/ | | | (_)
|
21
|
+
| |__) | _| |____ _____ | \ / | ___ | |_ _ ___ _ __
|
22
|
+
| ___/ | | | __\ \ / / _ \| |\/| |/ _ \| __| |/ _ \| '_ \
|
23
|
+
| | | |_| | |___\ V / (_) | | | | (_) | |_| | (_) | | | |
|
24
|
+
|_| \__, |______\_/ \___/|_| |_|\___/ \__|_|\___/|_| |_|
|
25
|
+
__/ |
|
26
|
+
|___/
|
27
|
+
"""
|
28
|
+
|
29
|
+
class _ArgumentParserWithHelpOnError(argparse.ArgumentParser):
    """
    Custom ArgumentParser that prints the full help message when an error occurs.

    Both the help text and the error message are written to standard error, as
    :meth:`argparse.ArgumentParser.error` does, so standard output stays free of diagnostics.
    """

    def error(self, message: str) -> None:
        """
        Print the help message and the error message, then exit with status 2.

        :param message: the error message to print.
        :type message: str
        :raises SystemExit: always, with exit status 2.
        """
        # Local import: ``sys`` is not among the module-level imports of this file
        import sys

        self.print_help(sys.stderr)
        # ``exit`` writes its message to stderr before raising SystemExit
        self.exit(2, f"\nError: {message}\n\n")
|
44
|
+
|
45
|
+
class _ParseFilter(argparse.Action):
|
46
|
+
"""
|
47
|
+
Custom action to parse the filters from the command line.
|
48
|
+
|
49
|
+
The filters are passed as key-value pairs, where the key is followed by multiple values, specified in square brackets.
|
50
|
+
"""
|
51
|
+
def __call__(self, _: argparse.ArgumentParser, namespace: argparse.Namespace, values: list[str], option_string: str | None = None) -> None:
|
52
|
+
"""
|
53
|
+
Call the action to parse the filters.
|
54
|
+
|
55
|
+
:param _: the parser.
|
56
|
+
:type _: argparse.ArgumentParser
|
57
|
+
:param namespace: the namespace to store the parsed filters.
|
58
|
+
:type namespace: argparse.Namespace
|
59
|
+
:param values: the values to parse.
|
60
|
+
:type values: list[str]
|
61
|
+
:param option_string: the option string.
|
62
|
+
:type option_string: str
|
63
|
+
:raises ValueError: if the values are not in the correct format.
|
64
|
+
"""
|
65
|
+
|
66
|
+
setattr(namespace, self.dest, self.parse_filters(values))
|
67
|
+
|
68
|
+
@staticmethod
|
69
|
+
def parse_filters(values: list[str] | None) -> dict[str, str | list[str]] | None:
|
70
|
+
"""
|
71
|
+
Parse the filters from the values.
|
72
|
+
|
73
|
+
:param values: the values to parse.
|
74
|
+
:type values: list[str] | None
|
75
|
+
:return: the parsed filters as a dictionary.
|
76
|
+
:rtype: dict[str, str | list[str]] | None
|
77
|
+
"""
|
78
|
+
|
79
|
+
if values is None: return None
|
80
|
+
|
81
|
+
# Create an iterator to process values one by one
|
82
|
+
cleaned_values = []
|
83
|
+
buffer = []
|
84
|
+
inside_brackets = False
|
85
|
+
|
86
|
+
# Loop through the input values and handle brackets
|
87
|
+
for value in values:
|
88
|
+
if value.startswith('[') and value.endswith(']'): # Single value inside brackets
|
89
|
+
cleaned_values.append(value[1:-1])
|
90
|
+
if value.startswith('['): # Start of a bracketed group
|
91
|
+
inside_brackets = True
|
92
|
+
buffer.append(value[1:]) # Strip the '['
|
93
|
+
elif value.endswith(']'): # End of a bracketed group
|
94
|
+
buffer.append(value[:-1]) # Strip the ']'
|
95
|
+
cleaned_values.append(buffer)
|
96
|
+
buffer = []
|
97
|
+
inside_brackets = False
|
98
|
+
elif inside_brackets: # Values inside the brackets
|
99
|
+
buffer.append(value)
|
100
|
+
else: # Regular values outside of brackets
|
101
|
+
cleaned_values.append(value)
|
102
|
+
|
103
|
+
return dict(zip(
|
104
|
+
cleaned_values[::2],
|
105
|
+
cleaned_values[1::2]
|
106
|
+
))
|
107
|
+
|
108
|
+
class _ParseGenomePosition(argparse.Action):
|
109
|
+
"""
|
110
|
+
Custom action to parse the genome positions from the command line.
|
111
|
+
|
112
|
+
The genome positions are passed as a string with two dots separating the start and end positions. Open start or end positions are allowed by omitting the first or last position, respectively.
|
113
|
+
"""
|
114
|
+
def __call__(self, parser: argparse.ArgumentParser, namespace: argparse.Namespace, values: str, option_string: str | None = None):
|
115
|
+
"""
|
116
|
+
Call the action to parse the genome positions.
|
117
|
+
|
118
|
+
:param parser: the parser.
|
119
|
+
:type parser: argparse.ArgumentParser
|
120
|
+
:param namespace: the namespace to store the parsed genome positions.
|
121
|
+
:type namespace: argparse.Namespace
|
122
|
+
:param values: the values to parse.
|
123
|
+
:type values: str
|
124
|
+
:param option_string: the option string.
|
125
|
+
:type option_string: str
|
126
|
+
:raises ValueError: if the values are not in the correct format.
|
127
|
+
"""
|
128
|
+
|
129
|
+
|
130
|
+
|
131
|
+
setattr(namespace, self.dest, self.parse_genome_position(parser, values))
|
132
|
+
|
133
|
+
@staticmethod
|
134
|
+
def parse_genome_position(parser: argparse.ArgumentParser, values: str | None) -> tuple[int, int] | None:
|
135
|
+
"""
|
136
|
+
Parse the genome positions from the values.
|
137
|
+
|
138
|
+
:param parser: the parser.
|
139
|
+
:type parser: argparse.ArgumentParser
|
140
|
+
:param values: the values to parse.
|
141
|
+
:type values: str | None
|
142
|
+
:return: the parsed genome positions.
|
143
|
+
:rtype: tuple[int, int] | None
|
144
|
+
:raises ValueError: if the values are not in the correct format.
|
145
|
+
"""
|
146
|
+
|
147
|
+
if values is None: return None
|
148
|
+
|
149
|
+
if not(".." in values):
|
150
|
+
parser.error("The genome positions must be separated by two dots. Example: 1..1000")
|
151
|
+
|
152
|
+
_split = values.split("..")
|
153
|
+
|
154
|
+
positions = []
|
155
|
+
for el in _split:
|
156
|
+
if not el.isdigit() and el != "":
|
157
|
+
parser.error("The genome positions must be positive integers")
|
158
|
+
positions.append(0 if el == "" else int(el))
|
159
|
+
|
160
|
+
return tuple(positions)
|
161
|
+
|
162
|
+
class _ParseDateRange(argparse.Action):
|
163
|
+
"""
|
164
|
+
Custom action to parse the date range from the command line.
|
165
|
+
|
166
|
+
The date range is passed as a string with two dots separating the start and end dates. The format must be YYYY-MM-DD.
|
167
|
+
"""
|
168
|
+
def __call__(self, parser: argparse.ArgumentParser, namespace: argparse.Namespace, values: str, option_string: str | None = None):
|
169
|
+
|
170
|
+
setattr(namespace, self.dest, self.parse_date_range(parser, values))
|
171
|
+
|
172
|
+
@staticmethod
|
173
|
+
def parse_date_range(parser: argparse.ArgumentParser, values: str | None) -> tuple[datetime | None, datetime | None] | None:
|
174
|
+
"""
|
175
|
+
Parse the date range from the values.
|
176
|
+
|
177
|
+
:param parser: the parser.
|
178
|
+
:type parser: argparse.ArgumentParser
|
179
|
+
:param values: the values to parse.
|
180
|
+
:type values: str | None
|
181
|
+
:return: the parsed date range.
|
182
|
+
:rtype: tuple[datetime | None, datetime | None] | None
|
183
|
+
"""
|
184
|
+
|
185
|
+
if values is None: return None
|
186
|
+
|
187
|
+
if not(".." in values):
|
188
|
+
parser.error("The date range must be separated by two dots. Example: 2020-01-01..2020-12-31")
|
189
|
+
if values.count(".") > 2:
|
190
|
+
parser.error("The date range must contain '..' as separator")
|
191
|
+
|
192
|
+
_split = values.split("..")
|
193
|
+
|
194
|
+
range = []
|
195
|
+
for date in _split:
|
196
|
+
if date == "":
|
197
|
+
range.append(None)
|
198
|
+
continue
|
199
|
+
try:
|
200
|
+
range.append(datetime.strptime(date, "%Y-%m-%d"))
|
201
|
+
except ValueError:
|
202
|
+
parser.error("Incorrect date format, should be YYYY-MM-DD")
|
203
|
+
|
204
|
+
return tuple(range)
|
205
|
+
|
206
|
+
|
207
|
+
def _parse_arguments() -> argparse.Namespace:
    """
    Build the command line parser and resolve the run arguments.

    If ``-ij``/``--import_json`` is present on the command line, the arguments are taken from that JSON file instead; otherwise the command line is parsed normally.

    :return: the parsed arguments.
    :rtype: argparse.Namespace
    """

    # Full parser. It parses the command line when -ij is absent, and supplies
    # the destinations and defaults when the arguments come from a JSON file
    parser = _ArgumentParserWithHelpOnError(description=PACKAGE_DESCRIPTION)
    add = parser.add_argument

    # Mandatory positional arguments
    add("seqs", type=str, help="Path to the input fasta file containing the sequences.")
    add("meta", type=str, help="Path to the corresponding metadata file for the sequences.")
    add("out", type=str, help="Path to the output filename prefix used to save the different results.")

    # Optional arguments
    add("-dt", "--delta_t", type=str, default="7D", help="Time interval to calculate the statistics. Default is 7 days (7D).")
    add("-sh", "--show", action="store_true", help="Show the plots of the analysis.")
    add("-ep", "--export_plots", action="store_true", help="Export the plots of the analysis.")
    add("-l", "--length_filter", type=int, default=0, help="Length filter for the sequences (removes sequences with length less than the specified value). Default is 0.")
    add("-n", "--n_threshold", type=int, default=2, help="Minimum number of sequences required in a time interval to compute statistics. Default is 2.")
    add("-xj", "--export_json", action="store_true", help="Export the run arguments to a json file.")
    add("-ij", "--import_json", type=str, help="Import the run arguments from a JSON file. If this argument is passed, the other arguments are ignored. The JSON file must contain the mandatory keys 'seqs', 'meta', and 'out'.")
    add("-k", "--kind", type=str, choices=["all", "total", "substitutions", "indels"], default="all", help="Kind of mutations to consider for the analysis. Default is 'all'.")
    add(
        "-f",
        "--filter",
        nargs="+",  # One or more tokens
        action=_ParseFilter,
        default=None,
        help="Specify filters to be applied on the data with keys followed by values. If the values are multiple, they must be enclosed in square brackets. Example: --filter key1 value1 key2 [value2 value3] key3 value4. If either the keys or values contain spaces, they must be enclosed in quotes. keys must be present in the metadata file as columns for the filter to be applied. Use '*' as a wildcard, for example Bio* to filter all columns starting with 'Bio'.",
    )
    add(
        "-gp",
        "--genome_positions",
        type=str,
        action=_ParseGenomePosition,
        default=None,
        help="Genome positions to restrict the analysis. The positions must be separated by two dots. Example: 1..1000. Open start or end positions are allowed by omitting the first or last position, respectively. If not specified, the whole reference genome is considered.",
    )
    add(
        "-dr",
        "--date_range",
        type=str,
        action=_ParseDateRange,
        default=None,
        help="Date range to filter the data. The date range must be separated by two dots and the format must be YYYY-MM-DD. Example: 2020-01-01..2020-12-31. If not specified, the whole dataset is considered. Note that if the origin is specified, the most restrictive date range is considered.",
    )

    # Lenient pre-parser that only looks for -ij and ignores everything else
    pre_parser = argparse.ArgumentParser(add_help=False)
    pre_parser.add_argument("-ij", "--import_json", type=str)
    pre_args, _unknown = pre_parser.parse_known_args()

    # Without -ij the command line is the only source of arguments
    if not pre_args.import_json:
        return parser.parse_args()

    with open(pre_args.import_json, "r") as handle:
        loaded = json.load(handle)

    # The JSON file must provide at least the three positional arguments
    if not loaded.keys() >= {"seqs", "meta", "out"}:
        parser.error("The JSON file must contain the keys 'seqs', 'meta', and 'out'")

    resolved = argparse.Namespace()
    custom_actions = (_ParseFilter, _ParseGenomePosition, _ParseDateRange)

    for action in parser._actions:
        dest = action.dest
        if dest not in loaded:
            # Not present in the JSON file: fall back to the parser default
            setattr(resolved, dest, action.default)
        elif isinstance(action, custom_actions):
            # Custom actions are invoked by hand so the value gets parsed
            action(parser, resolved, loaded[dest])
        else:
            setattr(resolved, dest, loaded[dest])

    return resolved
|
350
|
+
|
351
|
+
def _simple_serializer(k: str, v: object) -> object:
    """
    Simple serializer to convert the arguments to JSON.

    Only the ``date_range`` argument needs converting: its datetimes are written back in the ``START..END`` form (YYYY-MM-DD, an open side left empty). Every other value, and a ``date_range`` of ``None``, is returned unchanged.

    :param k: the key of the argument.
    :type k: str
    :param v: the value of the argument.
    :type v: object
    :return: the serialized value.
    :rtype: object
    """

    # date_range defaults to None when the option is not given; iterating
    # over it would raise TypeError
    if k == "date_range" and v is not None:
        return "..".join(
            bound.strftime("%Y-%m-%d") if bound else ""
            for bound in v
        )
    return v
|
366
|
+
|
367
|
+
def _main():
    """
    Command line interface for :class:`PyEvoMotion`.

    It parses the arguments from the command line and runs the analysis with the specified parameters.

    :raises SystemExit: with code 0 once all the results have been written.
    """
    # The docstring must come first in the body to be picked up as ``__doc__``
    check_and_install_mafft()

    print(BANNER)
    args = _parse_arguments()

    # If the -xj argument is passed, the arguments are exported to a JSON file before running the analysis altogether
    if args.export_json:
        with open(f"{args.out}_run_args.json", "w") as file:
            json.dump(
                {
                    k: _simple_serializer(k, v)
                    for k, v in vars(args).items()
                    if k not in ["export_json", "import_json"]
                },
                file,
                indent=4
            )

    # Instantiates the PyEvoMotion class, which parses the data on construction
    instance = PyEvoMotion(
        args.seqs,
        args.meta,
        dt=args.delta_t,
        filters=args.filter,
        positions=args.genome_positions,
        date_range=args.date_range,
    )

    # Exports the data to a TSV file
    instance.data.to_csv(
        f"{args.out}.tsv",
        sep="\t",
        index=False
    )

    # Runs the analysis
    stats, reg = instance.analysis(
        length=args.length_filter,
        n_threshold=args.n_threshold,
        show=args.show,
        mutation_kind=args.kind,
        export_plots_filename=(
            f"{args.out}_plots"
            if args.export_plots
            else None
        )
    )

    # Builds a copy of the regression results without the "model" entries.
    # New inner dicts are created so that ``reg`` itself is left untouched
    # (a shallow ``reg.copy()`` would share, and so mutate, the inner dicts).
    # NOTE(review): "model" is presumably dropped because it is not JSON serialisable - confirm
    _reg = {
        name: {key: val for key, val in result.items() if key != "model"}
        for name, result in reg.items()
    }

    # Exports the statistic results to TSV file
    stats.to_csv(
        f"{args.out}_stats.tsv",
        sep="\t",
        index=False
    )

    # Exports the regression models to a JSON file
    with open(f"{args.out}_regression_results.json", "w") as file:
        json.dump(_reg, file, indent=4)

    # Exits the program with code 0 (success). SystemExit is raised directly
    # because the ``exit`` helper is only injected by the ``site`` module.
    raise SystemExit(0)
|
438
|
+
|
439
|
+
# Run the command line interface when this module is executed directly as a script.
if __name__ == "__main__":
    _main()
|
@@ -0,0 +1,7 @@
|
|
1
|
+
"""
|
2
|
+
The main functionality of the ``PyEvoMotion`` project is abstracted into the following classes:
|
3
|
+
|
4
|
+
* :class:`PyEvoMotion` - The main class that encapsulates the entire analysis.
|
5
|
+
* :class:`PyEvoMotionBase` - The base class that provides basic utility functions inherited by :class:`PyEvoMotion`.
|
6
|
+
* :class:`PyEvoMotionParser` - The class that provides the functionality to parse the input data for the analysis, inherited by :class:`PyEvoMotion`.
|
7
|
+
"""
|