clipkit 2.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- clipkit/__init__.py +1 -0
- clipkit/__main__.py +6 -0
- clipkit/api.py +71 -0
- clipkit/args_processing.py +65 -0
- clipkit/clipkit.py +232 -0
- clipkit/exceptions.py +6 -0
- clipkit/files.py +75 -0
- clipkit/helpers.py +78 -0
- clipkit/logger.py +6 -0
- clipkit/modes.py +20 -0
- clipkit/msa.py +462 -0
- clipkit/parser.py +316 -0
- clipkit/settings.py +2 -0
- clipkit/site_classification.py +40 -0
- clipkit/smart_gap_helper.py +139 -0
- clipkit/stats.py +37 -0
- clipkit/version.py +1 -0
- clipkit/warnings.py +25 -0
- clipkit/write.py +97 -0
- clipkit-2.7.0.dist-info/LICENSE.md +7 -0
- clipkit-2.7.0.dist-info/METADATA +147 -0
- clipkit-2.7.0.dist-info/RECORD +25 -0
- clipkit-2.7.0.dist-info/WHEEL +5 -0
- clipkit-2.7.0.dist-info/entry_points.txt +2 -0
- clipkit-2.7.0.dist-info/top_level.txt +1 -0
clipkit/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .api import clipkit
|
clipkit/__main__.py
ADDED
clipkit/api.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from typing import TextIO, Union
|
|
2
|
+
from tempfile import NamedTemporaryFile
|
|
3
|
+
|
|
4
|
+
from .clipkit import run
|
|
5
|
+
from .files import FileFormat
|
|
6
|
+
from .helpers import SeqType, write_msa
|
|
7
|
+
from .logger import logger
|
|
8
|
+
from .modes import TrimmingMode
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def clipkit(
    *,
    raw_alignment: Union[str, None] = None,
    input_file_path: Union[str, None] = None,
    output_file_path: Union[str, None] = None,
    mode: TrimmingMode = TrimmingMode.smart_gap,
    gaps: Union[float, None] = None,
    gap_characters=None,
    input_file_format=FileFormat.fasta,
    output_file_format=FileFormat.fasta,
    sequence_type=SeqType.aa,
    codon: bool = False,
    ends_only=False,
    threads: int = 1,
) -> tuple:
    """
    Programmatic entry point: trim an alignment and return the result.

    The alignment is taken from raw_alignment when it is non-empty (it is
    written to a temporary file first), otherwise from input_file_path.

    Returns a 2-tuple:
      * (trim_run, stats) when output_file_path is not given
      * (output_file_path, stats) when output_file_path is given; the trimmed
        alignment is written to that path before returning

    Calling this function disables the package logger (logger.disabled = True).
    """
    logger.disabled = True

    # override some options not currently available through programmatic interface
    complement = False
    use_log = False
    quiet = True
    auxiliary_file = None  # TODO: implement?

    input_temp_file = None
    output_temp_file = None
    try:
        if raw_alignment:
            input_temp_file = NamedTemporaryFile()
            input_temp_file.write(bytes(raw_alignment, "utf-8"))
            # make the bytes visible to the reader that opens the file by name
            input_temp_file.flush()

        if not output_file_path:
            output_temp_file = NamedTemporaryFile()

        trim_run, stats = run(
            input_temp_file.name if input_temp_file else input_file_path,
            input_file_format,
            output_temp_file.name if output_temp_file else output_file_path,
            output_file_format,
            auxiliary_file,
            sequence_type,
            gaps,
            gap_characters,
            complement,
            codon,
            TrimmingMode(mode),
            use_log,
            quiet,
            ends_only,
            threads,
        )
    finally:
        # the temporary files were previously never closed; release them
        # (closing also deletes them) whether or not trimming succeeded
        for temp_file in (input_temp_file, output_temp_file):
            if temp_file is not None:
                temp_file.close()

    if not output_file_path:
        return trim_run, stats

    write_msa(trim_run.msa, output_file_path, trim_run.output_file_format)
    return output_file_path, stats
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os.path
|
|
3
|
+
import sys
|
|
4
|
+
|
|
5
|
+
from .helpers import SeqType
|
|
6
|
+
from .modes import TrimmingMode
|
|
7
|
+
from .settings import DEFAULT_AA_GAP_CHARS
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def process_args(args) -> dict:
    """
    Process args from argparser and set defaults

    Returns a dict of keyword arguments (input_file, output_file,
    input_file_format, output_file_format, auxiliary_file, codon,
    sequence_type, complement, gaps, gap_characters, mode, use_log, quiet,
    ends_only, threads).

    Exits the process (SystemExit, status 1) after logging a warning when
      * the input file does not exist
      * input and output file names are identical
      * codon-based trimming is combined with the c3 mode
    """
    input_file = args.input
    output_file = args.output or f"{input_file}.clipkit"

    # exit with a non-zero status so shells and pipelines can detect the
    # failure (a bare sys.exit() reports success)
    if not os.path.isfile(input_file):
        logger.warning("Input file does not exist")
        sys.exit(1)

    if input_file == output_file:
        logger.warning("Input and output files can't have the same name.")
        sys.exit(1)

    # assign optional arguments
    complement = args.complementary or False
    codon = args.codon or False
    mode = TrimmingMode(args.mode) if args.mode else TrimmingMode.smart_gap
    gaps = float(args.gaps) if args.gaps is not None else 0.9
    # split the gap-character argument into its individual characters
    gap_characters = (
        list(args.gap_characters) if args.gap_characters is not None else None
    )
    auxiliary_file = args.auxiliary_file
    use_log = args.log or False
    quiet = args.quiet or False
    sequence_type = SeqType(args.sequence_type.lower()) if args.sequence_type else None

    if codon and mode == TrimmingMode.c3:
        logger.warning(
            "C3 and codon-based trimming are incompatible.\nCodon-based trimming removes whole codons while C3 removes every third codon position."
        )
        sys.exit(1)

    ends_only = args.ends_only or False
    # namespaces without a threads attribute fall back to 1
    threads = getattr(args, "threads", 1)

    return dict(
        input_file=input_file,
        output_file=output_file,
        input_file_format=args.input_file_format,
        output_file_format=args.output_file_format,
        auxiliary_file=auxiliary_file,
        codon=codon,
        sequence_type=sequence_type,
        complement=complement,
        gaps=gaps,
        gap_characters=gap_characters,
        mode=mode,
        use_log=use_log,
        quiet=quiet,
        ends_only=ends_only,
        threads=threads,
    )
|
clipkit/clipkit.py
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import sys
|
|
5
|
+
import time
|
|
6
|
+
from typing import Union
|
|
7
|
+
|
|
8
|
+
from Bio.Align import MultipleSeqAlignment
|
|
9
|
+
from .args_processing import process_args
|
|
10
|
+
from .exceptions import InvalidInputFileFormat
|
|
11
|
+
from .files import (
|
|
12
|
+
get_alignment_and_format,
|
|
13
|
+
FileFormat,
|
|
14
|
+
write_debug_log_file,
|
|
15
|
+
get_custom_sites_to_trim,
|
|
16
|
+
)
|
|
17
|
+
from .helpers import (
|
|
18
|
+
create_msa,
|
|
19
|
+
get_seq_type,
|
|
20
|
+
get_gap_chars,
|
|
21
|
+
write_msa,
|
|
22
|
+
write_complement,
|
|
23
|
+
SeqType,
|
|
24
|
+
)
|
|
25
|
+
from .logger import logger, log_file_logger
|
|
26
|
+
from .modes import TrimmingMode
|
|
27
|
+
from .msa import MSA
|
|
28
|
+
from .parser import create_parser
|
|
29
|
+
from .settings import DEFAULT_AA_GAP_CHARS, DEFAULT_NT_GAP_CHARS
|
|
30
|
+
from .smart_gap_helper import smart_gap_threshold_determination
|
|
31
|
+
from .version import __version__ as current_version
|
|
32
|
+
from .warnings import (
|
|
33
|
+
warn_if_all_sites_were_trimmed,
|
|
34
|
+
warn_if_entry_contains_only_gaps,
|
|
35
|
+
)
|
|
36
|
+
from .write import (
|
|
37
|
+
write_user_args,
|
|
38
|
+
write_output_stats,
|
|
39
|
+
write_output_files_message,
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
from dataclasses import dataclass
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class TrimRun:
    """
    Bundle of the objects and settings describing one trimming run.

    The two properties convert the wrapped MSA back into Biopython
    alignments on every access.
    """

    # alignment object as passed in by the creator of this record
    alignment: MultipleSeqAlignment
    # MSA wrapper the properties below delegate to
    msa: MSA
    gap_characters: list
    sequence_type: SeqType
    input_file_format: FileFormat
    output_file_format: FileFormat
    gaps: float
    codon: bool
    # defaults to the package version imported as current_version
    version: str = current_version

    @property
    def complement(self):
        """
        Result of msa.complement_to_bio_msa()
        """
        complement_alignment = self.msa.complement_to_bio_msa()
        return complement_alignment

    @property
    def trimmed(self):
        """
        Result of msa.to_bio_msa()
        """
        trimmed_alignment = self.msa.to_bio_msa()
        return trimmed_alignment
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def run(
    input_file: str,
    input_file_format: FileFormat,
    output_file: str,
    output_file_format: FileFormat,
    auxiliary_file: str,
    sequence_type: Union[SeqType, None],
    gaps: float,
    gap_characters: Union[list, None],
    complement: bool,
    codon: bool,
    mode: TrimmingMode,
    use_log: bool,
    quiet: bool,
    ends_only: bool,
    threads: int = 1,
):
    """
    Read an alignment, trim it with the requested mode and return the result.

    Missing settings are filled in here: the sequence type is determined from
    the alignment, gap characters from the sequence type, and the output
    format from the input format. For the smart-gap modes the gaps threshold
    is replaced by the computed one; for the cst mode the sites to trim are
    read from auxiliary_file.

    output_file, complement, use_log and quiet are accepted but not used in
    this function.

    Returns a (TrimRun, stats) tuple, where stats is msa.stats.

    Raises InvalidInputFileFormat (after logging an error) when the input
    cannot be read.
    """
    try:
        alignment, input_file_format = get_alignment_and_format(
            input_file, input_file_format
        )
    except InvalidInputFileFormat:
        accepted_formats = ", ".join(file_format.value for file_format in FileFormat)
        logger.error(
            f"Format type could not be read.\nPlease check acceptable input file formats: {accepted_formats}"
        )
        # Returning the (None) result of logger.error made callers fail with
        # an unrelated TypeError while unpacking; surface the real error.
        raise

    sequence_type = sequence_type or get_seq_type(alignment)

    if not gap_characters:
        gap_characters = get_gap_chars(sequence_type)

    if not output_file_format:
        output_file_format = input_file_format
    else:
        output_file_format = FileFormat(output_file_format)

    # determine smart_gap threshold
    if mode in {
        TrimmingMode.smart_gap,
        TrimmingMode.kpi_smart_gap,
        TrimmingMode.kpic_smart_gap,
    }:
        gaps = smart_gap_threshold_determination(alignment, gap_characters)

    site_positions_to_trim = None
    if mode == TrimmingMode.cst:
        aln_length = alignment.get_alignment_length()
        site_positions_to_trim = (
            get_custom_sites_to_trim(auxiliary_file, aln_length) or []
        )

    msa = create_msa(alignment, gap_characters, threads)
    msa.trim(
        mode,
        gap_threshold=gaps,
        site_positions_to_trim=site_positions_to_trim,
        codon=codon,
        ends_only=ends_only,
    )

    trim_run = TrimRun(
        alignment,
        msa,
        gap_characters,
        sequence_type,
        input_file_format,
        output_file_format,
        gaps,
        codon,
    )

    return trim_run, msa.stats
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def execute(
    input_file: str,
    input_file_format: FileFormat,
    output_file: str,
    output_file_format: FileFormat,
    sequence_type: Union[SeqType, None],
    gaps: float,
    gap_characters: Union[list, None],
    complement: bool,
    codon: bool,
    ends_only: bool,
    mode: TrimmingMode,
    use_log: bool,
    quiet: bool,
    auxiliary_file: str = None,
    threads: int = 1,
    **kwargs,
) -> None:
    """
    Command-line workflow: trim the alignment, report the settings used,
    write the trimmed alignment (plus the complement and the debug log when
    requested) and finally report the trimming statistics.

    Extra keyword arguments are accepted and ignored.
    """
    if use_log:
        # debug records go to "<output_file>.log" and do not propagate
        log_file_logger.setLevel(logging.DEBUG)
        log_file_logger.propagate = False
        log_file_handler = logging.FileHandler(f"{output_file}.log", mode="w")
        log_file_handler.setLevel(logging.DEBUG)
        log_file_logger.addHandler(log_file_handler)

    if quiet:
        logger.disabled = True

    # for reporting runtime duration to user
    start_time = time.time()

    trim_run, stats = run(
        input_file=input_file,
        input_file_format=input_file_format,
        output_file=output_file,
        output_file_format=output_file_format,
        auxiliary_file=auxiliary_file,
        sequence_type=sequence_type,
        gaps=gaps,
        gap_characters=gap_characters,
        complement=complement,
        codon=codon,
        mode=mode,
        use_log=use_log,
        quiet=quiet,
        ends_only=ends_only,
        threads=threads,
    )

    # display to user what args are being used in stdout
    # (values resolved during the run are taken from trim_run)
    write_user_args(
        input_file,
        trim_run.input_file_format,
        output_file,
        trim_run.output_file_format,
        trim_run.sequence_type,
        trim_run.gaps,
        trim_run.gap_characters,
        mode,
        complement,
        codon,
        use_log,
        ends_only,
    )

    write_output_files_message(output_file, complement, use_log)

    trimmed_msa = trim_run.msa
    resolved_output_format = trim_run.output_file_format

    if use_log:
        warn_if_all_sites_were_trimmed(trimmed_msa)
        warn_if_entry_contains_only_gaps(trimmed_msa)
        write_debug_log_file(trimmed_msa)

    write_msa(trimmed_msa, output_file, resolved_output_format)

    # if the -c/--complementary argument was used, create an alignment of the trimmed sequences
    if complement:
        write_complement(trimmed_msa, output_file, resolved_output_format)

    write_output_stats(stats, start_time)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def main(argv=None):
    """
    Function that parses and collects arguments

    argv is the list of command-line arguments to parse (without the program
    name). When it is None, argparse reads sys.argv[1:] itself.
    """
    parser = create_parser()
    # argv used to be ignored; forward it so callers can supply arguments
    args = parser.parse_args(argv)

    execute(**process_args(args))


if __name__ == "__main__":
    main(sys.argv[1:])
|
clipkit/exceptions.py
ADDED
clipkit/files.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
from .logger import log_file_logger
|
|
3
|
+
|
|
4
|
+
from Bio import AlignIO
|
|
5
|
+
from Bio.Align import MultipleSeqAlignment
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
from .exceptions import InvalidInputFileFormat
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class FileFormat(Enum):
    """
    Alignment file formats known to the package.

    Iteration follows the declaration order below.
    """

    fasta = "fasta"
    clustal = "clustal"
    maf = "maf"
    mauve = "mauve"
    # PHYLIP and its variants
    phylip = "phylip"
    phylip_sequential = "phylip_sequential"
    phylip_relaxed = "phylip_relaxed"
    stockholm = "stockholm"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def get_alignment_and_format(
    input_file_name: str, file_format: FileFormat
) -> tuple[MultipleSeqAlignment, FileFormat]:
    """
    Read the alignment file, using the given file format or, when none is
    given, the first format of FileFormat that parses the file.

    Returns the alignment together with the FileFormat it was read with.

    Raises InvalidInputFileFormat when no format could be auto-detected.
    """

    if file_format:
        file_format = FileFormat(file_format)
        # Biopython spells the PHYLIP variants with hyphens
        # ("phylip-relaxed", "phylip-sequential"), the enum with underscores
        with open(input_file_name) as handle:
            alignment = AlignIO.read(handle, file_format.value.replace("_", "-"))
        return alignment, file_format

    # attempt to auto-detect file format
    for candidate in FileFormat:
        try:
            # open inside a with-block so a failed attempt does not leak
            # its file handle
            with open(input_file_name) as handle:
                alignment = AlignIO.read(handle, candidate.value.replace("_", "-"))
        # the following exceptions refer to skipping over errors
        # associated with reading the wrong input file
        except (ValueError, AssertionError):
            continue
        return alignment, candidate

    raise InvalidInputFileFormat("File could not be read")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def get_custom_sites_to_trim(file_path: str, aln_length: int) -> list:
    """
    Read the custom-site file and return the 0-based positions to trim.

    Each non-blank line holds two tab-separated columns: a 1-based site
    number and a label. Sites labelled "trim" are trimmed; any other label
    marks the site as kept. When the file lists no "trim" sites at all,
    every position in range(aln_length) that is not listed as kept is
    returned instead.
    """
    with open(file_path) as f:
        lines = f.read().splitlines()

    sites_to_trim = []
    sites_to_keep = []
    for line in lines:
        # tolerate blank lines instead of failing on int("")
        if not line.strip():
            continue
        site = line.split("\t")
        pos = int(site[0]) - 1
        # strip so that stray whitespace around the label cannot silently
        # turn a "trim" entry into a keep
        if site[1].strip() == "trim":
            sites_to_trim.append(pos)
        else:
            sites_to_keep.append(pos)

    if not sites_to_trim:
        # we only had keeps so treat every other site as a trim
        remaining = np.setdiff1d(
            np.arange(aln_length), np.array(sites_to_keep, dtype=int)
        )
        # plain ints, matching what the "trim" branch returns
        sites_to_trim = [int(pos) for pos in remaining]

    return sites_to_trim
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def write_debug_log_file(msa):
    """
    Send one debug record per entry of msa.generate_debug_log_info() to the
    log-file logger. The first field is shifted by one (0-based to 1-based)
    and the third field is reported through its .value.
    """
    for entry in msa.generate_debug_log_info():
        site_number = str(entry[0] + 1)
        log_file_logger.debug(f"{site_number} {entry[1]} {entry[2].value} {entry[3]}")
|
clipkit/helpers.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
from Bio import SeqIO
|
|
4
|
+
from Bio.Align import MultipleSeqAlignment
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
from .msa import MSA
|
|
8
|
+
from .modes import TrimmingMode
|
|
9
|
+
from .settings import DEFAULT_AA_GAP_CHARS, DEFAULT_NT_GAP_CHARS
|
|
10
|
+
from .files import FileFormat
|
|
11
|
+
from .stats import TrimmingStats
|
|
12
|
+
|
|
13
|
+
from enum import Enum
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class SeqType(Enum):
    """
    Sequence type of an alignment, one of "aa" or "nt".
    """

    aa = "aa"
    nt = "nt"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def remove_gaps(seq: str, gap_chars: list[str] = DEFAULT_AA_GAP_CHARS) -> str:
    """
    Return seq with every occurrence of the given gap characters removed.
    Each gap character is matched literally.
    """
    escaped_chars = [re.escape(gap_char) for gap_char in gap_chars]
    gap_pattern = "|".join(escaped_chars)
    return re.sub(gap_pattern, "", seq)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_seq_type(alignment: MultipleSeqAlignment) -> SeqType:
    """
    Classify the alignment by the number of distinct symbols it contains.

    The first sequence (gaps removed) is inspected; when it has fewer than
    200 symbols, all sequences of the alignment are pooled instead. More
    than 5 distinct symbols (case-insensitive) is reported as SeqType.aa,
    anything else as SeqType.nt.
    """
    residues = remove_gaps(str(alignment[0].seq))
    if len(residues) < 200:
        pooled = "".join([str(record.seq) for record in alignment])
        residues = remove_gaps(pooled)

    distinct_symbols = set(residues.upper())
    return SeqType.aa if len(distinct_symbols) > 5 else SeqType.nt
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def get_gap_chars(seq_type: SeqType) -> list[str]:
    """
    Return the default gap characters for the sequence type: the nucleotide
    defaults for SeqType.nt, the amino-acid defaults for anything else.
    """
    return DEFAULT_NT_GAP_CHARS if seq_type == SeqType.nt else DEFAULT_AA_GAP_CHARS
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def create_msa(alignment: MultipleSeqAlignment, gap_chars: list[str] = None, threads: int = 1) -> MSA:
    """
    Build an MSA from a Biopython alignment by delegating to
    MSA.from_bio_msa with the same arguments.
    """
    msa = MSA.from_bio_msa(alignment, gap_chars, threads)
    return msa
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def write_msa(msa: MSA, out_file_name: str, out_file_format: FileFormat) -> None:
    """
    Write msa.to_bio_msa() to out_file_name in the requested format.

    msa is populated with sites that are kept after trimming is finished
    """
    output_msa = msa.to_bio_msa()
    # the two PHYLIP variants are passed to SeqIO with hyphenated names
    seqio_format_names = {
        "phylip_relaxed": "phylip-relaxed",
        "phylip_sequential": "phylip-sequential",
    }
    requested = out_file_format.value
    SeqIO.write(output_msa, out_file_name, seqio_format_names.get(requested, requested))
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def write_complement(msa: MSA, out_file: str, out_file_format: FileFormat) -> None:
    """
    Write msa.complement_to_bio_msa() to "<out_file>.complement" in the
    requested format.

    msa is populated with sites that are trimmed after trimming is finished
    """
    output_msa = msa.complement_to_bio_msa()
    complement_out = str(out_file) + ".complement"

    # SeqIO uses hyphenated names for the PHYLIP variants. These cases used
    # to write the complement over out_file (the trimmed alignment) and then
    # write again with the unmapped format name.
    if out_file_format.value == "phylip_relaxed":
        seqio_format = "phylip-relaxed"
    elif out_file_format.value == "phylip_sequential":
        seqio_format = "phylip-sequential"
    else:
        seqio_format = out_file_format.value

    SeqIO.write(output_msa, complement_out, seqio_format)
|
clipkit/logger.py
ADDED
clipkit/modes.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
from typing import TYPE_CHECKING
|
|
3
|
+
from .logger import log_file_logger
|
|
4
|
+
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from Bio.Align import MultipleSeqAlignment
|
|
7
|
+
from .msa import MSA
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TrimmingMode(Enum):
    """
    Names of the available trimming modes; each value is the string form
    of the mode.
    """

    gappy = "gappy"
    smart_gap = "smart-gap"
    # kpi: keep parsimony informative sites
    kpi = "kpi"
    kpi_gappy = "kpi-gappy"
    kpi_smart_gap = "kpi-smart-gap"
    # kpic: keep parsimony informative and constant sites
    kpic = "kpic"
    kpic_gappy = "kpic-gappy"
    kpic_smart_gap = "kpic-smart-gap"
    # cst: custom site trimming
    cst = "cst"
    c3 = "c3"
|