clipkit 2.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
clipkit/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .api import clipkit
clipkit/__main__.py ADDED
@@ -0,0 +1,6 @@
1
+ """clipkit.__main__: executed when clipkit is called as script"""
2
import sys

from .clipkit import main

# Hand the command-line arguments (program name stripped) to the CLI entry point.
main(sys.argv[1:])
clipkit/api.py ADDED
@@ -0,0 +1,71 @@
1
+ from typing import TextIO, Union
2
+ from tempfile import NamedTemporaryFile
3
+
4
+ from .clipkit import run
5
+ from .files import FileFormat
6
+ from .helpers import SeqType, write_msa
7
+ from .logger import logger
8
+ from .modes import TrimmingMode
9
+
10
+
11
def clipkit(
    *,
    raw_alignment: Union[str, None] = None,
    input_file_path: Union[str, None] = None,
    output_file_path: Union[str, None] = None,
    mode: TrimmingMode = TrimmingMode.smart_gap,
    gaps: Union[float, None] = None,
    gap_characters=None,
    input_file_format=FileFormat.fasta,
    output_file_format=FileFormat.fasta,
    sequence_type=SeqType.aa,
    codon: bool = False,
    ends_only=False,
    threads: int = 1,
) -> tuple:
    """
    Programmatic entry point: trim an alignment and return the result.

    The alignment comes either from ``raw_alignment`` (a string, which is
    written to a temporary file first) or from ``input_file_path``. When both
    are given, ``raw_alignment`` takes precedence.

    Returns
    -------
    tuple
        ``(trim_run, stats)`` when ``output_file_path`` is not given, otherwise
        the trimmed alignment is written to ``output_file_path`` and
        ``(output_file_path, stats)`` is returned.

    Notes
    -----
    The module logger is disabled for the duration of the process
    (``logger.disabled = True`` is not restored).
    """
    logger.disabled = True

    # options that are not currently exposed through the programmatic interface
    complement = False
    use_log = False
    quiet = True
    auxiliary_file = None  # TODO: implement?

    input_temp_file = None
    output_temp_file = None
    try:
        if raw_alignment:
            input_temp_file = NamedTemporaryFile()
            input_temp_file.write(raw_alignment.encode("utf-8"))
            # flush so that run() sees the full content when it opens the path
            input_temp_file.flush()

        if not output_file_path:
            output_temp_file = NamedTemporaryFile()

        trim_run, stats = run(
            input_temp_file.name if input_temp_file else input_file_path,
            input_file_format,
            output_temp_file.name if output_temp_file else output_file_path,
            output_file_format,
            auxiliary_file,
            sequence_type,
            gaps,
            gap_characters,
            complement,
            codon,
            TrimmingMode(mode),
            use_log,
            quiet,
            ends_only,
            threads,
        )
    finally:
        # always release (and thereby delete) the temporary files, even when
        # run() raises; the originals were left open until garbage collection
        for temp_file in (input_temp_file, output_temp_file):
            if temp_file is not None:
                temp_file.close()

    if not output_file_path:
        return trim_run, stats

    write_msa(trim_run.msa, output_file_path, trim_run.output_file_format)
    return output_file_path, stats
clipkit/args_processing.py ADDED
@@ -0,0 +1,65 @@
1
+ import logging
2
+ import os.path
3
+ import sys
4
+
5
+ from .helpers import SeqType
6
+ from .modes import TrimmingMode
7
+ from .settings import DEFAULT_AA_GAP_CHARS
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
def process_args(args) -> dict:
    """
    Validate the parsed command-line arguments and fill in defaults.

    ``args`` is the namespace produced by the argument parser. The returned
    dict holds the keyword arguments consumed by ``execute``.

    Exits the process with a non-zero status (after logging a warning) when
    the input file does not exist, when input and output paths are identical,
    or when codon-based trimming is combined with the C3 mode.
    """
    input_file = args.input
    output_file = args.output or f"{input_file}.clipkit"

    if not os.path.isfile(input_file):
        logger.warning("Input file does not exist")
        # non-zero status so that shells and pipelines can detect the failure
        sys.exit(1)

    if input_file == output_file:
        logger.warning("Input and output files can't have the same name.")
        sys.exit(1)

    # assign optional arguments
    complement = args.complementary or False
    codon = args.codon or False
    mode = TrimmingMode(args.mode) if args.mode else TrimmingMode.smart_gap
    gaps = float(args.gaps) if args.gaps is not None else 0.9
    gap_characters = (
        list(args.gap_characters) if args.gap_characters is not None else None
    )
    auxiliary_file = args.auxiliary_file
    use_log = args.log or False
    quiet = args.quiet or False
    sequence_type = SeqType(args.sequence_type.lower()) if args.sequence_type else None

    if codon and mode == TrimmingMode.c3:
        logger.warning(
            "C3 and codon-based trimming are incompatible.\nCodon-based trimming removes whole codons while C3 removes every third codon position."
        )
        sys.exit(1)

    ends_only = args.ends_only or False
    # namespaces without a ``threads`` attribute fall back to one thread
    threads = getattr(args, "threads", 1)

    return dict(
        input_file=input_file,
        output_file=output_file,
        input_file_format=args.input_file_format,
        output_file_format=args.output_file_format,
        auxiliary_file=auxiliary_file,
        codon=codon,
        sequence_type=sequence_type,
        complement=complement,
        gaps=gaps,
        gap_characters=gap_characters,
        mode=mode,
        use_log=use_log,
        quiet=quiet,
        ends_only=ends_only,
        threads=threads,
    )
clipkit/clipkit.py ADDED
@@ -0,0 +1,232 @@
1
+ #!/usr/bin/env python
2
+
3
+ import logging
4
+ import sys
5
+ import time
6
+ from typing import Union
7
+
8
+ from Bio.Align import MultipleSeqAlignment
9
+ from .args_processing import process_args
10
+ from .exceptions import InvalidInputFileFormat
11
+ from .files import (
12
+ get_alignment_and_format,
13
+ FileFormat,
14
+ write_debug_log_file,
15
+ get_custom_sites_to_trim,
16
+ )
17
+ from .helpers import (
18
+ create_msa,
19
+ get_seq_type,
20
+ get_gap_chars,
21
+ write_msa,
22
+ write_complement,
23
+ SeqType,
24
+ )
25
+ from .logger import logger, log_file_logger
26
+ from .modes import TrimmingMode
27
+ from .msa import MSA
28
+ from .parser import create_parser
29
+ from .settings import DEFAULT_AA_GAP_CHARS, DEFAULT_NT_GAP_CHARS
30
+ from .smart_gap_helper import smart_gap_threshold_determination
31
+ from .version import __version__ as current_version
32
+ from .warnings import (
33
+ warn_if_all_sites_were_trimmed,
34
+ warn_if_entry_contains_only_gaps,
35
+ )
36
+ from .write import (
37
+ write_user_args,
38
+ write_output_stats,
39
+ write_output_files_message,
40
+ )
41
+
42
+ from dataclasses import dataclass
43
+
44
+
45
@dataclass
class TrimRun:
    """
    Record of a single trimming run: the parsed input alignment, the MSA
    object it was trimmed through, and the settings that were in effect.
    """

    alignment: MultipleSeqAlignment
    msa: MSA
    gap_characters: list
    sequence_type: SeqType
    input_file_format: FileFormat
    output_file_format: FileFormat
    gaps: float
    codon: bool
    version: str = current_version

    @property
    def trimmed(self):
        """Result of ``msa.to_bio_msa()``."""
        return self.msa.to_bio_msa()

    @property
    def complement(self):
        """Result of ``msa.complement_to_bio_msa()``."""
        return self.msa.complement_to_bio_msa()
64
+
65
+
66
def run(
    input_file: str,
    input_file_format: FileFormat,
    output_file: str,
    output_file_format: FileFormat,
    auxiliary_file: str,
    sequence_type: Union[SeqType, None],
    gaps: float,
    gap_characters: Union[list, None],
    complement: bool,
    codon: bool,
    mode: TrimmingMode,
    use_log: bool,
    quiet: bool,
    ends_only: bool,
    threads: int = 1,
):
    """
    Read the alignment, resolve unset options and trim it.

    Returns ``(trim_run, msa.stats)`` on success. If the input cannot be
    parsed (InvalidInputFileFormat) an error is logged and the return value of
    ``logger.error`` is returned instead of a tuple, so callers that unpack
    the result will fail in that case.

    ``output_file``, ``complement``, ``use_log`` and ``quiet`` are accepted
    but not used inside this function.
    """
    try:
        alignment, input_file_format = get_alignment_and_format(
            input_file, input_file_format
        )
    except InvalidInputFileFormat:
        accepted_formats = ", ".join([fmt.value for fmt in FileFormat])
        return logger.error(
            f"Format type could not be read.\nPlease check acceptable input file formats: {accepted_formats}"
        )

    # fall back to detected / default values for anything left unset
    if not sequence_type:
        sequence_type = get_seq_type(alignment)
    gap_characters = gap_characters or get_gap_chars(sequence_type)
    output_file_format = (
        FileFormat(output_file_format) if output_file_format else input_file_format
    )

    # smart-gap modes derive their own gap threshold from the alignment
    smart_gap_modes = {
        TrimmingMode.smart_gap,
        TrimmingMode.kpi_smart_gap,
        TrimmingMode.kpic_smart_gap,
    }
    if mode in smart_gap_modes:
        gaps = smart_gap_threshold_determination(alignment, gap_characters)

    # custom site trimming reads the positions from the auxiliary file
    if mode == TrimmingMode.cst:
        alignment_length = alignment.get_alignment_length()
        site_positions_to_trim = (
            get_custom_sites_to_trim(auxiliary_file, alignment_length) or []
        )
    else:
        site_positions_to_trim = None

    msa = create_msa(alignment, gap_characters, threads)
    msa.trim(
        mode,
        gap_threshold=gaps,
        site_positions_to_trim=site_positions_to_trim,
        codon=codon,
        ends_only=ends_only,
    )

    result = TrimRun(
        alignment,
        msa,
        gap_characters,
        sequence_type,
        input_file_format,
        output_file_format,
        gaps,
        codon,
    )
    return result, msa.stats
138
+
139
+
140
def _enable_log_file(output_file: str) -> None:
    """Send log_file_logger records to ``<output_file>.log`` at DEBUG level."""
    log_file_logger.setLevel(logging.DEBUG)
    log_file_logger.propagate = False
    file_handler = logging.FileHandler(f"{output_file}.log", mode="w")
    file_handler.setLevel(logging.DEBUG)
    log_file_logger.addHandler(file_handler)


def execute(
    input_file: str,
    input_file_format: FileFormat,
    output_file: str,
    output_file_format: FileFormat,
    sequence_type: Union[SeqType, None],
    gaps: float,
    gap_characters: Union[list, None],
    complement: bool,
    codon: bool,
    ends_only: bool,
    mode: TrimmingMode,
    use_log: bool,
    quiet: bool,
    auxiliary_file: str = None,
    threads: int = 1,
    **kwargs,
) -> None:
    """
    Command-line workflow: trim the alignment, report the settings and
    statistics, and write the output file(s).

    Writes the trimmed alignment to ``output_file``; additionally writes the
    complement when ``complement`` is set and a debug log when ``use_log`` is
    set. Extra keyword arguments are accepted and ignored.
    """
    if use_log:
        _enable_log_file(output_file)

    if quiet:
        logger.disabled = True

    # reference point for the runtime reported at the end
    started_at = time.time()

    trim_run, stats = run(
        input_file,
        input_file_format,
        output_file,
        output_file_format,
        auxiliary_file,
        sequence_type,
        gaps,
        gap_characters,
        complement,
        codon,
        mode,
        use_log,
        quiet,
        ends_only,
        threads,
    )
    trimmed_msa = trim_run.msa
    resolved_output_format = trim_run.output_file_format

    # echo the effective arguments (as resolved by run) to the user
    write_user_args(
        input_file,
        trim_run.input_file_format,
        output_file,
        resolved_output_format,
        trim_run.sequence_type,
        trim_run.gaps,
        trim_run.gap_characters,
        mode,
        complement,
        codon,
        use_log,
        ends_only,
    )

    write_output_files_message(output_file, complement, use_log)

    if use_log:
        warn_if_all_sites_were_trimmed(trimmed_msa)
        warn_if_entry_contains_only_gaps(trimmed_msa)
        write_debug_log_file(trimmed_msa)

    write_msa(trimmed_msa, output_file, resolved_output_format)

    # -c/--complementary: also write an alignment of the sites that were trimmed
    if complement:
        write_complement(trimmed_msa, output_file, resolved_output_format)

    write_output_stats(stats, started_at)
219
+
220
+
221
def main(argv=None):
    """
    Parse the command-line arguments and run the trimming workflow.

    argv: list of argument strings without the program name. It is forwarded
    to the parser; with ``None`` the parser decides where to read arguments
    from (presumably an argparse parser, which then uses ``sys.argv[1:]``).
    """
    parser = create_parser()
    # forward argv: it was previously accepted but silently ignored
    args = parser.parse_args(argv)

    execute(**process_args(args))


if __name__ == "__main__":
    main(sys.argv[1:])
clipkit/exceptions.py ADDED
@@ -0,0 +1,6 @@
1
class ClipKITException(Exception):
    """Base class for the exceptions defined by this package."""


class InvalidInputFileFormat(ClipKITException):
    """Raised when an input file cannot be read in any known format."""
clipkit/files.py ADDED
@@ -0,0 +1,75 @@
1
+ from enum import Enum
2
+ from .logger import log_file_logger
3
+
4
+ from Bio import AlignIO
5
+ from Bio.Align import MultipleSeqAlignment
6
+ import numpy as np
7
+
8
+ from .exceptions import InvalidInputFileFormat
9
+
10
+
11
class FileFormat(Enum):
    """
    Alignment file formats known to the package.

    Declaration order matters: format auto-detection iterates over the
    members in this order.
    """

    fasta = "fasta"
    clustal = "clustal"
    maf = "maf"
    mauve = "mauve"
    # phylip family
    phylip = "phylip"
    phylip_sequential = "phylip_sequential"
    phylip_relaxed = "phylip_relaxed"
    stockholm = "stockholm"
20
+
21
+
22
def get_alignment_and_format(
    input_file_name: str, file_format: FileFormat
) -> tuple[MultipleSeqAlignment, FileFormat]:
    """
    Read the alignment file, auto-detecting its format when none is given.

    With a truthy ``file_format`` the file is read in exactly that format and
    any parsing error propagates. Otherwise every FileFormat member is tried
    in declaration order and the first one that parses wins.

    Raises InvalidInputFileFormat when auto-detection finds no working format.

    NOTE(review): the enum values are passed to AlignIO unchanged, while
    write_msa maps the underscore phylip names to hyphenated ones; confirm
    whether reading needs the same mapping.
    """
    if file_format:
        file_format = FileFormat(file_format)
        # context manager so the handle is closed once the alignment is parsed
        with open(input_file_name) as handle:
            alignment = AlignIO.read(handle, file_format.value)
        return alignment, file_format

    # attempt to auto-detect file format
    for candidate in FileFormat:
        try:
            with open(input_file_name) as handle:
                alignment = AlignIO.read(handle, candidate.value)
        except (ValueError, AssertionError):
            # raised when the file is read with the wrong format; try the next
            continue
        return alignment, candidate

    raise InvalidInputFileFormat("File could not be read")
48
+
49
+
50
def get_custom_sites_to_trim(file_path: str, aln_length: int) -> list:
    """
    Read the custom-site file and return the 0-based positions to trim.

    Each non-blank line holds a 1-based site number and an action separated
    by a tab. Sites whose action is "trim" are trimmed; any other action
    counts as "keep". If the file lists no "trim" sites, every position in
    ``range(aln_length)`` that is not listed as kept is trimmed.

    Blank lines are skipped and surrounding whitespace on the action is
    ignored.
    """
    with open(file_path) as f:
        lines = f.read().splitlines()

    sites_to_trim = []
    sites_to_keep = []
    for line in lines:
        if not line.strip():
            # tolerate empty lines, e.g. a trailing newline at end of file
            continue
        site = line.split("\t")
        pos = int(site[0]) - 1  # file is 1-based, positions are 0-based
        if site[1].strip() == "trim":
            sites_to_trim.append(pos)
        else:
            sites_to_keep.append(pos)

    if not sites_to_trim:
        # we only had keeps so treat every other site as a trim
        sites_to_trim = list(
            np.setdiff1d(np.arange(aln_length), np.array(sites_to_keep))
        )

    return sites_to_trim
71
+
72
+
73
def write_debug_log_file(msa):
    """
    Emit one debug record per entry of ``msa.generate_debug_log_info()``.

    The first field of each entry is written incremented by one, and the
    third field is written via its ``.value``.
    """
    for entry in msa.generate_debug_log_info():
        site_number = entry[0] + 1
        log_file_logger.debug(
            f"{site_number!s} {entry[1]} {entry[2].value} {entry[3]}"
        )
clipkit/helpers.py ADDED
@@ -0,0 +1,78 @@
1
+ import re
2
+
3
+ from Bio import SeqIO
4
+ from Bio.Align import MultipleSeqAlignment
5
+ import numpy as np
6
+
7
+ from .msa import MSA
8
+ from .modes import TrimmingMode
9
+ from .settings import DEFAULT_AA_GAP_CHARS, DEFAULT_NT_GAP_CHARS
10
+ from .files import FileFormat
11
+ from .stats import TrimmingStats
12
+
13
+ from enum import Enum
14
+
15
+
16
class SeqType(Enum):
    """Sequence alphabet of an alignment."""

    aa = "aa"
    nt = "nt"
19
+
20
+
21
def remove_gaps(seq: str, gap_chars: list[str] = DEFAULT_AA_GAP_CHARS) -> str:
    """Return ``seq`` with every occurrence of the given gap characters removed."""
    escaped_chars = (re.escape(char) for char in gap_chars)
    gap_regex = re.compile("|".join(escaped_chars))
    return gap_regex.sub("", seq)
24
+
25
+
26
def get_seq_type(alignment: MultipleSeqAlignment) -> SeqType:
    """
    Guess the sequence type from the number of distinct characters.

    The first sequence (gaps removed) is inspected; if it is shorter than 200
    characters, all sequences are concatenated and inspected instead. More
    than five distinct upper-cased characters means amino acids, otherwise
    nucleotides.
    """
    residues = remove_gaps(str(alignment[0].seq))
    if len(residues) < 200:
        all_sequences = "".join([str(record.seq) for record in alignment])
        residues = remove_gaps(all_sequences)

    distinct_characters = set(residues.upper())
    return SeqType.aa if len(distinct_characters) > 5 else SeqType.nt
39
+
40
+
41
def get_gap_chars(seq_type: SeqType) -> list[str]:
    """Return the default gap characters: the NT set for nucleotides, else the AA set."""
    return DEFAULT_NT_GAP_CHARS if seq_type == SeqType.nt else DEFAULT_AA_GAP_CHARS
46
+
47
+
48
def create_msa(
    alignment: MultipleSeqAlignment,
    gap_chars: list[str] = None,
    threads: int = 1,
) -> MSA:
    """
    Build an MSA instance from a Biopython alignment by delegating to
    ``MSA.from_bio_msa``.
    """
    msa = MSA.from_bio_msa(alignment, gap_chars, threads)
    return msa
53
+
54
+
55
def write_msa(msa: MSA, out_file_name: str, out_file_format: FileFormat) -> None:
    """
    Write the kept (post-trimming) sites of ``msa`` to ``out_file_name``.

    The two underscore phylip variants are translated to the hyphenated
    format names passed to SeqIO; every other format value is used as is.
    """
    seqio_format_names = {
        "phylip_relaxed": "phylip-relaxed",
        "phylip_sequential": "phylip-sequential",
    }
    format_value = out_file_format.value
    kept_sites = msa.to_bio_msa()
    SeqIO.write(
        kept_sites,
        out_file_name,
        seqio_format_names.get(format_value, format_value),
    )
66
+
67
+
68
def write_complement(msa: MSA, out_file: str, out_file_format: FileFormat) -> None:
    """
    Write the trimmed-away sites of ``msa`` to ``<out_file>.complement``.

    The complement is written exactly once, and only to the ``.complement``
    path. Previously the phylip variants wrote the complement over
    ``out_file`` itself (clobbering the trimmed alignment) and then wrote the
    ``.complement`` file again with the untranslated format name.

    The format-name translation mirrors write_msa: the underscore phylip
    variants become their hyphenated SeqIO names.
    """
    output_msa = msa.complement_to_bio_msa()
    complement_out = str(out_file) + ".complement"
    if out_file_format.value == "phylip_relaxed":
        SeqIO.write(output_msa, complement_out, "phylip-relaxed")
    elif out_file_format.value == "phylip_sequential":
        SeqIO.write(output_msa, complement_out, "phylip-sequential")
    else:
        SeqIO.write(output_msa, complement_out, out_file_format.value)
clipkit/logger.py ADDED
@@ -0,0 +1,6 @@
1
+ import logging
2
+ import sys
3
+
4
# Root logging goes to stdout as bare messages at INFO level and above.
logging.basicConfig(format="%(message)s", level=logging.INFO, stream=sys.stdout)

# User-facing logger for this module, and a separately named logger that
# other modules use for debug-log output.
logger = logging.getLogger(__name__)
log_file_logger = logging.getLogger("log_file")
clipkit/modes.py ADDED
@@ -0,0 +1,20 @@
1
+ from enum import Enum
2
+ from typing import TYPE_CHECKING
3
+ from .logger import log_file_logger
4
+
5
+ if TYPE_CHECKING:
6
+ from Bio.Align import MultipleSeqAlignment
7
+ from .msa import MSA
8
+
9
+
10
class TrimmingMode(Enum):
    """Trimming modes; each value is the mode's string identifier."""

    gappy = "gappy"
    smart_gap = "smart-gap"

    # kpi: keep parsimony informative sites
    kpi = "kpi"
    kpi_gappy = "kpi-gappy"
    kpi_smart_gap = "kpi-smart-gap"

    # kpic: keep parsimony informative and constant sites
    kpic = "kpic"
    kpic_gappy = "kpic-gappy"
    kpic_smart_gap = "kpic-smart-gap"

    # cst: custom site trimming
    cst = "cst"
    c3 = "c3"