REDItools3 3.1a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of REDItools3 might be problematic. Click here for more details.
- REDItools3-3.1a0.dist-info/LICENSE +674 -0
- REDItools3-3.1a0.dist-info/METADATA +36 -0
- REDItools3-3.1a0.dist-info/RECORD +21 -0
- REDItools3-3.1a0.dist-info/WHEEL +5 -0
- REDItools3-3.1a0.dist-info/top_level.txt +1 -0
- reditools/__init__.py +1 -0
- reditools/__main__.py +37 -0
- reditools/alignment_file.py +146 -0
- reditools/alignment_manager.py +136 -0
- reditools/analyze.py +552 -0
- reditools/compiled_position.py +133 -0
- reditools/compiled_reads.py +131 -0
- reditools/fasta_file.py +68 -0
- reditools/file_utils.py +132 -0
- reditools/homopolymerics.py +92 -0
- reditools/index.py +268 -0
- reditools/logger.py +44 -0
- reditools/reditools.py +456 -0
- reditools/region.py +130 -0
- reditools/rtchecks.py +274 -0
- reditools/utils.py +106 -0
reditools/rtchecks.py
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
"""Quality control for REDItools analyses."""
|
|
2
|
+
|
|
3
|
+
from reditools import utils
|
|
4
|
+
from reditools.logger import Logger
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class RTChecks(object):
    """
    Quality control for REDItools analyses.

    Holds an ordered list of check callables. Every check is invoked with
    the keyword arguments ``bases`` and ``rtools`` and returns True when the
    position should be kept, False when it should be discarded.
    """

    def __init__(self):
        """Create a RTChecks object."""
        # check_is_none is always first so that later checks are only
        # reached when it has passed.
        self.check_list = [self.check_is_none]

    def add(self, function):
        """
        Add a QC check.

        Parameters:
            function (RTChecks method): The check to perform
        """
        self.check_list.append(function)

    def discard(self, function):
        """
        Remove a QC check.

        Nothing happens if the check is not currently registered.

        Parameters:
            function (RTChecks method): The check to discard
        """
        if function in self.check_list:
            self.check_list.remove(function)

    def check(self, rtools, bases):
        """
        Perform QC.

        Parameters:
            rtools (REDItools): Object performing analysis
            bases (CompiledPosition): Base position under analysis

        Returns:
            (bool): True if all checks pass, else False
        """
        # Arguments are passed by keyword, so the individual checks may
        # declare ``bases`` and ``rtools`` in either order.
        return utils.check_list(
            self.check_list,
            bases=bases,
            rtools=rtools,
        )

    def check_splice_positions(self, rtools, bases):
        """
        Check if the contig and position are in a splice site.

        Parameters:
            rtools (REDItools): Object performing analysis
            bases (CompiledPosition): Base position under analysis

        Returns:
            (bool): True if the position is not a splice site.
        """
        contig = bases.contig
        if bases.position in rtools.splice_positions.get(contig, []):
            rtools.log(
                Logger.debug_level,
                '[SPLICE_SITE] Discarding ({}, {}) because in splice site',
                contig,
                bases.position,
            )
            return False
        return True

    def check_column_min_length(self, rtools, bases):
        """
        Check read depth.

        Parameters:
            rtools (REDItools): Object performing analysis
            bases (CompiledPosition): Base position under analysis

        Returns:
            (bool): True if the read depth is sufficient
        """
        if len(bases) < rtools.min_column_length:
            rtools.log(
                Logger.debug_level,
                'DISCARDING COLUMN {} [MIN_COLUMN_LENGTH={}]',
                len(bases),
                rtools.min_column_length,
            )
            return False
        return True

    # Really shouldn't use this one. I have to compute mean_q anyway
    def check_column_quality(self, rtools, bases):
        """
        Check mean quality of the position.

        Parameters:
            rtools (REDItools): Object performing analysis
            bases (CompiledPosition): Base position under analysis

        Returns:
            (bool): True if quality is sufficient
        """
        # A falsy (empty) position gets a mean quality of 0 instead of
        # triggering a division by zero.
        if bases:
            mean_q = sum(bases.qualities) / len(bases)
        else:
            mean_q = 0
        if mean_q < rtools.min_read_quality:
            rtools.log(
                Logger.debug_level,
                'DISCARD COLUMN mean_quality={} < {}',
                mean_q,
                rtools.min_read_quality,
            )
            return False
        return True

    def check_column_edit_frequency(self, rtools, bases):
        """
        Check the number of edits at the site.

        The edit count is the total depth minus the count stored under
        the 'REF' key of the position.

        Parameters:
            rtools (REDItools): Object performing analysis
            bases (CompiledPosition): Base position under analysis

        Returns:
            (bool): True if there are sufficient edits.
        """
        edits_no = len(bases) - bases['REF']
        if edits_no < rtools.min_edits:
            rtools.log(
                Logger.debug_level,
                'DISCARDING COLUMN edits={} < {}',
                edits_no,
                rtools.min_edits,
            )
            return False
        return True

    def check_column_min_edits(self, rtools, bases):
        """
        Check that there are sufficient edit events for each base.

        Parameters:
            rtools (REDItools): Object performing analysis
            bases (CompiledPosition): Base position under analysis

        Returns:
            (bool): True if there are sufficient edits
        """
        for num_edits in bases.get_min_edits():
            # A count of zero is tolerated; only counts that are present
            # but below the threshold cause a discard.
            if 0 < num_edits < rtools.min_edits_per_nucleotide:
                rtools.log(
                    Logger.debug_level,
                    'DISCARDING COLUMN edits={} < {}',
                    num_edits,
                    rtools.min_edits_per_nucleotide,
                )
                return False
        return True

    def check_multiple_alts(self, bases, rtools):
        """
        Check that there is, at most, one alternate base.

        Parameters:
            bases (CompiledPosition): Base position under analysis
            rtools (REDItools): Object running the analysis

        Returns:
            (bool): True if there is zero or one alt
        """
        alts = bases.get_variants()
        # Discard only when more than one alternate base is present.
        if len(alts) > 1:
            rtools.log(
                Logger.debug_level,
                'DISCARD COLUMN alts={} > 1',
                len(alts),
            )
            return False
        return True

    def check_is_none(self, bases, rtools):
        """
        Check if the bases object is None.

        Parameters:
            bases (CompiledPosition): Data for analysis
            rtools (REDItools): Object running the analysis

        Returns:
            (bool): True if bases is not None
        """
        if bases is None:
            rtools.log(Logger.debug_level, 'DISCARD COLUMN no reads')
            return False
        return True

    def check_target_positions(self, bases, rtools):
        """
        Check if the bases object is in a target region.

        Parameters:
            bases (CompiledPosition): Data for analysis
            rtools (REDItools): Object running the analysis

        Returns:
            (bool): True if the position is in a target region
        """
        targets = rtools.target_positions.get(bases.contig, [])
        if bases.position not in targets:
            rtools.log(
                Logger.debug_level,
                'DISCARD COLUMN not in target positions',
            )
            return False
        return True

    def check_ref(self, bases, rtools):
        """
        Check if the reference base is of interest.

        Parameters:
            bases (CompiledPosition): Data for analysis
            rtools (REDItools): Object running the analysis

        Returns:
            (bool): True if reference base was specified
        """
        if bases.ref not in rtools.include_refs:
            rtools.log(
                Logger.debug_level,
                'DISCARD COLUMN base "{}" not listed for reporting',
                bases.ref,
            )
            return False
        return True

    def check_exclusions(self, bases, rtools):
        """
        Check if the bases object is in an excluded position.

        Parameters:
            bases (CompiledPosition): Data for analysis
            rtools (REDItools): Object running the analysis

        Returns:
            (bool): True if the position is not excluded
        """
        excluded = rtools.exclude_positions.get(bases.contig, [])
        if bases.position in excluded:
            rtools.log(Logger.debug_level, 'DISCARD COLUMN in excluded region')
            return False
        return True

    def check_specific_edits(self, bases, rtools):
        """
        Check whether specified edits are present.

        Parameters:
            bases (CompiledPosition): Data for analysis
            rtools (REDItools): Object running the analysis

        Returns:
            (bool): True if, for every pair in rtools.specific_edits, both
                members of the pair have a truthy entry in bases
        """
        # NOTE(review): the position is discarded as soon as ONE listed pair
        # is missing, i.e. every pair must be present. Confirm this is
        # intended rather than "at least one pair is present".
        for ref, alt in rtools.specific_edits:
            if not bases[ref] or not bases[alt]:
                rtools.log(
                    Logger.debug_level,
                    'DISCARD COLUMN edit "{}" not specified for output',
                    ref + alt,
                )
                return False
        return True
|
reditools/utils.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Miscellaneous utility functions."""
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
import socket
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
|
|
9
|
+
from pysam.libcalignmentfile import AlignmentFile
|
|
10
|
+
from sortedcontainers import SortedSet
|
|
11
|
+
|
|
12
|
+
from reditools.file_utils import open_stream
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def read_bed_file(path):
    """
    Open a BED file and iterate over its tab-separated rows.

    The stream returned by open_stream is handed straight to the CSV
    reader; this function does not close it.

    Parameters:
        path (str): Path to a BED file for reading.

    Returns:
        csv.reader yielding one list of column strings per line.
    """
    return csv.reader(open_stream(path), delimiter='\t')
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def enumerate_positions(regions):
    """
    Expand regions into the individual positions they cover, per contig.

    Parameters:
        regions (iterable): Region objects. Each must expose a ``contig``
                            attribute and an ``enumerate()`` method that
                            yields the positions it covers.

    Returns:
        defaultdict mapping each contig to a SortedSet holding the union
        of the positions of all regions on that contig.
    """
    by_contig = defaultdict(SortedSet)
    for reg in regions:
        # SortedSet.update is the in-place union behind the ``|=`` operator.
        by_contig[reg.contig].update(reg.enumerate())
    return by_contig
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def get_hostname_string():
    """
    Build an identifier from the machine hostname, IP and process ID.

    The IP address is obtained by resolving the local hostname.

    Returns:
        String in the format "hostname|ip|pid"
    """
    host = socket.gethostname()
    fields = (host, socket.gethostbyname(host), str(os.getpid()))
    return '|'.join(fields)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def check_list(functions, **kwargs):
    """
    Call each function in turn and report whether all of them passed.

    Evaluation stops at the first function returning a falsy value, so
    later functions are not called once one has failed.

    Parameters:
        functions (list): A list of function references
        **kwargs: Keyword arguments forwarded to every function

    Returns:
        False if any function returns a falsy value, else True
    """
    return all(check(**kwargs) for check in functions)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def to_int(string):
    """
    Parse an integer from a string that may contain separators.

    All whitespace and commas are stripped before conversion, so values
    such as "1,234" or "1 234" are accepted.

    Parameters:
        string (str): A string representation of an integer

    Returns:
        The integer value of the string.
    """
    digits = re.sub(r'[\s,]', '', string)
    return int(digits)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def get_contigs(sam_path):
    """
    Retrieve contig or chromosome names and lengths from an alignment file.

    Parameters:
        sam_path (string): Path to an alignment file.

    Returns:
        Generator of (reference name, reference length) tuples, ordered
        by reference name.
    """
    with AlignmentFile(sam_path, ignore_truncation=True) as sam:
        names = list(sam.references)
        lengths = list(sam.lengths)
    # Sort on the name only; sorted() is stable, so entries with equal
    # names keep the order they had in the file header.
    by_name = sorted(zip(names, lengths), key=lambda pair: pair[0])
    return (pair for pair in by_name)
|