REDItools3 3.1a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of REDItools3 might be problematic. Click here for more details.

reditools/rtchecks.py ADDED
@@ -0,0 +1,274 @@
1
+ """Quality control for REDItools analyses."""
2
+
3
+ from reditools import utils
4
+ from reditools.logger import Logger
5
+
6
+
7
class RTChecks(object):
    """
    Quality control for REDItools analyses.

    Holds a list of check callables. Every check receives the keyword
    arguments ``rtools`` and ``bases`` and returns True when the position
    passes, or False (after logging at debug level) when the position
    should be discarded.
    """

    def __init__(self):
        """Create a RTChecks object with only the None check enabled."""
        self.check_list = [self.check_is_none]

    def add(self, function):
        """
        Add a QC check.

        Parameters:
            function (RTChecks method): The check to perform
        """
        self.check_list.append(function)

    def discard(self, function):
        """
        Remove a QC check; do nothing if the check is not registered.

        Parameters:
            function (RTChecks method): The check to discard
        """
        if function in self.check_list:
            self.check_list.remove(function)

    def check(self, rtools, bases):
        """
        Perform QC by running every registered check.

        The checks are invoked with keyword arguments, so the positional
        order of ``rtools`` and ``bases`` in each check's signature does not
        matter.

        Parameters:
            rtools (REDItools): Object performing analysis
            bases (CompiledPosition): Base position under analysis

        Returns:
            (bool): True if all checks pass, else False
        """
        return utils.check_list(
            self.check_list,
            bases=bases,
            rtools=rtools,
        )

    def check_splice_positions(self, rtools, bases):
        """
        Check if the contig and position are in a splice site.

        Parameters:
            rtools (REDItools): Object performing analysis
            bases (CompiledPosition): Base position under analysis

        Returns:
            (bool): True if the position is not a splice site.
        """
        contig = bases.contig
        # Contigs without any recorded splice positions never match.
        if bases.position in rtools.splice_positions.get(contig, []):
            rtools.log(
                Logger.debug_level,
                '[SPLICE_SITE] Discarding ({}, {}) because in splice site',
                contig,
                bases.position,
            )
            return False
        return True

    def check_column_min_length(self, rtools, bases):
        """
        Check read depth.

        Parameters:
            rtools (REDItools): Object performing analysis
            bases (CompiledPosition): Base position under analysis

        Returns:
            (bool): True if the read depth is sufficient
        """
        if len(bases) < rtools.min_column_length:
            rtools.log(
                Logger.debug_level,
                'DISCARDING COLUMN {} [MIN_COLUMN_LEGNTH={}]',
                len(bases),
                rtools.min_column_length,
            )
            return False
        return True

    # Really shouldn't use this one. I have to compute mean_q anyway
    def check_column_quality(self, rtools, bases):
        """
        Check mean quality of the position.

        Parameters:
            rtools (REDItools): Object performing analysis
            bases (CompiledPosition): Base position under analysis

        Returns:
            (bool): True if quality is sufficient
        """
        # A falsy (empty) position has a mean quality of 0, which also
        # avoids dividing by zero.
        if bases:
            mean_q = sum(bases.qualities) / len(bases)
        else:
            mean_q = 0
        if mean_q < rtools.min_read_quality:
            rtools.log(
                Logger.debug_level,
                'DISCARD COLUMN mean_quality={} < {}',
                mean_q,
                rtools.min_read_quality,
            )
            return False
        return True

    def check_column_edit_frequency(self, rtools, bases):
        """
        Check the number of edits at the site.

        Parameters:
            rtools (REDItools): Object performing analysis
            bases (CompiledPosition): Base position under analysis

        Returns:
            (bool): True if there are sufficient edits.
        """
        # presumably bases['REF'] is the count of reads matching the
        # reference base, so the difference is the number of edited reads.
        edits_no = len(bases) - bases['REF']
        if edits_no < rtools.min_edits:
            rtools.log(
                Logger.debug_level,
                'DISCARDING COLUMN edits={} < {}',
                edits_no,
                rtools.min_edits,
            )
            return False
        return True

    def check_column_min_edits(self, rtools, bases):
        """
        Check that there are sufficient edit events for each base.

        Parameters:
            rtools (REDItools): Object performing analysis
            bases (CompiledPosition): Base position under analysis

        Returns:
            (bool): True if there are sufficient edits
        """
        for num_edits in bases.get_min_edits():
            # A count of zero is not a failure; only a non-zero count below
            # the threshold discards the column.
            if 0 < num_edits < rtools.min_edits_per_nucleotide:
                rtools.log(
                    Logger.debug_level,
                    'DISCARDING COLUMN edits={} < {}',
                    num_edits,
                    rtools.min_edits_per_nucleotide,
                )
                return False
        return True

    def check_multiple_alts(self, bases, rtools):
        """
        Check that there is, at most, one alternate base.

        Parameters:
            bases (CompiledPosition): Base position under analysis
            rtools (REDItools): Object running the analysis

        Returns:
            (bool): True if there is zero or one alt
        """
        alts = bases.get_variants()
        # Discard only when more than one alternate base is present, as the
        # docstring and the log message both state.
        if len(alts) > 1:
            rtools.log(
                Logger.debug_level,
                'DISCARD COLUMN alts={} > 1',
                len(alts),
            )
            return False
        return True

    def check_is_none(self, bases, rtools):
        """
        Check if the bases object is None.

        Parameters:
            bases (CompiledPosition): Data for analysis
            rtools (REDItools): Object running the analysis

        Returns:
            (bool): True if bases is not None
        """
        if bases is None:
            rtools.log(Logger.debug_level, 'DISCARD COLUMN no reads')
            return False
        return True

    def check_target_positions(self, bases, rtools):
        """
        Check if the bases object is in a target region.

        Parameters:
            bases (CompiledPosition): Data for analysis
            rtools (REDItools): Object running the analysis

        Returns:
            (bool): True if the position is in a target region
        """
        # A contig with no target positions rejects every position on it.
        if bases.position not in rtools.target_positions.get(bases.contig, []):
            rtools.log(
                Logger.debug_level,
                'DISCARD COLUMN not in target positions',
            )
            return False
        return True

    def check_ref(self, bases, rtools):
        """
        Check if the reference base is of interest.

        Parameters:
            bases (CompiledPosition): Data for analysis
            rtools (REDItools): Object running the analysis

        Returns:
            (bool): True if reference base was specified
        """
        if bases.ref not in rtools.include_refs:
            rtools.log(
                Logger.debug_level,
                'DISCARD COLUMN base "{}" not listed for reporting',
                bases.ref,
            )
            return False
        return True

    def check_exclusions(self, bases, rtools):
        """
        Check if the bases object is in an excluded position.

        Parameters:
            bases (CompiledPosition): Data for analysis
            rtools (REDItools): Object running the analysis

        Returns:
            (bool): True if the position is not excluded
        """
        if bases.position in rtools.exclude_positions.get(bases.contig, []):
            rtools.log(Logger.debug_level, 'DISCARD COLUMN in excluded region')
            return False
        return True

    def check_specific_edits(self, bases, rtools):
        """
        Check whether specified edits are present.

        Parameters:
            bases (CompiledPosition): Data for analysis
            rtools (REDItools): Object running the analysis

        Returns:
            (bool): True if every (ref, alt) pair in rtools.specific_edits
                has a truthy bases[ref] and bases[alt]
        """
        # NOTE(review): this discards the column as soon as ANY configured
        # edit is absent, i.e. all configured edits must be present at the
        # position. Confirm that "all" rather than "any" is intended when
        # more than one edit is configured.
        for ref, alt in rtools.specific_edits:
            if not bases[ref] or not bases[alt]:
                rtools.log(
                    Logger.debug_level,
                    'DISCARD COLUMN edit "{}" not specified for output',
                    ref + alt,
                )
                return False
        return True
reditools/utils.py ADDED
@@ -0,0 +1,106 @@
1
+ """Miscellaneous utility functions."""
2
+
3
+ import csv
4
+ import os
5
+ import re
6
+ import socket
7
+ from collections import defaultdict
8
+
9
+ from pysam.libcalignmentfile import AlignmentFile
10
+ from sortedcontainers import SortedSet
11
+
12
+ from reditools.file_utils import open_stream
13
+
14
+
15
def read_bed_file(path):
    """
    Open a BED file and wrap it in a tab-delimited CSV reader.

    The underlying stream is not closed by this function.

    Parameters:
        path (str): Path to a BED file for reading.

    Returns:
        csv.reader yielding one list of column strings per line.
    """
    return csv.reader(open_stream(path), delimiter='\t')
27
+
28
+
29
def enumerate_positions(regions):
    """
    Expand regions into the individual positions they cover, per contig.

    Parameters:
        regions (iterable): Region objects. Each one must expose a
                            ``contig`` attribute and an ``enumerate()``
                            method returning its positions.

    Returns:
        defaultdict mapping each contig to a SortedSet of its positions.
    """
    by_contig = defaultdict(SortedSet)
    for item in regions:
        contig_name = item.contig
        # Merge this region's positions into the set for its contig.
        by_contig[contig_name] |= item.enumerate()
    return by_contig
45
+
46
+
47
def get_hostname_string():
    """
    Retrieve the machine hostname, IP address, and process ID.

    Returns:
        String in the format "hostname|ip|pid". If the hostname cannot be
        resolved to an address, the ip field is "unknown".
    """
    hostname = socket.gethostname()
    try:
        ip_addr = socket.gethostbyname(hostname)
    except OSError:
        # Resolution errors (socket.gaierror, socket.herror) are OSError
        # subclasses. An unresolvable local hostname should not abort the
        # caller just to build an identification string.
        ip_addr = 'unknown'
    pid = os.getpid()
    return f'{hostname}|{ip_addr}|{pid}'
58
+
59
+
60
def check_list(functions, **kwargs):
    """
    Call each function in turn and report whether all of them passed.

    Evaluation stops at the first function that returns a falsy value, so
    later functions are not called.

    Parameters:
        functions (list): A list of function references
        **kwargs: Keyword arguments forwarded to every function

    Returns:
        True if every function returns a truthy value (or the list is
        empty), else False
    """
    return all(run_check(**kwargs) for run_check in functions)
75
+
76
+
77
def to_int(string):
    """
    Parse an integer from a string that may contain separators.

    Whitespace and commas are removed before conversion, so values such as
    "1,234" or "1 234" are accepted.

    Parameters:
        string (str): A string representation of an integer

    Returns:
        The integer value of the string.
    """
    stripped = re.sub(r'[\s,]', '', string)
    return int(stripped)
88
+
89
+
90
def get_contigs(sam_path):
    """
    Retrieve contig or chromosome data from an alignment file.

    Parameters:
        sam_path (string): Path to an alignment file.

    Returns:
        Generator of (reference name, reference length) pairs, ordered by
        reference name.
    """
    with AlignmentFile(sam_path, ignore_truncation=True) as alignment:
        # Copy both sequences so they remain usable after the file closes.
        names = list(alignment.references)
        lengths = list(alignment.lengths)
    # Sort index positions by name so each length stays paired with its name.
    order = sorted(range(len(names)), key=names.__getitem__)
    return ((names[pos], lengths[pos]) for pos in order)