REDItools3 3.1a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of REDItools3 might be problematic. Click here for more details.
- REDItools3-3.1a0.dist-info/LICENSE +674 -0
- REDItools3-3.1a0.dist-info/METADATA +36 -0
- REDItools3-3.1a0.dist-info/RECORD +21 -0
- REDItools3-3.1a0.dist-info/WHEEL +5 -0
- REDItools3-3.1a0.dist-info/top_level.txt +1 -0
- reditools/__init__.py +1 -0
- reditools/__main__.py +37 -0
- reditools/alignment_file.py +146 -0
- reditools/alignment_manager.py +136 -0
- reditools/analyze.py +552 -0
- reditools/compiled_position.py +133 -0
- reditools/compiled_reads.py +131 -0
- reditools/fasta_file.py +68 -0
- reditools/file_utils.py +132 -0
- reditools/homopolymerics.py +92 -0
- reditools/index.py +268 -0
- reditools/logger.py +44 -0
- reditools/reditools.py +456 -0
- reditools/region.py +130 -0
- reditools/rtchecks.py +274 -0
- reditools/utils.py +106 -0
reditools/logger.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Fast Logging for REDItools."""
|
|
2
|
+
import os
|
|
3
|
+
import socket
|
|
4
|
+
import sys
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Logger(object):
    """
    Fast logger for REDItools.

    Messages are written to ``sys.stderr``. The verbosity is fixed at
    construction time by binding ``self.log`` to one of three private
    methods, so no level comparison is needed for silent or debug loggers.
    """

    silent_level = 'SILENT'
    info_level = 'INFO'
    debug_level = 'DEBUG'

    def __init__(self, level):
        """
        Create a new Logger.

        Parameters:
            level (str): 'DEBUG' logs everything, 'INFO' logs only INFO
                messages; any other value (e.g. 'SILENT') logs nothing.
                The comparison is case-insensitive.
        """
        hostname = socket.gethostname()
        try:
            ip_addr = socket.gethostbyname(hostname)
        except OSError:
            # The local hostname is not always resolvable (containers,
            # laptops off-network). Logging must not fail because of it.
            ip_addr = 'unknown'
        pid = os.getpid()
        self.hostname_string = f'{hostname}|{ip_addr}|{pid}'

        requested = level.upper()
        if requested == self.debug_level:
            self.log = self._log_all
        elif requested == self.info_level:
            self.log = self._log_info
        else:
            # A bound method (rather than a lambda) keeps the instance
            # picklable.
            self.log = self._log_none

    def _log_all(self, level, message, *args):
        """
        Write a message to stderr regardless of its level.

        Parameters:
            level (str): Level label printed with the message
            message (str): ``str.format`` template
            *args: Positional values substituted into `message`
        """
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        message = message.format(*args)
        sys.stderr.write(
            f'{timestamp} [{self.hostname_string}] ' +
            f'[{level}] {message}\n',
        )

    def _log_info(self, level, message, *args):
        """
        Write a message to stderr only when its level is INFO.

        Parameters:
            level (str): Level label of the message
            message (str): ``str.format`` template
            *args: Positional values substituted into `message`
        """
        if level == self.info_level:
            self._log_all(level, message, *args)

    def _log_none(self, *_):
        """Discard the message (silent logger)."""
|
reditools/reditools.py
ADDED
|
@@ -0,0 +1,456 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Analysis system for RNA editing events.
|
|
3
|
+
|
|
4
|
+
Authors:
|
|
5
|
+
flat - 2017
|
|
6
|
+
ahanden - 2022
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from reditools import utils
|
|
10
|
+
from reditools.compiled_reads import CompiledReads
|
|
11
|
+
from reditools.fasta_file import RTFastaFile
|
|
12
|
+
from reditools.logger import Logger
|
|
13
|
+
from reditools.rtchecks import RTChecks
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class RTResult(object):
    """RNA editing analysis for a single base position."""

    def __init__(self, bases, strand, contig, position):
        """
        RNA editing analysis for a single base position.

        Parameters:
            bases (compiledPosition): Bases found by REDItools. Must
                provide `get_variants()`, `ref`, `qualities`, `len()`,
                iteration and item lookup by base / 'REF'.
            strand (str): Strand of the position
            contig (str): Contig name
            position (int): Genomic position; stored incremented by one
                (presumably 0-based input, 1-based output - TODO confirm)
        """
        self.contig = contig
        self.position = position + 1
        self.bases = bases
        self.strand = strand
        # Cached once; `variants` and `edit_ratio` both read it.
        self._variants = bases.get_variants()

    @property
    def variants(self):
        """
        The detected variants at this position.

        Returns:
            list of two-character strings: reference base followed by
            the variant base
        """
        ref = self.bases.ref
        return [f'{ref}{base}' for base in self._variants]

    @property
    def mean_quality(self):
        """
        Mean read quality of the base position.

        Returns:
            float (or 0 when no bases are present)
        """
        if self.bases:
            return sum(self.bases.qualities) / len(self.bases)
        return 0

    @property
    def edit_ratio(self):
        """
        Edit ratio.

        The count of the most frequent variant divided by that count
        plus the reference count.

        Returns:
            float (0.0 when there are neither variant nor reference
            observations)
        """
        if self._variants:
            max_edits = max(self.bases[base] for base in self._variants)
        else:
            max_edits = 0
        observed = max_edits + self.bases['REF']
        if not observed:
            # Nothing to compare against; avoid ZeroDivisionError.
            return 0.0
        return max_edits / observed

    @property
    def reference(self):
        """
        Base in the reference genome.

        Returns:
            str
        """
        return self.bases.ref

    @property
    def depth(self):
        """
        How many reads cover the position. (post filtering).

        Returns:
            int
        """
        return len(self.bases)

    @property
    def per_base_depth(self):
        """
        How many reads had each base for this position.

        Returns:
            list of whatever iterating `bases` yields
        """
        return list(self.bases)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class REDItools(object):
    """
    Analysis system for RNA editing events.

    Most filter settings are properties whose setters also register or
    unregister the matching quality-control function on the internal
    RTChecks instance, so only active filters are evaluated.
    """

    def __init__(self):
        """Create a new REDItools object."""
        self.hostname_string = utils.get_hostname_string()
        self._min_column_length = 1
        self._min_edits = 0
        self._min_edits_per_nucleotide = 0

        # Goes through the property setter, which builds the Logger and
        # binds `self.log`.
        self.log_level = Logger.silent_level

        self.strand = 0
        self._use_strand_correction = False
        self.strand_confidence_threshold = 0.5

        self.min_base_quality = 30
        self.min_base_position = 0
        self.max_base_position = float('inf')

        self._rtqc = RTChecks()

        self._min_read_quality = 0

        self._target_positions = False
        self._exclude_positions = {}
        self._splice_positions = []

        self._specific_edits = None

        self.reference = None

        self._include_refs = None

    @property
    def include_refs(self):
        """
        Genome reference bases to report on.

        Returns:
            list (None until `specific_edits` has been assigned)
        """
        return self._include_refs

    @property
    def includ_refs(self):
        """
        Misspelled alias of `include_refs`, kept for compatibility.

        Returns:
            list
        """
        return self._include_refs

    @property
    def specific_edits(self):
        """
        Specific edit events to report.

        Returns:
            iterable
        """
        return self._specific_edits

    @specific_edits.setter
    def specific_edits(self, alts):
        """
        Restrict reporting to specific edit events.

        Parameters:
            alts (iterable): Edit strings whose first character is the
                reference base (e.g. 'AG'); empty or None clears the
                restriction
        """
        function_a = self._rtqc.check_specific_edits
        function_b = self._rtqc.check_ref
        # Materialize once: `alts` is read twice below and may be a
        # one-shot iterator (or None).
        alts = list(alts or ())
        self._specific_edits = set(alts)
        self._include_refs = [edit[0] for edit in alts]
        if self._include_refs:
            self._rtqc.add(function_a)
            self._rtqc.add(function_b)
        else:
            self._rtqc.discard(function_a)
            self._rtqc.discard(function_b)

    @property
    def splice_positions(self):
        """
        Known splice sites.

        Returns:
            list
        """
        return self._splice_positions

    @splice_positions.setter
    def splice_positions(self, regions):
        """
        Set the known splice sites.

        Parameters:
            regions (list): Regions passed to `utils.enumerate_positions`;
                a falsy value clears the filter
        """
        function = self._rtqc.check_splice_positions
        if regions:
            self._splice_positions = utils.enumerate_positions(regions)
            self._rtqc.add(function)
        else:
            self._splice_positions = []
            self._rtqc.discard(function)

    @property
    def target_positions(self):
        """
        Only report results for these locations.

        Returns:
            list (False when no targets are set)
        """
        return self._target_positions

    @target_positions.setter
    def target_positions(self, regions):
        """
        Set the locations to report on.

        Parameters:
            regions (list): Regions passed to `utils.enumerate_positions`;
                a falsy value clears the filter
        """
        function = self._rtqc.check_target_positions
        if regions:
            self._target_positions = utils.enumerate_positions(regions)
            self._rtqc.add(function)
        else:
            self._target_positions = False
            self._rtqc.discard(function)

    @property
    def log_level(self):
        """
        The logging level.

        Returns:
            Log level
        """
        return self._log_level

    @log_level.setter
    def log_level(self, level):
        """
        Set the class logging level.

        Parameters:
            level (str): logging level
        """
        # Remember the level so the getter has something to return.
        self._log_level = level
        self._logger = Logger(level)
        self.log = self._logger.log

    @property
    def min_read_quality(self):
        """
        Minimum read quality for inclusion.

        Returns:
            The threshold
        """
        return self._min_read_quality

    @min_read_quality.setter
    def min_read_quality(self, threshold):
        """
        Set the minimum read quality; values above 0 enable the check.

        Parameters:
            threshold (number): Minimum quality
        """
        self._min_read_quality = threshold
        function = self._rtqc.check_column_quality
        if self._min_read_quality > 0:
            self._rtqc.add(function)
        else:
            self._rtqc.discard(function)

    @property
    def min_column_length(self):
        """
        Minimum depth for a position to be reported.

        Returns:
            The threshold
        """
        return self._min_column_length

    @min_column_length.setter
    def min_column_length(self, threshold):
        """
        Set the minimum depth; values above 1 enable the check.

        Parameters:
            threshold (number): Minimum depth
        """
        self._min_column_length = threshold
        function = self._rtqc.check_column_min_length
        if threshold > 1:
            self._rtqc.add(function)
        else:
            self._rtqc.discard(function)

    @property
    def min_edits(self):
        """
        Minimum number of editing events for reporting.

        Returns:
            The threshold
        """
        return self._min_edits

    @min_edits.setter
    def min_edits(self, threshold):
        """
        Set the minimum number of edits; values above 0 enable the check.

        Parameters:
            threshold (number): Minimum number of editing events
        """
        self._min_edits = threshold
        function = self._rtqc.check_column_edit_frequency
        if threshold > 0:
            self._rtqc.add(function)
        else:
            self._rtqc.discard(function)

    @property
    def min_edits_per_nucleotide(self):
        """
        Minimum number of edits for a single nucleotide for reporting.

        Returns:
            The threshold
        """
        return self._min_edits_per_nucleotide

    @min_edits_per_nucleotide.setter
    def min_edits_per_nucleotide(self, threshold):
        """
        Set the per-nucleotide minimum; values above 0 enable the check.

        Parameters:
            threshold (number): Minimum edits for a single nucleotide
        """
        self._min_edits_per_nucleotide = threshold
        function = self._rtqc.check_column_min_edits
        if threshold > 0:
            self._rtqc.add(function)
        else:
            self._rtqc.discard(function)

    @property
    def exclude_positions(self):
        """
        Genomic positions NOT to include in output.

        Returns:
            Dictionary of contigs to positions
        """
        return self._exclude_positions

    def exclude(self, regions):
        """
        Explicitly skip specified genomic regions.

        Positions accumulate across calls.

        Parameters:
            regions (list): Regions to skip
        """
        for region in regions:
            contig = region.contig
            old_pos = self._exclude_positions.get(contig, set())
            self._exclude_positions[contig] = old_pos | region.enumerate()
        function = self._rtqc.check_exclusion
        if self._exclude_positions:
            self._rtqc.add(function)
        else:
            self._rtqc.discard(function)

    def analyze(self, alignment_manager, region=None):  # noqa:WPS231,WPS213
        """
        Detect RNA editing events.

        Parameters:
            alignment_manager (AlignmentManager): Source of reads
            region (Region): Where to look for edits

        Yields:
            Analysis results for each base position in region
        """
        if region is None:
            region = {}
        # The empty-dict placeholder has no `stop` attribute; a missing
        # or falsy stop means "no upper bound".
        stop = getattr(region, 'stop', None)

        # Open the iterator
        self.log(
            Logger.info_level,
            'Fetching data from bams {} [REGION={}]',
            alignment_manager.file_list,
            region,
        )
        read_iter = alignment_manager.fetch_by_position(region=region)
        reads = next(read_iter, None)
        nucleotides = CompiledReads(
            self.strand,
            self.min_base_position,
            self.max_base_position,
            self.min_base_quality,
        )
        if self.reference:
            nucleotides.add_reference(self.reference)
        total = 0
        while reads is not None or not nucleotides.is_empty():
            # `nucleotides` starts empty, so the first pass always takes
            # this branch and initializes `position` and `contig`.
            if nucleotides.is_empty():
                self.log(
                    Logger.debug_level,
                    'Nucleotides is empty: skipping ahead',
                )
                position = alignment_manager.position
                contig = alignment_manager.contig
            else:
                position += 1

            if stop and position >= stop:
                break
            self.log(
                Logger.debug_level,
                'Analyzing position {} {}',
                contig,
                position,
            )
            # Get all the read(s) starting at position
            if reads and reads[0].reference_start == position:
                self.log(Logger.debug_level, 'Adding {} reads', len(reads))
                total += len(reads)
                nucleotides.add_reads(reads)
                reads = next(read_iter, None)
            # Process edits
            bases = nucleotides.pop(position)
            if not self._rtqc.check(self, bases):
                continue
            column = self._get_column(position, bases, region)
            if column is None:
                self.log(Logger.debug_level, 'Bad column - skipping')
                continue
            if column.contig is None:
                # No region was given; report the contig being read.
                column.contig = contig
            if self._specific_edits:
                if not self._specific_edits & set(column.variants):
                    self.log(
                        Logger.debug_level,
                        'Requested edits not found - skipping',
                    )
                    continue
            self.log(
                Logger.debug_level,
                'Yielding output for {} reads',
                len(bases),
            )
            yield column
        self.log(
            Logger.info_level,
            '[REGION={}] {} total reads',
            region,
            total,
        )

    def use_strand_correction(self):
        """Only reports reads/positions that match `strand`."""
        self._use_strand_correction = True

    def only_one_alt(self):
        """Only report a position if there is less than 2 alt bases."""
        self._rtqc.add(self._rtqc.check_multiple_alts)

    def add_reference(self, reference_fname):
        """
        Use a reference fasta file instead of reference from the BAM files.

        Parameters:
            reference_fname (str): File path to FASTA reference
        """
        self.reference = RTFastaFile(reference_fname)

    def _get_column(self, position, bases, region):
        """
        Build the result for one position, or None if it is unusable.

        Parameters:
            position (int): Position being analyzed
            bases (compiledPosition): Bases observed at the position
            region (Region): Region being analyzed; may lack `stop`
                and `contig` when no region was requested

        Returns:
            RTResult, or None when no bases remain after strand
            filtering or the position is at/after the region stop
        """
        strand = bases.get_strand(threshold=self.strand_confidence_threshold)
        if self._use_strand_correction:
            bases.filter_by_strand(strand)
        if not bases:
            return None
        if strand == '-':
            bases.complement()

        # A missing stop means the region is unbounded (same rule as the
        # loop in `analyze`); it must not reject every position.
        stop = getattr(region, 'stop', None)
        if stop and position + 1 >= stop:
            return None

        contig = getattr(region, 'contig', None)
        return RTResult(bases, strand, contig, position)
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
class REDItoolsDNA(REDItools):
    """
    Editing-event analysis for DNA data.

    Raises:
        ValueError: You cannot set the strand parameter using this class.
    """

    def __init__(self):
        """Create a new REDItoolsDNA object."""
        # Both strand lookups are replaced, before the base initializer
        # runs, by callables that ignore their arguments and answer '*'.
        for attribute in ('get_position_strand', '_get_strand'):
            setattr(self, attribute, lambda *_: '*')
        super().__init__()

    def set_strand(self, strand):
        """
        Not applicable for DNA analysis.

        Parameters:
            strand (int): N/A

        Raises:
            ValueError: You cannot call this method for DNA analyses.
        """
        raise ValueError('Cannot set strand value if DNA is True')
|
reditools/region.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""Genomic Region."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Region(object):
    """
    Genomic Region.

    `contains` and `enumerate` treat the region as half-open: `start`
    is included and `stop` is excluded.
    """

    def __init__(self, **kwargs):
        """
        Create a new genomic region.

        Parameters:
            **kwargs (dict):
                string (str): String representation of a region
                OR
                contig (str): Contig name
                start (int): Genomic start (defaults to 1)
                stop (int): Genomic stop (defaults to None)

        Raises:
            ValueError: The contig is missing
        """
        if 'string' in kwargs:
            region = self._parse_string(kwargs['string'])  # noqa:WPS529
            if region is None:
                # Previously surfaced as a TypeError on `region[0]`.
                raise ValueError('Region constructor requires a contig.')
            self.contig, self.start, self.stop = region
        else:
            if 'contig' not in kwargs:
                raise ValueError('Region constructor requires a contig.')
            self.contig = kwargs['contig']
            self.start = self._to_int(kwargs.get('start', 1))
            self.stop = self._to_int(kwargs.get('stop', None))

    def __str__(self):
        """
        Put the region into standard string format.

        Returns:
            (str): contig, contig:start, or contig:start-stop depending
            on which coordinates are set
        """
        region = self.contig
        if self.start:
            region = f'{region}:{self.start}'
            if self.stop:
                region = f'{region}-{self.stop}'
        return region

    def split(self, window):
        """
        Split the region into a list of smaller regions.

        The sub regions are contiguous, do not overlap and together
        cover exactly this region; the last one is shortened so that it
        never extends past `stop`.

        Parameters:
            window (int): The size of the sub regions in bp

        Returns:
            list

        Raises:
            IndexError: The region is missing a start or stop
            ValueError: The window is not a positive number
        """
        if not self.stop or not self.start:
            raise IndexError('Can only split a region with a start and stop.')
        if window < 1:
            raise ValueError('Window size must be a positive integer.')
        return [
            Region(
                contig=self.contig,
                start=sub_start,
                stop=min(sub_start + window, self.stop),
            )
            for sub_start in range(self.start, self.stop, window)
        ]

    def enumerate(self):
        """
        Convert the region into its individual positions.

        Requires both start and stop to be set (`range` rejects None).

        Returns:
            Set enumerating the individual positions.
        """
        return set(range(self.start, self.stop))

    def contains(self, contig, position):
        """
        Determines if a given genomic location is within the region.

        A missing start or stop leaves that side unbounded.

        Parameters:
            contig (str): Contig/Chromosome name
            position (int): Position

        Returns:
            bool
        """
        if self.contig != contig:
            return False
        left = self.start is None or self.start <= position
        right = self.stop is None or position < self.stop
        return left and right

    def _parse_string(self, region_str):
        """
        Parse 'contig', 'contig:start' or 'contig:start-stop'.

        Parameters:
            region_str (str): Region string

        Returns:
            Tuple (contig, start, stop) with None for absent
            coordinates, or None when `region_str` is None

        Raises:
            ValueError: The string has more than three fields
        """
        if region_str is None:
            return None
        # re.split always returns at least one element, so there is
        # always a contig field.
        fields = re.split('[:-]', region_str)
        if len(fields) > 3:
            raise ValueError(f'Unrecognized format: {region_str}.')
        contig = fields[0]
        start = self._to_int(fields[1]) if len(fields) > 1 else None
        stop = self._to_int(fields[2]) if len(fields) == 3 else None
        return (contig, start, stop)

    def _to_int(self, number):
        """
        Convert a coordinate to int, tolerating ',' and whitespace.

        Parameters:
            number (str, int or None): Value to convert

        Returns:
            int, or None when `number` is None
        """
        if isinstance(number, str):
            return int(re.sub(r'[\s,]', '', number))
        if number is None:
            return None
        return int(number)
|