REDItools3 3.1a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of REDItools3 might be problematic. Click here for more details.

reditools/logger.py ADDED
@@ -0,0 +1,44 @@
1
+ """Fast Logging for REDItools."""
2
+ import os
3
+ import socket
4
+ import sys
5
+ from datetime import datetime
6
+
7
+
8
class Logger(object):
    """
    Fast logger for REDItools.

    Messages are written to stderr, prefixed with a timestamp and a
    ``hostname|ip|pid`` tag. The verbosity is chosen once at construction
    by binding ``self.log`` to the matching implementation, so no level
    comparison against the configured verbosity happens per call.
    """

    silent_level = 'SILENT'
    info_level = 'INFO'
    debug_level = 'DEBUG'

    def __init__(self, level):
        """
        Create a new Logger.

        Parameters:
            level (str): 'DEBUG' writes every message, 'INFO' writes only
                messages logged at the INFO level, anything else disables
                output. Matching is case-insensitive.
        """
        hostname = socket.gethostname()
        try:
            ip_addr = socket.gethostbyname(hostname)
        except OSError:
            # The local hostname is not guaranteed to resolve; an
            # unresolvable name must not prevent logging altogether.
            ip_addr = 'unknown'
        pid = os.getpid()
        self.hostname_string = f'{hostname}|{ip_addr}|{pid}'

        requested = level.upper()
        if requested == self.debug_level:
            self.log = self._log_all
        elif requested == self.info_level:
            self.log = self._log_info
        else:
            # Silent: accept any positional arguments and do nothing.
            self.log = lambda *_: None

    def _log_all(self, level, message, *args):
        """
        Write one formatted message to stderr.

        Parameters:
            level (str): Label printed in brackets before the message
            message (str): ``str.format`` template
            *args: Positional values substituted into the template
        """
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        message = message.format(*args)
        sys.stderr.write(
            f'{timestamp} [{self.hostname_string}] ' +
            f'[{level}] {message}\n',
        )

    def _log_info(self, level, message, *args):
        """
        Write the message only when it is logged at the INFO level.

        Parameters:
            level (str): Level of the message
            message (str): ``str.format`` template
            *args: Positional values substituted into the template
        """
        if level == self.info_level:
            self._log_all(level, message, *args)
reditools/reditools.py ADDED
@@ -0,0 +1,456 @@
1
+ """
2
+ Analysis system for RNA editing events.
3
+
4
+ Authors:
5
+ flat - 2017
6
+ ahanden - 2022
7
+ """
8
+
9
+ from reditools import utils
10
+ from reditools.compiled_reads import CompiledReads
11
+ from reditools.fasta_file import RTFastaFile
12
+ from reditools.logger import Logger
13
+ from reditools.rtchecks import RTChecks
14
+
15
+
16
class RTResult(object):
    """RNA editing analysis for a single base position."""

    def __init__(self, bases, strand, contig, position):
        """
        RNA editing analysis for a single base position.

        Parameters:
            bases (compiledPosition): Bases found by REDItools
            strand (str): Strand of the position
            contig (str): Contig name
            position (int): Genomic position
        """
        self.contig = contig
        # Stored one higher than the value passed in
        # (presumably converting a 0-based index to a 1-based coordinate).
        self.position = position + 1
        self.bases = bases
        self.strand = strand
        self._variants = bases.get_variants()

    @property
    def variants(self):
        """
        The detected variants at this position.

        Returns:
            list of strings, each the reference base followed by a
            variant base
        """
        ref = self.bases.ref
        return [f'{ref}{base}' for base in self._variants]

    @property
    def mean_quality(self):
        """
        Mean read quality of the base position.

        Returns:
            float (0 when there are no bases)
        """
        if self.bases:
            return sum(self.bases.qualities) / len(self.bases)
        return 0

    @property
    def edit_ratio(self):
        """
        Edit ratio.

        The count of the most frequent variant divided by the sum of that
        count and the 'REF' count.

        Returns:
            float (0.0 when both counts are zero)
        """
        if self._variants:
            max_edits = max(self.bases[base] for base in self._variants)
        else:
            max_edits = 0
        denominator = max_edits + self.bases['REF']
        if not denominator:
            # Nothing was counted: report no editing rather than
            # dividing by zero.
            return 0.0
        return max_edits / denominator

    @property
    def reference(self):
        """
        Base in the reference genome.

        Returns:
            str
        """
        return self.bases.ref

    @property
    def depth(self):
        """
        How many reads cover the position. (post filtering).

        Returns:
            int
        """
        return len(self.bases)

    @property
    def per_base_depth(self):
        """
        How many reads had each base for this position.

        Returns:
            list
        """
        return list(iter(self.bases))
101
+
102
+
103
class REDItools(object):
    """
    Analysis system for RNA editing events.

    Most thresholds are exposed as properties whose setters register or
    unregister the matching check on the internal RTChecks object, so only
    the checks that are actually needed run for each position.
    """

    def __init__(self):
        """Create a new REDItools object."""
        self.hostname_string = utils.get_hostname_string()
        self._min_column_length = 1
        self._min_edits = 0
        self._min_edits_per_nucleotide = 0

        self.log_level = Logger.silent_level

        self.strand = 0
        self._use_strand_correction = False
        self.strand_confidence_threshold = 0.5

        self.min_base_quality = 30
        self.min_base_position = 0
        self.max_base_position = float('inf')

        self._rtqc = RTChecks()

        self._min_read_quality = 0

        self._target_positions = False
        self._exclude_positions = {}
        self._splice_positions = []

        self._specific_edits = None

        self.reference = None

        self._include_refs = None

    @property
    def include_refs(self):
        """
        Genome reference bases to report on.

        Returns:
            list
        """
        return self._include_refs

    @property
    def includ_refs(self):
        """
        Misspelled alias of `include_refs`, kept for existing callers.

        Returns:
            list
        """
        return self._include_refs

    @property
    def specific_edits(self):
        """
        Specific edit events to report.

        Returns:
            iterable
        """
        return self._specific_edits

    @specific_edits.setter
    def specific_edits(self, alts):
        """
        Restrict reporting to the given edit events.

        Parameters:
            alts (iterable): Edit events; the first element of each one
                is recorded as a reference base to report on
        """
        # Materialise once: the values are walked twice below, which would
        # silently produce an empty reference list for a one-shot iterator.
        alts = list(alts)
        function_a = self._rtqc.check_specific_edits
        function_b = self._rtqc.check_ref
        self._specific_edits = set(alts)
        self._include_refs = [alt[0] for alt in alts]
        if self._include_refs:
            self._rtqc.add(function_a)
            self._rtqc.add(function_b)
        else:
            self._rtqc.discard(function_a)
            self._rtqc.discard(function_b)

    @property
    def splice_positions(self):
        """
        Known splice sites.

        Returns:
            list
        """
        return self._splice_positions

    @splice_positions.setter
    def splice_positions(self, regions):
        """
        Set the known splice sites.

        Parameters:
            regions: Regions handed to utils.enumerate_positions; a falsy
                value clears the splice sites and disables the check
        """
        function = self._rtqc.check_splice_positions
        if regions:
            self._splice_positions = utils.enumerate_positions(regions)
            self._rtqc.add(function)
        else:
            self._splice_positions = []
            self._rtqc.discard(function)

    @property
    def target_positions(self):
        """
        Only report results for these locations.

        Returns:
            list
        """
        return self._target_positions

    @target_positions.setter
    def target_positions(self, regions):
        """
        Set the locations to report on.

        Parameters:
            regions: Regions handed to utils.enumerate_positions; a falsy
                value resets the targets to False and disables the check
        """
        function = self._rtqc.check_target_positions
        if regions:
            self._target_positions = utils.enumerate_positions(regions)
            self._rtqc.add(function)
        else:
            self._target_positions = False
            self._rtqc.discard(function)

    @property
    def log_level(self):
        """
        The logging level.

        Returns:
            Log level
        """
        return self._log_level

    @log_level.setter
    def log_level(self, level):
        """
        Set the class logging level.

        Parameters:
            level (str): logging level
        """
        self._logger = Logger(level)
        self.log = self._logger.log
        # Remember the level so the getter has something to return.
        self._log_level = level

    @property
    def min_read_quality(self):
        """Minimum read quality for inclusion."""
        return self._min_read_quality  # noqa:DAR201

    @min_read_quality.setter
    def min_read_quality(self, threshold):
        self._min_read_quality = threshold
        function = self._rtqc.check_column_quality
        # The check is only registered for thresholds above zero.
        if self._min_read_quality > 0:
            self._rtqc.add(function)
        else:
            self._rtqc.discard(function)

    @property
    def min_column_length(self):
        """Minimum depth for a position to be reported."""
        return self._min_column_length  # noqa:DAR201

    @min_column_length.setter
    def min_column_length(self, threshold):
        self._min_column_length = threshold
        function = self._rtqc.check_column_min_length
        # The check is only registered for thresholds above one.
        if threshold > 1:
            self._rtqc.add(function)
        else:
            self._rtqc.discard(function)

    @property
    def min_edits(self):
        """Minimum number of editing events for reporting."""
        return self._min_edits  # noqa:DAR201

    @min_edits.setter
    def min_edits(self, threshold):
        self._min_edits = threshold
        function = self._rtqc.check_column_edit_frequency
        if threshold > 0:
            self._rtqc.add(function)
        else:
            self._rtqc.discard(function)

    @property
    def min_edits_per_nucleotide(self):
        """Minimum number of edits for a single nucleotide for reporting."""
        return self._min_edits_per_nucleotide  # noqa:DAR201

    @min_edits_per_nucleotide.setter
    def min_edits_per_nucleotide(self, threshold):
        self._min_edits_per_nucleotide = threshold
        function = self._rtqc.check_column_min_edits
        if threshold > 0:
            self._rtqc.add(function)
        else:
            self._rtqc.discard(function)

    @property
    def exclude_positions(self):
        """
        Genomic positions NOT to include in output.

        Returns:
            Dictionary of contigs to positions
        """
        return self._exclude_positions

    def exclude(self, regions):
        """
        Explicitly skip specified genomic regions.

        Positions are accumulated per contig across calls.

        Parameters:
            regions (list): Regions to skip; each must provide `contig`
                and `enumerate()`
        """
        for region in regions:
            contig = region.contig
            old_pos = self._exclude_positions.get(contig, set())
            self._exclude_positions[contig] = old_pos | region.enumerate()
        function = self._rtqc.check_exclusion
        if self._exclude_positions:
            self._rtqc.add(function)
        else:
            self._rtqc.discard(function)

    def analyze(self, alignment_manager, region=None):  # noqa:WPS231,WPS213
        """
        Detect RNA editing events.

        Parameters:
            alignment_manager (AlignmentManager): Source of reads
            region (Region): Where to look for edits. When omitted, an
                empty dict is passed to the alignment manager and no stop
                bound is applied.

        Yields:
            Analysis results for each base position in region
        """
        if region is None:
            region = {}

        # Open the iterator
        self.log(
            Logger.info_level,
            'Fetching data from bams {} [REGION={}]',
            alignment_manager.file_list,
            region,
        )
        read_iter = alignment_manager.fetch_by_position(region=region)
        reads = next(read_iter, None)
        nucleotides = CompiledReads(
            self.strand,
            self.min_base_position,
            self.max_base_position,
            self.min_base_quality,
        )
        if self.reference:
            nucleotides.add_reference(self.reference)
        total = 0
        while reads is not None or not nucleotides.is_empty():
            if nucleotides.is_empty():
                self.log(
                    Logger.debug_level,
                    'Nucleotides is empty: skipping ahead',
                )
                # Nothing buffered: jump to wherever the reader is now.
                position = alignment_manager.position
                contig = alignment_manager.contig
            else:
                position += 1

            # getattr keeps the region=None placeholder (a dict, which has
            # no `stop` attribute) from raising AttributeError.
            stop = getattr(region, 'stop', None)
            if stop and position >= stop:
                break
            self.log(
                Logger.debug_level,
                'Analyzing position {} {}',
                contig,
                position,
            )
            # Get all the read(s) starting at position
            if reads and reads[0].reference_start == position:
                self.log(Logger.debug_level, 'Adding {} reads', len(reads))
                total += len(reads)
                nucleotides.add_reads(reads)
                reads = next(read_iter, None)
            # Process edits
            bases = nucleotides.pop(position)
            if not self._rtqc.check(self, bases):
                continue
            column = self._get_column(position, bases, region, contig)
            if column is None:
                self.log(Logger.debug_level, 'Bad column - skipping')
                continue
            if self._specific_edits:
                if not self._specific_edits & set(column.variants):
                    self.log(
                        Logger.debug_level,
                        'Requested edits not found - skipping',
                    )
                    continue
            self.log(
                Logger.debug_level,
                'Yielding output for {} reads',
                len(bases),
            )
            yield column
        self.log(
            Logger.info_level,
            '[REGION={}] {} total reads',
            region,
            total,
        )

    def use_strand_correction(self):
        """Only reports reads/positions that match `strand`."""
        self._use_strand_correction = True

    def only_one_alt(self):
        """Only report a position if there is less than 2 alt bases."""
        self._rtqc.add(self._rtqc.check_multiple_alts)

    def add_reference(self, reference_fname):
        """
        Use a reference fasta file instead of reference from the BAM files.

        Parameters:
            reference_fname (str): File path to FASTA reference
        """
        self.reference = RTFastaFile(reference_fname)

    def _get_column(self, position, bases, region, contig=None):
        """
        Build the result for one position, or None if it must be skipped.

        Parameters:
            position (int): Position being analysed
            bases: Compiled bases for the position
            region: Region being analysed; `stop` and `contig` are read
                from it when present
            contig (str): Contig to report when the region has none

        Returns:
            RTResult, or None when no bases remain or the position is at
            or beyond the region stop
        """
        strand = bases.get_strand(threshold=self.strand_confidence_threshold)
        if self._use_strand_correction:
            bases.filter_by_strand(strand)
        if not bases:
            return None
        if strand == '-':
            bases.complement()

        # A missing/falsy stop means the region is open-ended, matching
        # the loop condition in analyze(); it must not reject everything.
        stop = getattr(region, 'stop', None)
        if stop and position + 1 >= stop:
            return None

        region_contig = getattr(region, 'contig', None)
        if region_contig is None:
            region_contig = contig
        return RTResult(bases, strand, region_contig, position)
430
+
431
+
432
class REDItoolsDNA(REDItools):
    """
    Editing-event analysis for DNA input.

    Both strand lookups installed on the instance always answer '*'.

    Raises:
        ValueError: You cannot set the strand parameter using this class.
    """

    def __init__(self):
        """Install the constant strand lookups, then run the base setup."""
        for attribute in ('get_position_strand', '_get_strand'):
            # One separate callable per attribute; each ignores its
            # positional arguments and reports the '*' strand.
            setattr(self, attribute, lambda *_: '*')
        super().__init__()

    def set_strand(self, strand):
        """
        Reject any attempt to choose a strand; DNA analysis has none.

        Parameters:
            strand (int): N/A

        Raises:
            ValueError: You cannot call this method for DNA analyses.
        """
        raise ValueError('Cannot set strand value if DNA is True')
reditools/region.py ADDED
@@ -0,0 +1,130 @@
1
+ """Genomic Region."""
2
+
3
+ import re
4
+
5
+
6
class Region(object):
    """
    Genomic Region.

    `enumerate` and `contains` both treat `stop` as exclusive.
    """

    def __init__(self, **kwargs):
        """
        Create a new genomic region.

        Parameters:
            **kwargs (dict):
                string (str): String representation of a region
                OR
                contig (str): Contig name
                start (int): Genomic start (defaults to 1)
                stop (int): Genomic stop (defaults to None)

        Raises:
            ValueError: The contig is missing
        """
        if 'string' in kwargs:
            region = self._parse_string(kwargs['string'])  # noqa:WPS529
            if region is None:
                # A None string carries no contig; fail with the documented
                # error instead of a TypeError from indexing None.
                raise ValueError('Region constructor requires a contig.')
            self.contig = region[0]
            self.start = region[1]
            self.stop = region[2]
        else:
            if 'contig' not in kwargs:
                raise ValueError('Region constructor requires a contig.')
            self.contig = kwargs['contig']
            self.start = self._to_int(kwargs.get('start', 1))
            self.stop = self._to_int(kwargs.get('stop', None))

    def __str__(self):
        """
        Put the region into standard string format.

        Returns:
            (str): contig:start-stop
        """
        region = self.contig
        if self.start:
            region = f'{region}:{self.start}'
            if self.stop:
                region = f'{region}-{self.stop}'
        return region

    def split(self, window):
        """
        Split the region into a list of smaller regions.

        Consecutive sub regions of `window` positions are produced; the
        last one is shortened so that it ends at this region's stop.

        Parameters:
            window (int): The size of the sub regions in bp

        Returns:
            list

        Raises:
            IndexError: The region is missing a start or stop
            ValueError: The window is not a positive number
        """
        if not self.stop or not self.start:
            raise IndexError('Can only split a region with a start and stop.')
        if window <= 0:
            raise ValueError('Window size must be a positive integer.')
        return [
            Region(
                contig=self.contig,
                start=sub_start,
                # Never run past the end of the parent region.
                stop=min(sub_start + window, self.stop),
            )
            for sub_start in range(self.start, self.stop, window)
        ]

    def enumerate(self):
        """
        Convert the region into its individual positions.

        Both start and stop must be set; stop itself is not included.

        Returns:
            Set enumerating the individual positions.
        """
        return set(range(self.start, self.stop))

    def contains(self, contig, position):
        """
        Determines if a given genomic location is within the region.

        A missing start or stop leaves that side unbounded.

        Parameters:
            contig (str): Contig/Chromosome name
            position (int): Position

        Returns:
            bool
        """
        if self.contig != contig:
            return False
        left = self.start is None or self.start <= position
        right = self.stop is None or position < self.stop
        return left and right

    def _parse_string(self, region_str):
        """
        Break a region string into its parts.

        Parameters:
            region_str (str): Text split on ':' and '-'

        Returns:
            (contig, start, stop) tuple with None for absent parts, or
            None when `region_str` is None

        Raises:
            ValueError: The string has more than three parts
        """
        if region_str is None:
            return None
        region = re.split('[:-]', region_str)
        contig = region[0]
        start = None
        stop = None

        if len(region) > 3:
            raise ValueError(f'Unrecognized format: {region_str}.')
        if len(region) > 1:
            start = self._to_int(region[1])
        if len(region) == 3:
            stop = self._to_int(region[2])
        return (contig, start, stop)

    def _to_int(self, number):
        """
        Convert a coordinate to int, passing None through.

        Parameters:
            number: str (whitespace and commas are stripped first),
                None, or anything accepted by int()

        Returns:
            int or None
        """
        if isinstance(number, str):
            return int(re.sub(r'[\s,]', '', number))
        if number is None:
            return None
        return int(number)