REDItools3 3.1a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of REDItools3 might be problematic. Click here for more details.

reditools/analyze.py ADDED
@@ -0,0 +1,552 @@
1
+ """Commandline tool for REDItools."""
2
+
3
+ import argparse
4
+ import csv
5
+ import sys
6
+ import traceback
7
+ from multiprocessing import Process, Queue
8
+ from queue import Empty as EmptyQueueException
9
+ from tempfile import NamedTemporaryFile
10
+
11
+ from reditools import file_utils, reditools, utils
12
+ from reditools.alignment_manager import AlignmentManager
13
+ from reditools.logger import Logger
14
+ from reditools.region import Region
15
+
16
# Dictionary keys used when describing genomic regions.
_contig = 'contig'
_start = 'start'
_stop = 'stop'

# Column headers for the tab-separated output table.  The g* columns
# describe the matched genomic (DNA) sample; this tool writes '-'
# placeholders for them (see write_results).
fieldnames = [
    'Region',
    'Position',
    'Reference',
    'Strand',
    'Coverage-q30',
    'MeanQ',
    'BaseCount[A,C,G,T]',
    'AllSubs',
    'Frequency',
    'gCoverage-q30',
    'gMeanQ',
    'gBaseCount[A,C,G,T]',
    'gAllSubs',
    'gFrequency',
]
36
+
37
+
38
def setup_alignment_manager(options):
    """
    Create an AlignmentManager for REDItools.

    Parameters:
        options (namespace): Commandline arguments

    Returns:
        AlignmentManager
    """
    manager = AlignmentManager(ignore_truncation=True)
    # Read-level filters come straight from the commandline.
    manager.min_quality = options.min_read_quality
    manager.min_length = options.min_read_length
    # Register every input BAM, sharing the same read-exclusion list.
    for bam_path in options.file:
        manager.add_file(bam_path, options.exclude_reads)
    return manager
59
+
60
+
61
def setup_rtools(options):  # noqa:WPS213,WPS231
    """
    Build and configure a REDItools object from commandline options.

    Parameters:
        options (namespace): Commandline arguments from argparse

    Returns:
        A configured REDItools object
    """
    tool_class = reditools.REDItoolsDNA if options.dna else reditools.REDItools
    rtools = tool_class()

    # Logging verbosity: debug wins over verbose.
    if options.debug:
        rtools.log_level = Logger.debug_level
    elif options.verbose:
        rtools.log_level = Logger.info_level

    if options.load_omopolymeric_file:
        rtools.exclude(file_utils.load_omopolymeric_regions(
            options.load_omopolymeric_file,
        ))
    if options.create_omopolymeric_file:
        rtools.create_omopolymeric_positions(
            options.create_omopolymeric_file,
            options.omopolymeric_span,
        )
    if options.splicing_file:
        rtools.load_splicing_file(
            options.splicing_file,
            options.splicing_span,
        )
    if options.bed_file:
        rtools.load_target_positions(options.bed_file)
    for bed_fname in options.exclude_regions or ():
        rtools.exclude(file_utils.read_bed_file(bed_fname))
    if options.reference:
        rtools.add_reference(options.reference)

    # Per-base and per-column filtering thresholds.
    rtools.min_base_position = options.min_base_position
    rtools.max_base_position = options.max_base_position
    rtools.min_base_quality = options.min_base_quality
    rtools.min_column_length = options.min_column_length
    rtools.min_edits = options.min_edits
    rtools.min_edits_per_nucleotide = options.min_edits_per_nucleotide

    # Strand handling.
    rtools.strand = options.strand
    rtools.strand_confidence_threshold = options.strand_confidence_threshold
    if options.strand_correction:
        rtools.use_strand_correction()
    if options.exclude_multis:
        rtools.only_one_alt()

    return rtools
125
+
126
+
127
def region_args(bam_fname, region, window):
    """
    Split a region into segments for parallel processing.

    Parameters:
        bam_fname (str): BAM file to collect contig info from
        region (Region): Genomic region to split, or None to use every
            contig in the BAM file
        window (int): Size of each sub-region; 0 keeps regions whole

    Returns:
        (list): Sub regions
    """
    if region is not None:
        return region.split(window) if window else [region]

    # No explicit region: cover every contig in the BAM file.
    segments = []
    for contig, size in utils.get_contigs(bam_fname):
        whole_contig = Region(contig=contig, start=1, stop=size + 1)
        if window:
            segments.extend(whole_contig.split(window))
        else:
            segments.append(whole_contig)
    return segments
152
+
153
+
154
def write_results(rtools, sam_manager, file_name, region, output_format):
    """
    Write the results from a REDItools analysis to a temporary file.

    Parameters:
        rtools (REDItools): REDItools instance
        sam_manager (AlignmentManager): Source of reads
        file_name (string): Input file name for analysis
            (currently unused; kept for interface compatibility)
        region: Region to analyze
        output_format (dict): keyword arguments for csv.writer constructor.

    Returns:
        string: Name of the temporary file.
    """
    with NamedTemporaryFile(mode='w', delete=False) as stream:
        writer = csv.writer(stream, **output_format)
        for rt_result in rtools.analyze(sam_manager, region):
            variants = rt_result.variants
            writer.writerow([
                rt_result.contig,
                rt_result.position,
                rt_result.reference,
                rt_result.strand,
                rt_result.depth,
                f'{rt_result.mean_quality:.2f}',
                rt_result.per_base_depth,
                ' '.join(sorted(variants)) if variants else '-',
                f'{rt_result.edit_ratio:.2f}',
                # Five separate placeholder fields for the genomic (g*)
                # columns.  A single '\t'-joined string would contain the
                # delimiter and be quoted by csv.writer, producing one
                # malformed column instead of five.
                *(['-'] * 5),
            ])
    return stream.name
185
+
186
+
187
def run(options, in_queue, out_queue):
    """
    Analyze genomic segments using REDItools in a worker process.

    Pulls (index, region) pairs from in_queue until a None sentinel is
    seen; each chunk's results are written to a temporary file whose
    path is reported on out_queue.

    Parameters:
        options (namespace): Configuration options from argparse for REDItools
        in_queue (Queue): Queue of input arguments for analysis
        out_queue (Queue): Queue to store paths to analysis results

    Returns:
        bool: True when the in_queue sentinel is reached
    """
    try:
        rtools = setup_rtools(options)
        while True:
            args = in_queue.get()
            if args is None:
                return True
            sam_manager = setup_alignment_manager(options)
            idx, region = args
            file_name = write_results(
                rtools,
                sam_manager,
                options.file,
                region,
                options.output_format,
            )
            out_queue.put((idx, file_name))
    except Exception as exc:
        if options.debug:
            traceback.print_exception(*sys.exc_info())
        sys.stderr.write(f'[ERROR] {exc}\n')
        # Exit non-zero so the parent's check_dead() detects the failure.
        # Returning normally would leave exitcode 0 and the monitor loop
        # waiting forever for output that will never arrive.
        sys.exit(1)
219
+
220
+
221
def parse_options():  # noqa:WPS213
    """
    Parse commandline options for REDItools.

    Returns:
        namespace: commandline args
    """
    parser = argparse.ArgumentParser(description='REDItools 2.0')
    parser.add_argument(
        'file',
        nargs='+',
        help='The bam file to be analyzed',
    )
    parser.add_argument(
        '-r',
        '--reference',
        help='The reference FASTA file',
    )
    parser.add_argument(
        '-o',
        '--output-file',
        help='The output statistics file',
    )
    parser.add_argument(
        '-s',
        '--strand',
        choices=(0, 1, 2),
        type=int,
        default=0,
        help='Strand: this can be 0 (unstranded), ' +
        '1 (secondstrand oriented) or ' +
        '2 (firststrand oriented)',
    )
    parser.add_argument(
        '-a',
        '--append-file',
        action='store_true',
        help='Appends results to file (and creates if not existing)',
    )
    parser.add_argument(
        '-g',
        '--region',
        help='The region of the bam file to be analyzed',
    )
    parser.add_argument(
        '-m',
        '--load-omopolymeric-file',
        help='The file containing the omopolymeric positions',
    )
    # This option's value is consumed as an output path by
    # setup_rtools/create_omopolymeric_positions, so it must take an
    # argument; the previous action='store_true' made it a boolean.
    parser.add_argument(
        '-c',
        '--create-omopolymeric-file',
        help='Path to write omopolymeric positions to',
    )
    parser.add_argument(
        '-os',
        '--omopolymeric-span',
        type=int,
        default=5,
        help='The omopolymeric span',
    )
    parser.add_argument(
        '-sf',
        '--splicing-file',
        help='The file containing the splicing sites positions',
    )
    parser.add_argument(
        '-ss',
        '--splicing-span',
        type=int,
        default=4,
        help='The splicing span',
    )
    parser.add_argument(
        '-mrl',
        '--min-read-length',
        type=int,
        default=30,  # noqa:WPS432
        help='Reads whose length is below this value will be discarded.',
    )
    parser.add_argument(
        '-q',
        '--min-read-quality',
        type=int,
        default=20,  # noqa:WPS432
        help='Reads with mapping quality below this value will be discarded.',
    )
    parser.add_argument(
        '-bq',
        '--min-base-quality',
        type=int,
        default=30,  # noqa:WPS432
        help='Base quality below this value will not be included in ' +
        'the analysis.',
    )
    parser.add_argument(
        '-mbp',
        '--min-base-position',
        type=int,
        default=0,
        help='Bases which reside in a previous position (in the read) ' +
        'will not be included in the analysis.',
    )
    parser.add_argument(
        '-Mbp',
        '--max-base-position',
        type=int,
        default=0,
        help='Bases which reside in a further position (in the read) ' +
        'will not be included in the analysis.',
    )
    parser.add_argument(
        '-l',
        '--min-column-length',
        type=int,
        default=1,
        help='Positions whose columns have length below this value will ' +
        'not be included in the analysis.',
    )
    parser.add_argument(
        '-e',
        '--exclude-multis',
        default=False,
        help='Do not report any position with more than one alternate base.',
        action='store_true',
    )
    parser.add_argument(
        '-men',
        '--min-edits-per-nucleotide',
        type=int,
        default=0,
        help='Positions whose columns have bases with less than ' +
        'min-edits-per-base edits will not be included in the analysis.',
    )
    parser.add_argument(
        '-me',
        '--min-edits',
        type=int,
        default=0,  # noqa:WPS432
        help='The minimum number of editing events (per position). ' +
        'Positions whose columns have bases with less than ' +
        '"min-edits-per-base edits" will not be included in the ' +
        'analysis.',
    )
    parser.add_argument(
        '-Men',
        '--max-editing-nucleotides',
        type=int,
        default=100,  # noqa:WPS432
        help='The maximum number of editing nucleotides, from 0 to 4 ' +
        '(per position). Positions whose columns have more than ' +
        '"max-editing-nucleotides" will not be included in the analysis.',
    )
    parser.add_argument(
        '-T',
        '--strand-confidence-threshold',
        type=float,
        default=0.7,  # noqa:WPS432
        help='Only report the strandedness if at least this proportion of ' +
        'reads are of a given strand',
    )
    parser.add_argument(
        '-C',
        '--strand-correction',
        default=False,
        help='Strand correction. Once the strand has been inferred, ' +
        'only bases according to this strand will be selected.',
        action='store_true',
    )
    parser.add_argument(
        '-V',
        '--verbose',
        default=False,
        help='Verbose information in stderr',
        action='store_true',
    )
    parser.add_argument(
        '-N',
        '--dna',
        default=False,
        help='Run REDItools 2.0 on DNA-Seq data',
        action='store_true',
    )
    parser.add_argument(
        '-B',
        '--bed_file',
        help='Path of BED file containing target regions',
    )
    parser.add_argument(
        '-t',
        '--threads',
        help='Number of threads to run',
        type=int,
        default=1,
    )
    parser.add_argument(
        '-w',
        '--window',
        help='How many bp should be processed by each thread at a time. ' +
        'Defaults to full contig.',
        type=int,
        default=0,
    )
    parser.add_argument(
        '-k',
        '--exclude_regions',
        nargs='+',
        help='Path of BED file containing regions to exclude from analysis',
    )
    parser.add_argument(
        '-E',
        '--exclude_reads',
        help='Path to a text file listing read names to exclude from analysis',
    )
    parser.add_argument(
        '-d',
        '--debug',
        default=False,
        help='REDItools is run in DEBUG mode.',
        action='store_true',
    )

    return parser.parse_args()
446
+
447
+
448
def check_dead(processes):
    """
    Look through processes to determine if any have died unexpectedly.

    If any process has an exit code of 1, this method will terminate all
    other processes and then exit with code 1.

    Parameters:
        processes (list): Processes to check
    """
    if not any(proc.exitcode == 1 for proc in processes):
        return
    for victim in processes:
        victim.kill()
    sys.stderr.write('[ERROR] Killing job\n')
    sys.exit(1)
464
+
465
+
466
def main():
    """Perform RNA editing analysis."""
    options = parse_options()
    options.output_format = {'delimiter': '\t', 'lineterminator': '\n'}
    options.encoding = 'utf-8'
    if options.exclude_reads:
        options.exclude_reads = file_utils.load_text_file(
            options.exclude_reads,
        )

    # Break the requested region (or every contig) into analysis chunks.
    target = Region(string=options.region) if options.region else None
    regions = region_args(options.file[0], target, window=options.window)

    # Fill the work queue, followed by one stop sentinel per worker.
    in_queue = Queue()
    for indexed_region in enumerate(regions):
        in_queue.put(indexed_region)
    for _ in range(options.threads):
        in_queue.put(None)

    # Launch the workers and merge their per-chunk output files.
    out_queue = Queue()
    workers = [
        Process(target=run, args=(options, in_queue, out_queue))
        for _ in range(options.threads)
    ]
    concat_output(
        options,
        monitor(workers, out_queue, in_queue.qsize()),
    )
501
+
502
+
503
def monitor(processes, out_queue, chunks):
    """
    Monitor parallel REDItools jobs.

    Parameters:
        processes (list): Worker processes (not yet started)
        out_queue (Queue): Output of threads
        chunks (int): Number of queued items, including one stop
            sentinel per process

    Returns:
        list: Temporary files containing the output of each chunk.
    """
    # chunks counts the per-worker sentinels too; only real regions
    # produce an output file.
    tfs = [None] * (chunks - len(processes))

    for prc in processes:
        prc.start()

    while None in tfs:
        try:
            # Block for up to a second per attempt.  The previous
            # get(block=False, timeout=1) ignored the timeout and
            # busy-waited at full CPU.
            idx, fname = out_queue.get(timeout=1)
            tfs[idx] = fname
        except EmptyQueueException:
            check_dead(processes)
    return tfs
527
+
528
+
529
def concat_output(options, tfs):
    """
    Write the output of a REDItools analysis.

    Parameters:
        options (namespace): Commandline options for file formatting.
        tfs (list): Temporary files containing REDItools results
    """
    # Setup final output file
    if options.output_file:
        mode = 'a' if options.append_file else 'w'
        stream = file_utils.open_stream(
            options.output_file,
            mode,
            encoding=options.encoding,
        )
        owns_stream = True
    else:
        stream = sys.stdout
        owns_stream = False

    try:
        writer = csv.writer(stream, **options.output_format)
        if not options.append_file:
            writer.writerow(fieldnames)
        file_utils.concat(stream, *tfs, encoding=options.encoding)
    finally:
        # Only close streams this function opened; the previous
        # `with stream:` closed sys.stdout when no output file was given.
        if owns_stream:
            stream.close()
@@ -0,0 +1,133 @@
1
+ """Organizational structure for tracking base coverage of genomic positions."""
2
+
3
+
4
class CompiledPosition(object):
    """Tracks base frequency for a genomic position."""

    _bases = 'ACGT'
    _comp = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

    def __init__(self, ref, contig, position):
        """
        Create a new compiled position.

        Parameters:
            ref (string): The reference base for this position
            contig (string): Chromosome name
            position (int): Genomic coordinate
        """
        self.qualities = []
        self.strands = []
        self.bases = []
        # Lazily built {base: count} cache; False means "not computed".
        self.counter = False
        self.ref = ref
        self.contig = contig
        self.position = position

    def __len__(self):
        """
        Position depth.

        Returns:
            int
        """
        return len(self.bases)

    def __getitem__(self, base):
        """
        Frequency of a given nucleotide at this position.

        Parameters:
            base (str): The nucleotide (A, C, G, T, or REF)

        Returns:
            int: The total number of reads with the given base
        """
        if not self.counter:
            # Rebuild the per-base tally from the raw observations.
            counts = dict.fromkeys(self._bases, 0)
            for observed in self.bases:
                counts[observed] += 1
            self.counter = counts
        lookup = self.ref if base.upper() == 'REF' else base
        return self.counter[lookup]

    def __iter__(self):
        """
        Iterate over each base frequency.

        Returns:
            iterator
        """
        for base in self._bases:
            yield self[base]

    def add_base(self, quality, strand, base):
        """
        Add details for a base at this position.

        Parameters:
            quality (int): The quality of the read
            strand (str): The strand the base is on (+, -, or *)
            base (str): The nucleotide at the position (A, C, G, or T)
        """
        self.qualities.append(quality)
        self.strands.append(strand)
        self.bases.append(base)
        # Invalidate the cached tally.
        self.counter = False

    def complement(self):
        """Modify all the summarized nucleotides to their complements."""
        self.bases = [self._comp[base] for base in self.bases]
        self.ref = self._comp[self.ref]
        if self.counter:
            # Re-key the cached tally under the complementary bases.
            self.counter = {
                self._comp[base]: tally
                for base, tally in self.counter.items()
            }

    def get_variants(self):
        """
        List all detected variants.

        Returns:
            list
        """
        return [
            base
            for base in set(self._bases) - {self.ref}
            if self[base]
        ]

    def get_strand(self, threshold=0):
        """
        Determine the mean strandedness of a position.

        Parameters:
            threshold (int): Confidence minimum for strand identification

        Returns:
            '+', '-', or '*'
        """
        tallies = {'+': 0, '-': 0, '*': 0}
        for observed in self.strands:
            tallies[observed] += 1
        stranded_total = tallies['+'] + tallies['-']
        if not stranded_total:
            return '*'

        best = max(tallies, key=tallies.get)
        if tallies[best] / stranded_total >= threshold:
            return best
        return '*'

    def filter_by_strand(self, strand):
        """
        Remove all bases not on the strand.

        Parameters:
            strand (str): Either +, -, or *
        """
        keep = [
            idx
            for idx, observed in enumerate(self.strands)
            if observed == strand
        ]
        self.qualities = self._filter(self.qualities, keep)
        self.strands = self._filter(self.strands, keep)
        self.bases = self._filter(self.bases, keep)
        self.counter = False

    def _filter(self, lst, indx):
        """Return the items of lst at the given indices."""
        return [lst[idx] for idx in indx]