REDItools3-3.1a0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of REDItools3 might be problematic.

@@ -0,0 +1,131 @@
+ """Organizational structure for tracking base coverage of genomic positions."""
+
+ from reditools.compiled_position import CompiledPosition
+
+ inf = float('inf')
+
+
+ class CompiledReads(object):
+     """Manager for CompiledPositions."""
+
+     _strands = ('-', '+', '*')
+
+     def __init__(
+         self,
+         strand=0,
+         min_base_position=0,
+         max_base_position=inf,
+         min_base_quality=0,
+     ):
+         """
+         Create a new CompiledReads object.
+
+         Parameters:
+             strand (int): Strand detection mode
+             min_base_position (int): Left trims bases
+             max_base_position (int): Right trims bases
+             min_base_quality (int): Minimum base quality to report
+         """
+         self._nucleotides = {}
+         if strand == 0:
+             self.get_strand = lambda read: read.is_reverse
+         else:
+             self.get_strand = self._get_strand
+
+         self._strand_one = strand == 1
+         self._ref = None
+         self._ref_seq = self._get_ref_from_read
+
+         self._qc = {
+             'min_base_quality': min_base_quality,
+             'min_base_position': min_base_position,
+             'max_base_position': max_base_position,
+         }
+
+     def add_reference(self, ref):
+         """
+         Add a reference FASTA file to use.
+
+         Parameters:
+             ref (RTFastaFile): Reference sequence
+         """
+         self._ref = ref
+         self._ref_seq = self._get_ref_from_fasta
+
+     def add_reads(self, reads):
+         """
+         Add an iterable of pysam reads to the object.
+
+         The reads are broken down into individual nucleotides that are
+         tracked by chromosomal location.
+
+         Parameters:
+             reads (iterable): pysam reads
+         """
+         for read in reads:
+             strand = self._strands[self.get_strand(read)]
+             for pos, base, quality, ref in self._prep_read(read):
+                 try:
+                     self._nucleotides[pos].add_base(quality, strand, base)
+                 except KeyError:
+                     self._nucleotides[pos] = CompiledPosition(
+                         ref=ref,
+                         position=pos,
+                         contig=read.reference_name,
+                     )
+                     self._nucleotides[pos].add_base(quality, strand, base)
+
+     def pop(self, position):
+         """
+         Remove and return the CompiledPosition at position.
+
+         Method returns None if the position is empty.
+
+         Parameters:
+             position (int): The chromosomal location to pop
+
+         Returns:
+             A CompiledPosition or None if position is empty.
+         """
+         return self._nucleotides.pop(position, None)
+
+     def is_empty(self):
+         """
+         Determine if there are any CompiledPositions still in the object.
+
+         Returns:
+             True if the object is empty, else False
+         """
+         return not self._nucleotides
+
+     def _get_ref_from_read(self, read):
+         return list(read.get_reference_sequence().upper())
+
+     def _get_ref_from_fasta(self, read):
+         pairs = read.get_aligned_pairs(matches_only=True)
+         indices = [ref for _, ref in pairs]
+         return self._ref.get_base(read.reference_name, *indices)
+
+     def _qc_base_position(self, read, position):
+         return read.query_length - position >= self._qc['max_base_position']
+
+     def _prep_read(self, read):
+         pairs = read.get_aligned_pairs(matches_only=True)
+         seq = read.query_sequence.upper()
+         qualities = read.query_qualities
+         ref_seq = self._ref_seq(read)
+         while pairs and pairs[0][0] < self._qc['min_base_position']:
+             pairs.pop(0)
+             ref_seq.pop(0)
+         if not pairs:
+             return
+
+         while pairs and self._qc_base_position(read, pairs[0][0]):
+             offset, ref_pos = pairs.pop(0)
+             ref_base = ref_seq.pop(0)
+             if ref_base != 'N' != seq[offset]:
+                 if qualities[offset] >= self._qc['min_base_quality']:
+                     yield (ref_pos, seq[offset], qualities[offset], ref_base)
+
+     def _get_strand(self, read):
+         return read.is_read2 ^ self._strand_one ^ read.is_reverse
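
For orientation, a rough sketch of how this manager might be driven with pysam. The import paths for CompiledReads and RTFastaFile are guesses (the file names are not shown in this diff), and the BAM/FASTA paths are hypothetical.

    from pysam import AlignmentFile

    from reditools.compiled_reads import CompiledReads   # import path is a guess
    from reditools.fasta_file import RTFastaFile         # import path is a guess

    bam = AlignmentFile('sample.bam', 'rb')              # hypothetical input BAM
    compiler = CompiledReads(strand=1, min_base_quality=20)
    compiler.add_reference(RTFastaFile('genome.fa'))     # optional; otherwise the read's own reference sequence is used

    compiler.add_reads(bam.fetch('chr1', 0, 10000))
    for pos in range(10000):
        compiled = compiler.pop(pos)    # CompiledPosition, or None if nothing aligned here
        if compiled is not None:
            ...                         # inspect per-strand base counts via the CompiledPosition API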
@@ -0,0 +1,68 @@
+ """Wrappers for PysamFastaFile."""
+
+ from pysam.libcfaidx import FastaFile as PysamFastaFile
+
+
+ class RTFastaFile(PysamFastaFile):
+     """Wrapper for pysam.FastaFile to provide a sequence cache."""
+
+     def __new__(cls, *args, **kwargs):
+         r"""
+         Create a wrapper for pysam.FastaFile.
+
+         Parameters:
+             *args (list): positional arguments for PysamFastaFile constructor
+             **kwargs (dict): named arguments for PysamFastaFile constructor
+
+         Returns:
+             PysamFastaFile
+         """
+         return PysamFastaFile.__new__(cls, *args, **kwargs)
+
+     def __init__(self, *args, **kwargs):
+         r"""
+         Create a wrapper for pysam.FastaFile.
+
+         Parameters:
+             *args (list): positional arguments for PysamFastaFile constructor
+             **kwargs (dict): named arguments for PysamFastaFile constructor
+         """
+         PysamFastaFile.__init__(self)
+
+         self._contig_name = False
+         self._contig_cache = None
+
+     def get_base(self, contig, *position):
+         """
+         Retrieve the base at the given position.
+
+         Parameters:
+             contig (string): Chromosome name
+             position (int): Zero-indexed position on reference
+
+         Returns:
+             Base at the position as a string.
+
+         Raises:
+             IndexError: The position is not within the contig
+         """
+         if contig != self._contig_name:
+             self._update_contig_cache(contig)
+         try:
+             if len(position) == 1:
+                 return self._contig_cache[position[0]]
+             return [self._contig_cache[idx] for idx in position]
+         except IndexError as exc:
+             raise IndexError(
+                 f'Base position {position} is outside the bounds of ' +
+                 f'{contig}. Are you using the correct reference?',
+             ) from exc
+
+     def _update_contig_cache(self, contig):
+         keys = (contig, f'chr{contig}', contig.replace('chr', ''))
+         for ref in keys:
+             if ref in self:
+                 self._contig_cache = self.fetch(ref).upper()
+                 self._contig_name = contig
+                 return
+         raise KeyError(f'Reference name {contig} not found in FASTA file.')
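
For orientation, a short sketch of get_base. The import path is a guess (this file's name is not shown in the diff) and genome.fa stands in for an indexed FASTA.

    from reditools.fasta_file import RTFastaFile   # import path is a guess

    ref = RTFastaFile('genome.fa')

    # A single position returns one base; the contig cache means repeated
    # queries against the same chromosome avoid re-fetching its sequence.
    base = ref.get_base('chr1', 0)

    # Several positions return a list of bases in the same order. Contig names
    # are matched with or without a 'chr' prefix by _update_contig_cache.
    bases = ref.get_base('1', 100, 101, 102)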
@@ -0,0 +1,132 @@
+ """Miscellaneous utility functions."""
+
+ import csv
+ import os
+ from collections import defaultdict
+ from gzip import open as gzip_open
+
+ from sortedcontainers import SortedSet
+
+ from reditools.region import Region
+
+
+ def open_stream(path, mode='rt', encoding='utf-8'):
+     """
+     Open an input or output stream from a file, accounting for gzip.
+
+     Parameters:
+         path (str): Path to file for reading or writing
+         mode (str): File mode
+         encoding (str): File encoding
+
+     Returns:
+         TextIOWrapper to the file
+     """
+     if path.endswith('gz'):
+         return gzip_open(path, mode, encoding=encoding)
+     return open(path, mode, encoding=encoding)  # noqa:WPS515
+
+
+ def read_bed_file(path):
+     """
+     Return an iterator for a BED file.
+
+     Parameters:
+         path (str): Path to a BED file for reading.
+
+     Yields:
+         BED file contents as Regions.
+     """
+     stream = open_stream(path)
+     reader = csv.reader(
+         filter(lambda row: row[0] != '#', stream),
+         delimiter='\t',
+     )
+     yield from (Region(
+         contig=row[0],
+         start=row[1],
+         stop=row[2],
+     ) for row in reader)
+
+
+ def concat(output, *fnames, clean_up=True, encoding='utf-8'):
+     """
+     Combine one or more files into another file.
+
+     Parameters:
+         output (file): A file-like object for writing
+         *fnames (string): Paths to files for concatenation
+         clean_up (bool): If True, deletes the files after concatenation
+         encoding (string): File encoding
+     """
+     for fname in fnames:
+         with open(fname, 'r', encoding=encoding) as stream:
+             for line in stream:
+                 output.write(line)
+         if clean_up:
+             os.remove(fname)
+
+
+ def load_poly_regions(fname):
+     """
+     Read omopolymeric positions from a file.
+
+     Parameters:
+         fname (str): File path
+
+     Returns:
+         (dict): Contigs and regions
+     """
+     poly_regions = defaultdict(set)
+     for region in read_bed_file(fname):
+         poly_regions[region.contig].add(region)
+     return poly_regions
+
+
+ def load_splicing_file(splicing_file, span):
+     """
+     Read splicing positions from a file.
+
+     Parameters:
+         splicing_file (str): File path
+         span (int): Width of splice sites
+
+     Returns:
+         (dict): Contig and positions
+     """
+     splice_positions = defaultdict(SortedSet)
+     strand_map = {'-': 'D', '+': 'A'}
+
+     with open_stream(splicing_file, 'r') as stream:
+         for line in stream:
+             fields = line.strip().split()
+
+             chrom = fields[0]
+             strand = fields[4]
+             splice = fields[3]
+             coord = int(fields[1])
+
+             coe = -1 if strand_map.get(strand, None) == splice else 1
+             new_positions = [1 + coord + coe * fctr for fctr in range(span)]
+             splice_positions[chrom] |= new_positions
+     return splice_positions
+
+
+ def load_text_file(file_name):
+     """
+     Extract file contents to a list.
+
+     Parameters:
+         file_name (str): The file to open.
+
+     Returns:
+         List of content
+     """
+     with open_stream(file_name, 'r') as stream:
+         return [line.strip() for line in stream]
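
For orientation, a short usage sketch of the two readers above. The file names are hypothetical, and Region is assumed to expose the contig, start, and stop values it is constructed with.

    from reditools import file_utils

    # open_stream picks gzip or plain text based on the file extension.
    with file_utils.open_stream('edits.txt.gz') as stream:
        first_line = next(stream)

    # read_bed_file skips '#' comment lines and yields Region objects
    # built from the first three BED columns.
    for region in file_utils.read_bed_file('targets.bed'):
        print(region.contig, region.start, region.stop)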
@@ -0,0 +1,92 @@
+ """Repeat Sequence Identifier."""
+ import argparse
+ import sys
+
+ from pysam import FastaFile
+
+ from reditools import file_utils
+
+
+ def find_homo_seqs(seq, length=5):
+     """
+     Locate regions of repeated bases.
+
+     Parameters:
+         seq (str): The DNA sequence
+         length (int): Minimum number of sequential repeats.
+
+     Yields:
+         start, stop, base
+     """
+     h_base = None
+     start = 0
+     count = 0
+
+     for pos, base in enumerate(seq):
+         if base == h_base:
+             count += 1
+         else:
+             if count >= length:
+                 yield (start, start + count, h_base)
+             count = 0
+             start = pos
+             h_base = base
+     if count >= length:
+         yield (start, start + count, h_base)
+
+
+ def parse_options():
+     """
+     Parse commandline arguments.
+
+     Returns:
+         namespace
+     """
+     parser = argparse.ArgumentParser(description='REDItools 2.0')
+     parser.add_argument(
+         'file',
+         help='The fasta file to be analyzed',
+     )
+     parser.add_argument(
+         '-l',
+         '--min-length',
+         type=int,
+         default=5,
+         help='Minimum length of repeat region',
+     )
+     parser.add_argument(
+         '-o',
+         '--output',
+         help='Destination to write results. Default is to use STDOUT. ' +
+         'If the filename ends in .gz, the contents will be gzipped.',
+     )
+
+     return parser.parse_args()
+
+
+ def main():
+     """Report repetitive regions."""
+     options = parse_options()
+     fasta = FastaFile(options.file)
+
+     if options.output:
+         stream = file_utils.open_stream(
+             options.output,
+             'wt',
+             encoding='utf-8',
+         )
+     else:
+         stream = sys.stdout
+
+     for seq_name in fasta.references:
+         seq = fasta.fetch(seq_name)
+         for region in find_homo_seqs(seq, options.min_length):
+             fields = [
+                 seq_name,
+                 region[0],
+                 region[1],
+                 region[1] - region[0],
+                 region[2],
+             ]
+             as_str = [str(_) for _ in fields]
+             stream.write('\t'.join(as_str) + '\n')
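
A quick sanity check of find_homo_seqs as written. The import path below is a guess, since this file's name is not shown in the diff.

    from reditools.find_repeats import find_homo_seqs  # import path is a guess

    print(list(find_homo_seqs('ACGTAAAAAACGT', length=5)))
    # The six-base run of As starting at index 4 is reported: [(4, 9, 'A')]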
reditools/index.py ADDED
@@ -0,0 +1,268 @@
+ """Commandline tool for REDItools."""
+
+ import argparse
+ import csv
+ import sys
+ from itertools import permutations
+ from json import loads as load_json
+
+ from reditools.file_utils import open_stream, read_bed_file
+ from reditools.region import Region
+
+ _ref = 'Reference'
+ _position = 'Position'
+ _contig = 'Region'
+ _count = 'BaseCount[A,C,G,T]'
+ _strand = 'Strand'
+ _nucs = 'ACGT'
+ _ref_set = {f'{nuc}-{nuc}' for nuc in _nucs}
+
+
+ class Index(object):
+     """Utility for calculating editing indices."""
+
+     def __init__(self, region=None, strand=0):
+         """
+         Create a new Index.
+
+         Parameters:
+             region (Region): Limit results to the given genomic region
+             strand (int): Either 0, 1, or 2 for unstranded, reverse, or forward
+         """
+         self.targets = {}
+         self.exclusions = {}
+         self.counts = {'-'.join(_): 0 for _ in permutations(_nucs, 2)}
+         self.region = region
+         self.strand = ['*', '-', '+'][strand]
+
+     def add_target_from_bed(self, fname):
+         """
+         Only report index data for regions from a given BED file.
+
+         Parameters:
+             fname (str): Path to BED formatted file.
+         """
+         for region in read_bed_file(fname):
+             self.targets[region.contig] = update_region_dict(
+                 self.targets,
+                 region,
+             )
+
+     def add_exclusions_from_bed(self, fname):
+         """
+         Exclude index data for regions from a given BED file.
+
+         Parameters:
+             fname (str): Path to BED formatted file.
+         """
+         for region in read_bed_file(fname):
+             self.exclusions[region.contig] = update_region_dict(
+                 self.exclusions,
+                 region,
+             )
+
+     def in_region_list(self, region_list, contig, position):
+         """
+         Check if a genomic position is in a list of regions.
+
+         Parameters:
+             region_list (dict): Region list to check
+             contig (str): Contig/Chromosome name
+             position (int): Coordinate
+
+         Returns:
+             True if the position is present, else False
+         """
+         return position in region_list.get(contig, [])
+
+     def in_targets(self, contig, position):
+         """
+         Check if a genomic position is in the target list.
+
+         Parameters:
+             contig (str): Contig/Chromosome name
+             position (int): Coordinate
+
+         Returns:
+             True if there are no targets or the position is in the target
+             list; else False
+         """
+         return not self.targets or self.in_region_list(
+             self.targets,
+             contig,
+             position,
+         )
+
+     def in_exclusions(self, contig, position):
+         """
+         Check if a genomic position is in the exclusions list.
+
+         Parameters:
+             contig (str): Contig/Chromosome name
+             position (int): Coordinate
+
+         Returns:
+             True if there are exclusions and the position is in the
+             exclusions list; else False
+         """
+         return self.exclusions and self.in_region_list(
+             self.exclusions,
+             contig,
+             position,
+         )
+
+     def do_ignore(self, row):
+         """
+         Check whether a row should be ignored.
+
+         Parameters:
+             row (dict): Row from a REDItools output file.
+
+         Returns:
+             True if the row should be discarded; else False
+         """
+         if '*' != self.strand != row[_strand]:
+             return True
+         if self.region:
+             if not self.region.contains(row[_contig], row[_position]):
+                 return True
+         if self.in_exclusions(row[_contig], row[_position]):
+             return True
+         return not self.in_targets(row[_contig], row[_position])
+
+     def add_rt_output(self, fname):
+         """
+         Count the number of reads with matches and substitutions.
+
+         Parameters:
+             fname (str): File path to a REDItools output
+         """
+         stream = open_stream(fname)
+         reader = csv.DictReader(stream, delimiter='\t')
+         for row in reader:
+             if self.do_ignore(row):
+                 continue
+             ref = row[_ref]
+             reads = load_json(row[_count])
+             for nuc, count in zip(_nucs, reads):
+                 key = f'{nuc}-{ref}'
+                 self.counts[key] = self.counts.get(key, 0) + count
+         stream.close()
+
+     def calc_index(self):
+         """
+         Compute all editing indices.
+
+         Returns:
+             Dictionary of indices
+         """
+         keys = set(self.counts) - _ref_set
+         indices = {}
+         for idx in keys:
+             ref = idx[-1]
+             numerator = self.counts[idx]
+             denominator = self.counts.get(self.ref_edit(ref), 0) + numerator
+             if denominator == 0:
+                 indices[idx] = 0
+             else:
+                 indices[idx] = numerator / denominator
+         return indices
+
+     def ref_edit(self, ref):
+         """
+         Format a base as a non-edit.
+
+         Parameters:
+             ref (str): Reference base
+
+         Returns:
+             A string in the format of {ref}-{ref}
+         """
+         return f'{ref}-{ref}'
+
+
+ def parse_options():  # noqa:WPS213
+     """
+     Parse commandline options for REDItools.
+
+     Returns:
+         namespace: commandline args
+     """
+     parser = argparse.ArgumentParser(description='REDItools 2.0')
+     parser.add_argument(
+         'file',
+         nargs='+',
+         help='The REDItools output file to be analyzed',
+     )
+     parser.add_argument(
+         '-o',
+         '--output-file',
+         help='The output statistics file',
+     )
+     parser.add_argument(
+         '-s',
+         '--strand',
+         choices=(0, 1, 2),
+         type=int,
+         default=0,
+         help='Strand: this can be 0 (unstranded), ' +
+         '1 (secondstrand oriented) or ' +
+         '2 (firststrand oriented)',
+     )
+     parser.add_argument(
+         '-g',
+         '--region',
+         help='The genomic region to be analyzed',
+     )
+     parser.add_argument(
+         '-B',
+         '--bed_file',
+         nargs='+',
+         help='Path of BED file containing target regions',
+     )
+     parser.add_argument(
+         '-k',
+         '--exclude_regions',
+         nargs='+',
+         help='Path of BED file containing regions to exclude from analysis',
+     )
+
+     return parser.parse_args()
+
+
+ def main():
+     """Perform RNA editing analysis."""
+     options = parse_options()
+     if options.region:
+         indexer = Index(Region(string=options.region), strand=options.strand)
+     else:
+         indexer = Index(strand=options.strand)
+
+     if options.exclude_regions:
+         for exc_fname in options.exclude_regions:
+             indexer.add_exclusions_from_bed(exc_fname)
+
+     if options.bed_file:
+         for trg_fname in options.bed_file:
+             indexer.add_target_from_bed(trg_fname)
+
+     if options.output_file:
+         stream = open_stream(options.output_file, 'w')
+     else:
+         stream = sys.stdout
+
+     for fname in options.file:
+         indexer.add_rt_output(fname)
+
+     for nuc, idx in sorted(indexer.calc_index().items()):
+         stream.write(f'{nuc}\t{idx}\n')
+
+
+ def update_region_dict(region_dict, region):
+     """
+     Add a region to a region dictionary.
+
+     Parameters:
+         region_dict (dict): Region dictionary
+         region (Region): Region to add
+
+     Returns:
+         The updated set of positions for the region's contig
+     """
+     return region_dict.get(region.contig, set()) | region.enumerate()
+
+
+ if __name__ == '__main__':
+     main()
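
For orientation, a minimal sketch of using the Index class outside the CLI. The file names are hypothetical; as the key construction in add_rt_output suggests, count keys are observed-reference pairs, so A-to-G editing appears under 'G-A'.

    from reditools.index import Index

    indexer = Index(strand=0)
    indexer.add_exclusions_from_bed('known_snps.bed')   # optional exclusion regions
    indexer.add_rt_output('sample1.reditools.tsv.gz')   # one or more REDItools output tables

    for substitution, value in sorted(indexer.calc_index().items()):
        print(f'{substitution}\t{value}')

Since the module ships as reditools/index.py with a __main__ guard, the same analysis should also be runnable as: python -m reditools.index sample1.reditools.tsv.gz -s 0 -k known_snps.bed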