REDItools3-3.1a0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of REDItools3 might be problematic.

@@ -0,0 +1,131 @@
+ """Organizational structure for tracking base coverage of genomic positions."""
+
+ from reditools.compiled_position import CompiledPosition
+
+ inf = float('inf')
+
+
+ class CompiledReads(object):
+     """Manager for CompiledPositions."""
+
+     _strands = ('-', '+', '*')
+
+     def __init__(
+         self,
+         strand=0,
+         min_base_position=0,
+         max_base_position=inf,
+         min_base_quality=0,
+     ):
+         """
+         Create a new CompiledReads object.
+
+         Parameters:
+             strand (int): Strand detection mode
+             min_base_position (int): Left trims bases
+             max_base_position (int): Right trims bases
+             min_base_quality (int): Minimum base quality to report
+         """
+         self._nucleotides = {}
+         if strand == 0:
+             self.get_strand = lambda read: read.is_reverse
+         else:
+             self.get_strand = self._get_strand
+
+         self._strand_one = strand == 1
+         self._ref = None
+         self._ref_seq = self._get_ref_from_read
+
+         self._qc = {
+             'min_base_quality': min_base_quality,
+             'min_base_position': min_base_position,
+             'max_base_position': max_base_position,
+         }
+
+     def add_reference(self, ref):
+         """
+         Add a reference FASTA file to use.
+
+         Parameters:
+             ref (RTFastaFile): Reference sequence
+         """
+         self._ref = ref
+         self._ref_seq = self._get_ref_from_fasta
+
+     def add_reads(self, reads):
+         """
+         Add an iterable of pysam reads to the object.
+
+         The reads are broken down into individual nucleotides that are
+         tracked by chromosomal location.
+
+         Parameters:
+             reads (iterable): pysam reads
+         """
+         for read in reads:
+             strand = self._strands[self.get_strand(read)]
+             for pos, base, quality, ref in self._prep_read(read):
+                 try:
+                     self._nucleotides[pos].add_base(quality, strand, base)
+                 except KeyError:
+                     self._nucleotides[pos] = CompiledPosition(
+                         ref=ref,
+                         position=pos,
+                         contig=read.reference_name,
+                     )
+                     self._nucleotides[pos].add_base(quality, strand, base)
+
+     def pop(self, position):
+         """
+         Remove and return the CompiledPosition at position.
+
+         Method returns None if the position is empty.
+
+         Parameters:
+             position (int): The chromosomal location to pop
+
+         Returns:
+             A CompiledPosition or None if position is empty.
+         """
+         return self._nucleotides.pop(position, None)
+
+     def is_empty(self):
+         """
+         Determine if there are any CompiledPositions still in the object.
+
+         Returns:
+             True if the object is empty, else False
+         """
+         return not self._nucleotides
+
+     def _get_ref_from_read(self, read):
+         return list(read.get_reference_sequence().upper())
+
+     def _get_ref_from_fasta(self, read):
+         pairs = read.get_aligned_pairs(matches_only=True)
+         indices = [ref for _, ref in pairs]
+         return self._ref.get_base(read.reference_name, *indices)
+
+     def _qc_base_position(self, read, position):
+         return read.query_length - position >= self._qc['max_base_position']
+
+     def _prep_read(self, read):
+         pairs = read.get_aligned_pairs(matches_only=True)
+         seq = read.query_sequence.upper()
+         qualities = read.query_qualities
+         ref_seq = self._ref_seq(read)
+         while pairs and pairs[0][0] < self._qc['min_base_position']:
+             pairs.pop(0)
+             ref_seq.pop(0)
+         if not pairs:
+             return
+
+         while pairs and self._qc_base_position(read, pairs[0][0]):
+             offset, ref_pos = pairs.pop(0)
+             ref_base = ref_seq.pop(0)
+             if ref_base != 'N' != seq[offset]:
+                 if qualities[offset] >= self._qc['min_base_quality']:
+                     yield (ref_pos, seq[offset], qualities[offset], ref_base)
+
+     def _get_strand(self, read):
+         return read.is_read2 ^ self._strand_one ^ read.is_reverse
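
For orientation, a rough sketch of how this manager might be driven with pysam. The import paths for CompiledReads and RTFastaFile are guesses (the file names are not shown in this diff), and the BAM/FASTA paths are hypothetical.

    from pysam import AlignmentFile

    from reditools.compiled_reads import CompiledReads   # import path is a guess
    from reditools.fasta_file import RTFastaFile         # import path is a guess

    bam = AlignmentFile('sample.bam', 'rb')              # hypothetical input BAM
    compiler = CompiledReads(strand=1, min_base_quality=20)
    compiler.add_reference(RTFastaFile('genome.fa'))     # optional; otherwise the read's own reference sequence is used

    compiler.add_reads(bam.fetch('chr1', 0, 10000))
    for pos in range(10000):
        compiled = compiler.pop(pos)    # CompiledPosition, or None if nothing aligned here
        if compiled is not None:
            ...                         # inspect per-strand base counts via the CompiledPosition API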
@@ -0,0 +1,68 @@
+ """Wrappers for PysamFastaFile."""
+
+ from pysam.libcfaidx import FastaFile as PysamFastaFile
+
+
+ class RTFastaFile(PysamFastaFile):
+     """Wrapper for pysam.FastaFile to provide a sequence cache."""
+
+     def __new__(cls, *args, **kwargs):
+         r"""
+         Create a wrapper for pysam.FastaFile.
+
+         Parameters:
+             *args (list): positional arguments for PysamFastaFile constructor
+             **kwargs (dict): named arguments for PysamFastaFile constructor
+
+         Returns:
+             PysamFastaFile
+         """
+         return PysamFastaFile.__new__(cls, *args, **kwargs)
+
+     def __init__(self, *args, **kwargs):
+         r"""
+         Create a wrapper for pysam.FastaFile.
+
+         Parameters:
+             *args (list): positional arguments for PysamFastaFile constructor
+             **kwargs (dict): named arguments for PysamFastaFile constructor
+         """
+         PysamFastaFile.__init__(self)
+
+         self._contig_name = False
+         self._contig_cache = None
+
+     def get_base(self, contig, *position):
+         """
+         Retrieve the base at the given position.
+
+         Parameters:
+             contig (string): Chromosome name
+             position (int): Zero-indexed position on reference
+
+         Returns:
+             Base at the position as a string.
+
+         Raises:
+             IndexError: The position is not within the contig
+         """
+         if contig != self._contig_name:
+             self._update_contig_cache(contig)
+         try:
+             if len(position) == 1:
+                 return self._contig_cache[position[0]]
+             return [self._contig_cache[idx] for idx in position]
+         except IndexError as exc:
+             raise IndexError(
+                 f'Base position {position} is outside the bounds of ' +
+                 f'{contig}. Are you using the correct reference?',
+             ) from exc
+
+     def _update_contig_cache(self, contig):
+         keys = (contig, f'chr{contig}', contig.replace('chr', ''))
+         for ref in keys:
+             if ref in self:
+                 self._contig_cache = self.fetch(ref).upper()
+                 self._contig_name = contig
+                 return
+         raise KeyError(f'Reference name {contig} not found in FASTA file.')
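
For orientation, a short sketch of get_base. The import path is a guess (this file's name is not shown in the diff) and genome.fa stands in for an indexed FASTA.

    from reditools.fasta_file import RTFastaFile   # import path is a guess

    ref = RTFastaFile('genome.fa')

    # A single position returns one base; the contig cache means repeated
    # queries against the same chromosome avoid re-fetching its sequence.
    base = ref.get_base('chr1', 0)

    # Several positions return a list of bases in the same order. Contig names
    # are matched with or without a 'chr' prefix by _update_contig_cache.
    bases = ref.get_base('1', 100, 101, 102)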
@@ -0,0 +1,132 @@
+ """Miscellaneous utility functions."""
+
+ import csv
+ import os
+ from collections import defaultdict
+ from gzip import open as gzip_open
+
+ from sortedcontainers import SortedSet
+
+ from reditools.region import Region
+
+
+ def open_stream(path, mode='rt', encoding='utf-8'):
+     """
+     Open an input or output stream from a file, accounting for gzip.
+
+     Parameters:
+         path (str): Path to file for reading or writing
+         mode (str): File mode
+         encoding (str): File encoding
+
+     Returns:
+         TextIOWrapper to the file
+     """
+     if path.endswith('gz'):
+         return gzip_open(path, mode, encoding=encoding)
+     return open(path, mode, encoding=encoding)  # noqa:WPS515
+
+
+ def read_bed_file(path):
+     """
+     Return an iterator for a BED file.
+
+     Parameters:
+         path (str): Path to a BED file for reading.
+
+     Yields:
+         BED file contents as Regions.
+     """
+     stream = open_stream(path)
+     reader = csv.reader(
+         filter(lambda row: row[0] != '#', stream),
+         delimiter='\t',
+     )
+     yield from (Region(
+         contig=row[0],
+         start=row[1],
+         stop=row[2],
+     ) for row in reader)
+
+
+ def concat(output, *fnames, clean_up=True, encoding='utf-8'):
+     """
+     Combine one or more files into another file.
+
+     Parameters:
+         output (file): A file-like object for writing
+         *fnames (string): Paths to files for concatenation
+         clean_up (bool): If True, deletes the files after concatenation
+         encoding (string): File encoding
+     """
+     for fname in fnames:
+         with open(fname, 'r', encoding=encoding) as stream:
+             for line in stream:
+                 output.write(line)
+         if clean_up:
+             os.remove(fname)
+
+
+ def load_poly_regions(fname):
+     """
+     Read omopolymeric positions from a file.
+
+     Parameters:
+         fname (str): File path
+
+     Returns:
+         (dict): Contigs and regions
+     """
+     poly_regions = defaultdict(set)
+     for region in read_bed_file(fname):
+         poly_regions[region.contig].add(region)
+     return poly_regions
+
+
+ def load_splicing_file(splicing_file, span):
+     """
+     Read splicing positions from a file.
+
+     Parameters:
+         splicing_file (str): File path
+         span (int): Width of splice sites
+
+     Returns:
+         (dict): Contig and positions
+     """
+     splice_positions = defaultdict(SortedSet)
+     strand_map = {'-': 'D', '+': 'A'}
+
+     with open_stream(splicing_file, 'r') as stream:
+         for line in stream:
+             fields = line.strip().split()
+
+             chrom = fields[0]
+             strand = fields[4]
+             splice = fields[3]
+             coord = int(fields[1])
+
+             coe = -1 if strand_map.get(strand, None) == splice else 1
+             new_positions = [1 + coord + coe * fctr for fctr in range(span)]
+             splice_positions[chrom] |= new_positions
+     return splice_positions
+
+
+ def load_text_file(file_name):
+     """
+     Extract file contents to a list.
+
+     Parameters:
+         file_name (str): The file to open.
+
+     Returns:
+         List of content
+     """
+     with open_stream(file_name, 'r') as stream:
+         return [line.strip() for line in stream]
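
For orientation, a short usage sketch of the two readers above. The file names are hypothetical, and Region is assumed to expose the contig, start, and stop values it is constructed with.

    from reditools import file_utils

    # open_stream picks gzip or plain text based on the file extension.
    with file_utils.open_stream('edits.txt.gz') as stream:
        first_line = next(stream)

    # read_bed_file skips '#' comment lines and yields Region objects
    # built from the first three BED columns.
    for region in file_utils.read_bed_file('targets.bed'):
        print(region.contig, region.start, region.stop)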
@@ -0,0 +1,92 @@
+ """Repeat Sequence Identifier."""
+ import argparse
+ import sys
+
+ from pysam import FastaFile
+
+ from reditools import file_utils
+
+
+ def find_homo_seqs(seq, length=5):
+     """
+     Locate regions of repeated bases.
+
+     Parameters:
+         seq (str): The DNA sequence
+         length (int): Minimum number of sequential repeats.
+
+     Yields:
+         start, stop, base
+     """
+     h_base = None
+     start = 0
+     count = 0
+
+     for pos, base in enumerate(seq):
+         if base == h_base:
+             count += 1
+         else:
+             if count >= length:
+                 yield (start, start + count, h_base)
+             count = 0
+             start = pos
+             h_base = base
+     if count >= length:
+         yield (start, start + count, h_base)
+
+
+ def parse_options():
+     """
+     Parse commandline arguments.
+
+     Returns:
+         namespace
+     """
+     parser = argparse.ArgumentParser(description='REDItools 2.0')
+     parser.add_argument(
+         'file',
+         help='The fasta file to be analyzed',
+     )
+     parser.add_argument(
+         '-l',
+         '--min-length',
+         type=int,
+         default=5,
+         help='Minimum length of repeat region',
+     )
+     parser.add_argument(
+         '-o',
+         '--output',
+         help='Destination to write results. Default is to use STDOUT. ' +
+         'If the filename ends in .gz, the contents will be gzipped.',
+     )
+
+     return parser.parse_args()
+
+
+ def main():
+     """Report repetitive regions."""
+     options = parse_options()
+     fasta = FastaFile(options.file)
+
+     if options.output:
+         stream = file_utils.open_stream(
+             options.output,
+             'wt',
+             encoding='utf-8',
+         )
+     else:
+         stream = sys.stdout
+
+     for seq_name in fasta.references:
+         seq = fasta.fetch(seq_name)
+         for region in find_homo_seqs(seq, options.min_length):
+             fields = [
+                 seq_name,
+                 region[0],
+                 region[1],
+                 region[1] - region[0],
+                 region[2],
+             ]
+             as_str = [str(_) for _ in fields]
+             stream.write('\t'.join(as_str) + '\n')
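
A quick sanity check of find_homo_seqs as written. The import path below is a guess, since this file's name is not shown in the diff.

    from reditools.find_repeats import find_homo_seqs  # import path is a guess

    print(list(find_homo_seqs('ACGTAAAAAACGT', length=5)))
    # The six-base run of As starting at index 4 is reported: [(4, 9, 'A')]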
reditools/index.py ADDED
@@ -0,0 +1,268 @@
+ """Commandline tool for REDItools."""
+
+ import argparse
+ import csv
+ import sys
+ from itertools import permutations
+ from json import loads as load_json
+
+ from reditools.file_utils import open_stream, read_bed_file
+ from reditools.region import Region
+
+ _ref = 'Reference'
+ _position = 'Position'
+ _contig = 'Region'
+ _count = 'BaseCount[A,C,G,T]'
+ _strand = 'Strand'
+ _nucs = 'ACGT'
+ _ref_set = {f'{nuc}-{nuc}' for nuc in _nucs}
+
+
+ class Index(object):
+     """Utility for calculating editing indices."""
+
+     def __init__(self, region=None, strand=0):
+         """
+         Create a new Index.
+
+         Parameters:
+             region (Region): Limit results to the given genomic region
+             strand (int): Either 0, 1, or 2 for unstranded, reverse, or forward
+         """
+         self.targets = {}
+         self.exclusions = {}
+         self.counts = {'-'.join(_): 0 for _ in permutations(_nucs, 2)}
+         self.region = region
+         self.strand = ['*', '-', '+'][strand]
+
+     def add_target_from_bed(self, fname):
+         """
+         Only report index data for regions from a given BED file.
+
+         Parameters:
+             fname (str): Path to BED formatted file.
+         """
+         for region in read_bed_file(fname):
+             self.targets[region.contig] = update_region_dict(
+                 self.targets,
+                 region,
+             )
+
+     def add_exclusions_from_bed(self, fname):
+         """
+         Exclude index data for regions from a given BED file.
+
+         Parameters:
+             fname (str): Path to BED formatted file.
+         """
+         for region in read_bed_file(fname):
+             self.exclusions[region.contig] = update_region_dict(
+                 self.exclusions,
+                 region,
+             )
+
+     def in_region_list(self, region_list, contig, position):
+         """
+         Check if a genomic position is in a list of regions.
+
+         Parameters:
+             region_list (dict): Region list to check
+             contig (str): Contig/Chromosome name
+             position (int): Coordinate
+
+         Returns:
+             True if the position is present, else False
+         """
+         return position in region_list.get(contig, [])
+
+     def in_targets(self, contig, position):
+         """
+         Check if a genomic position is in the target list.
+
+         Parameters:
+             contig (str): Contig/Chromosome name
+             position (int): Coordinate
+
+         Returns:
+             True if there are no targets or the position is in the target
+             list; else False
+         """
+         return not self.targets or self.in_region_list(
+             self.targets,
+             contig,
+             position,
+         )
+
+     def in_exclusions(self, contig, position):
+         """
+         Check if a genomic position is in the exclusions list.
+
+         Parameters:
+             contig (str): Contig/Chromosome name
+             position (int): Coordinate
+
+         Returns:
+             True if there are exclusions and the position is in the
+             exclusions list; else False
+         """
+         return self.exclusions and self.in_region_list(
+             self.exclusions,
+             contig,
+             position,
+         )
+
+     def do_ignore(self, row):
+         """
+         Check whether a row should be ignored.
+
+         Parameters:
+             row (dict): Row from a REDItools output file.
+
+         Returns:
+             True if the row should be discarded; else False
+         """
+         if '*' != self.strand != row[_strand]:
+             return True
+         if self.region:
+             if not self.region.contains(row[_contig], row[_position]):
+                 return True
+         if self.in_exclusions(row[_contig], row[_position]):
+             return True
+         return not self.in_targets(row[_contig], row[_position])
+
+     def add_rt_output(self, fname):
+         """
+         Count the number of reads with matches and substitutions.
+
+         Parameters:
+             fname (str): File path to a REDItools output
+         """
+         stream = open_stream(fname)
+         reader = csv.DictReader(stream, delimiter='\t')
+         for row in reader:
+             if self.do_ignore(row):
+                 continue
+             ref = row[_ref]
+             reads = load_json(row[_count])
+             for nuc, count in zip(_nucs, reads):
+                 key = f'{nuc}-{ref}'
+                 self.counts[key] = self.counts.get(key, 0) + count
+         stream.close()
+
+     def calc_index(self):
+         """
+         Compute all editing indices.
+
+         Returns:
+             Dictionary of indices
+         """
+         keys = set(self.counts) - _ref_set
+         indices = {}
+         for idx in keys:
+             ref = idx[-1]
+             numerator = self.counts[idx]
+             denominator = self.counts.get(self.ref_edit(ref), 0) + numerator
+             if denominator == 0:
+                 indices[idx] = 0
+             else:
+                 indices[idx] = numerator / denominator
+         return indices
+
+     def ref_edit(self, ref):
+         """
+         Format a base as a non-edit.
+
+         Parameters:
+             ref (str): Reference base
+
+         Returns:
+             A string in the format of {ref}-{ref}
+         """
+         return f'{ref}-{ref}'
+
+
+ def parse_options():  # noqa:WPS213
+     """
+     Parse commandline options for REDItools.
+
+     Returns:
+         namespace: commandline args
+     """
+     parser = argparse.ArgumentParser(description='REDItools 2.0')
+     parser.add_argument(
+         'file',
+         nargs='+',
+         help='The REDItools output file to be analyzed',
+     )
+     parser.add_argument(
+         '-o',
+         '--output-file',
+         help='The output statistics file',
+     )
+     parser.add_argument(
+         '-s',
+         '--strand',
+         choices=(0, 1, 2),
+         type=int,
+         default=0,
+         help='Strand: this can be 0 (unstranded), ' +
+         '1 (secondstrand oriented) or ' +
+         '2 (firststrand oriented)',
+     )
+     parser.add_argument(
+         '-g',
+         '--region',
+         help='The genomic region to be analyzed',
+     )
+     parser.add_argument(
+         '-B',
+         '--bed_file',
+         nargs='+',
+         help='Path of BED file containing target regions',
+     )
+     parser.add_argument(
+         '-k',
+         '--exclude_regions',
+         nargs='+',
+         help='Path of BED file containing regions to exclude from analysis',
+     )
+
+     return parser.parse_args()
+
+
+ def main():
+     """Perform RNA editing analysis."""
+     options = parse_options()
+     if options.region:
+         indexer = Index(Region(string=options.region), strand=options.strand)
+     else:
+         indexer = Index(strand=options.strand)
+
+     if options.exclude_regions:
+         for exc_fname in options.exclude_regions:
+             indexer.add_exclusions_from_bed(exc_fname)
+
+     if options.bed_file:
+         for trg_fname in options.bed_file:
+             indexer.add_target_from_bed(trg_fname)
+
+     if options.output_file:
+         stream = open_stream(options.output_file, 'w')
+     else:
+         stream = sys.stdout
+
+     for fname in options.file:
+         indexer.add_rt_output(fname)
+
+     for nuc, idx in sorted(indexer.calc_index().items()):
+         stream.write(f'{nuc}\t{idx}\n')
+
+
+ def update_region_dict(region_dict, region):
+     """
+     Add a region to a region dictionary.
+
+     Parameters:
+         region_dict (dict): Region dictionary
+         region (Region): Region to add
+
+     Returns:
+         The updated set of positions for the region's contig
+     """
+     return region_dict.get(region.contig, set()) | region.enumerate()
+
+
+ if __name__ == '__main__':
+     main()
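
For orientation, a minimal sketch of using the Index class outside the CLI. The file names are hypothetical; as the key construction in add_rt_output suggests, count keys are observed-reference pairs, so A-to-G editing appears under 'G-A'.

    from reditools.index import Index

    indexer = Index(strand=0)
    indexer.add_exclusions_from_bed('known_snps.bed')   # optional exclusion regions
    indexer.add_rt_output('sample1.reditools.tsv.gz')   # one or more REDItools output tables

    for substitution, value in sorted(indexer.calc_index().items()):
        print(f'{substitution}\t{value}')

Since the module ships as reditools/index.py with a __main__ guard, the same analysis should also be runnable as: python -m reditools.index sample1.reditools.tsv.gz -s 0 -k known_snps.bed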