PyPI - ourotools - Versions diffs - 0.2.0__py3-none-any.whl - Mend

ourotools 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

ourotools/__init__.py +10 -0
ourotools/__main__.py +10 -0
ourotools/core/BA.py +225 -0
ourotools/core/MAP.py +38 -0
ourotools/core/ONT.py +174 -0
ourotools/core/OT.py +125 -0
ourotools/core/SAM.py +588 -0
ourotools/core/SC.py +647 -0
ourotools/core/SEQ.py +99 -0
ourotools/core/STR.py +398 -0
ourotools/core/__init__.py +3 -0
ourotools/core/alternative_splicing_analysis.py +903 -0
ourotools/core/biobookshelf.py +2538 -0
ourotools/core/core.py +17252 -0
ourotools-0.2.0.dist-info/LICENSE +21 -0
ourotools-0.2.0.dist-info/METADATA +545 -0
ourotools-0.2.0.dist-info/RECORD +20 -0
ourotools-0.2.0.dist-info/WHEEL +5 -0
ourotools-0.2.0.dist-info/entry_points.txt +2 -0
ourotools-0.2.0.dist-info/top_level.txt +1 -0

ourotools/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+# import top-level functions
+from .__main__ import *
+from ourotools.core import *
+# Version of the ourotools package
+__version__ = "0.2.0"
+# names exported by `from ourotools import *`
+__all__ = ["core"]

ourotools/__main__.py ADDED Viewed

@@ -0,0 +1,10 @@
+#!/usr/bin/env python
+# the date of last modification (NOTE(review): no date is actually recorded here)
+from ourotools.core import *
+import warnings
+# install a global "ignore" filter: with no category given, it applies to every warning
+warnings.filterwarnings(action="ignore")
+# running this module as a script performs no additional action
+if __name__ == "__main__":
+    pass

ourotools/core/BA.py ADDED Viewed

@@ -0,0 +1,225 @@
+from bitarray import bitarray
+import numpy as np
+def Find(ba, val=1):
+    """deprecated
+    # 2022-06-12 21:09:22
+    search the given value in the bitarray iteratively and return the start position of the occurrences
+    """
+    len_ba = len(ba)
+    l_int_pos_occurrence = []
+    int_pos_start = 0
+    while int_pos_start < len_ba:
+        int_pos_occurrence = ba.find(val, int_pos_start + 1)
+        if int_pos_occurrence < 0:
+            break
+        else:
+            l_int_pos_occurrence.append(int_pos_occurrence)
+            int_pos_start = int_pos_occurrence
+    return l_int_pos_occurrence
+def find(ba, val=1):
+    """# 2022-07-03 21:41:36
+    generator that returns the location of 'val' from the start of the bitarray.
+    Use this function for memory-efficient iteration of position of active entries in a bitarray (np.int64 takes 64 times more memory than an entry in bitarray, which is 1 bit).
+    """
+    len_ba = len(ba)
+    if len(ba) == 0:  # handle empty bitarray input
+        return
+    int_pos_start = 0
+    if ba[0] == val:  # handle the case when the first bit is equal to 'val'
+        yield int_pos_start
+    while int_pos_start < len_ba:
+        int_pos_occurrence = ba.find(val, int_pos_start + 1)
+        if int_pos_occurrence < 0:
+            break
+        else:
+            yield int_pos_occurrence
+            int_pos_start = int_pos_occurrence
+# def Find_Segment( ba, background = 1, flag_use_bitwise_operation = True ) :
+#     ''' # 2022-06-12 19:36:38
+#     find segment of a given bitarray for the given background 'background'. for example, when background = 1, find segment of 0
+#     'flag_use_bitwise_operation' : use bitwise operation. useful when the length of bitarray is very long
+#     '''
+#     len_ba = len( ba )
+#     if flag_use_bitwise_operation :
+#         ''' the implementation using bitwise operation '''
+#         mask = ( ba ^ ba >> 1 ) # find boundary of value change
+#         l_int_pos = Find( mask, 1 )
+#         flag_start_is_a_segment = ba[ 0 ] != background
+#         if flag_start_is_a_segment :
+#             l_int_pos = [ 0 ] + l_int_pos
+#         if len( l_int_pos ) % 2 != 0 :
+#             l_int_pos += [ len_ba ]
+#         return np.array( l_int_pos ).reshape( ( int( len( l_int_pos ) / 2 ), 2 ) )
+#     else :
+#         ''' the implementation using simple iteration '''
+#         int_pos = 0
+#         int_seg_start = None
+#         l_seg = [ ]
+#         while int_pos < len_ba :
+#             if int_seg_start is None and ba[ int_pos ] != background :
+#                 int_seg_start = int_pos
+#             elif int_seg_start is not None and ba[ int_pos ] == background :
+#                 l_seg.append( [ int_seg_start, int_pos ] )
+#                 int_seg_start = None
+#             int_pos += 1
+#         if int_seg_start is not None :
+#             l_seg.append( [ int_seg_start, len_ba ] )
+#         return l_seg
+def Find_Segment(ba, background=1):
+    """# 2022-06-12 19:36:38
+    find segment of a given bitarray for the given background 'background'. for example, when background = 1, find segment of 0
+    # 2022-06-22 18:36:45
+    updated to a new implementation that does not require generating a copy of the bitarray.
+    """
+    def toggle_bit(bit):
+        return (bit + 1) % 2  # toggle the bit
+    # initialize
+    l_int_pos = []
+    state_of_interest = toggle_bit(background)  # look for a non-background state
+    int_pos = 0
+    while True:
+        int_pos = ba.find(state_of_interest, int_pos)
+        if int_pos < 0:
+            break
+        l_int_pos.append(int_pos)
+        state_of_interest = toggle_bit(
+            state_of_interest
+        )  # toggle the state of interest
+    if len(l_int_pos) % 2 != 0:
+        l_int_pos.append(len(ba))
+    return np.array(l_int_pos).reshape(
+        (int(len(l_int_pos) / 2), 2)
+    )  # reshape a list of int_pos to list of segments
+def Retrieve_Integer_Indices(ba, background=0):
+    """deprecated (slow)
+    # 2022-06-23 08:31:45
+    returns a list of integer indices of non-background bits in a given bitarray 'ba'
+    'background' : a bit representing background. either 0 or 1
+    """
+    # compose a list of integer indices of active rows after applying filter
+    l = []
+    for st, en in Find_Segment(
+        ba, background=background
+    ):  # retrieve active segments from bitarray filter
+        l.extend(range(st, en))  # retrieve integer indices of the active rows
+    return l
+def to_array(ba):
+    """# 2022-06-24 23:54:12
+    return a boolean numpy array of the given bitarray"""
+    return np.frombuffer(ba.unpack(), dtype=bool)
+def to_bitarray(arr_bool):
+    """# 2022-06-27 15:29:56
+    convert numpy boolean array to bitarray
+    """
+    ba = bitarray()
+    ba.pack(arr_bool.tobytes())
+    return ba
+def from_integer_indices_to_bitarray(l_int_index, length):
+    """# 2022-07-01 11:21:12
+    convert list of integer indices to bitarray
+    'length' : length of the output bitarray object
+    """
+    ba = bitarray(length)
+    ba.setall(0)
+    for int_index in l_int_index:
+        ba[int_index] = True
+    return ba
+def to_integer_indices(ba):
+    """# 2022-07-01 21:38:17
+    retrieve integer indices of the active entries of the given bitarray
+    'ba' : input bitarray object
+    """
+    return np.where(to_array(ba))[0]
+def COUNTER(l_values, dict_counter=None, ignore_float=True):  # 2020-07-29 23:49:51
+    """Count values in l_values and return a dictionary containing count values. if 'dict_counter' is given, countinue counting by using the 'dict_counter'. if 'ignore_float' is True, ignore float values, including np.nan"""
+    if dict_counter is None:
+        dict_counter = dict()
+    if ignore_float:  # if 'ignore_float' is True, ignore float values, including np.nan
+        for value in l_values:
+            if isinstance(value, float):
+                continue  # ignore float values
+            if value in dict_counter:
+                dict_counter[value] += 1
+            else:
+                dict_counter[value] = 1
+    else:  # faster counting by not checking type of value
+        for value in l_values:
+            if value in dict_counter:
+                dict_counter[value] += 1
+            else:
+                dict_counter[value] = 1
+    return dict_counter
+def detect_boolean_mask(ba):
+    """# 2022-08-10 23:29:06
+    detect boolean mask by looking up to 10 values
+    """
+    # extract the first row from the ndarray data
+    if isinstance(ba, np.ndarray) and len(ba.shape) > 1:
+        for i in range(len(ba.shape) - 1):
+            ba = ba[0]
+    if not hasattr(ba, "__iter__"):  # 'ba' should be iterable to be a boolean mask
+        return False
+    ba = ba[:10]
+    # if the length of array of interest is <= 2 and only consists of 0, 1, consider it as an integer array, not boolean array
+    if len(ba) <= 2 and set(COUNTER(ba[:10])).issubset({0, 1}):
+        return False
+    return set(COUNTER(ba[:10])).issubset({0, 1, True, False})
+def convert_mask_to_bitarray(ba):
+    """# 2022-07-03 00:54:46
+    convert boolean mask to a bitarray
+    """
+    # handle when a list type has been given (convert it to np.ndarray)
+    if isinstance(ba, list):
+        ba = np.array(ba, dtype=bool)
+    # handle when a numpy ndarray has been given (convert it to bitarray)
+    if isinstance(ba, np.ndarray):
+        ba = to_bitarray(ba)
+    assert isinstance(ba, bitarray)  # check the return value is bitarray object
+    return ba
+def convert_mask_to_array(ba):
+    """# 2022-07-03 00:54:46
+    convert boolean mask to a array
+    """
+    """ handle non-bitarray mask types """
+    # handle when a list type has been given (convert it to np.ndarray)
+    if isinstance(ba, list):
+        ba = np.array(ba, dtype=bool)
+    # handle when a numpy ndarray has been given (convert it to np.ndarray)
+    if isinstance(ba, bitarray):
+        ba = to_array(ba)
+    assert isinstance(ba, np.ndarray)  # check the return value is np.ndarray object
+    return ba

ourotools/core/MAP.py ADDED Viewed

@@ -0,0 +1,38 @@
+import numpy as np
+# In[ ]:
+# module-level mapping dictionary, created empty; nothing in the visible part of this module fills it
+# (the 'Map' class uses the dictionary handed to its constructor, not this one)
+dict_a2b = dict()  # an empty dictionary for mapping
+# In[ ]:
+def Remove_version_info(ID):
+    return ID.split(".")[0]
+# In[ ]:
+def Retrive_Length(entry):
+    return len(entry)
+class Map(object):
+    def __init__(self, dict_a2b):
+        self.dict_a2b = dict_a2b
+    def a2b(self, a):
+        if a in self.dict_a2b:
+            return self.dict_a2b[a]
+        else:
+            return np.nan
+    def a2b_if_mapping_available_else_Map_a2a(self, a):
+        if a in self.dict_a2b:
+            return self.dict_a2b[a]
+        else:
+            return a

ourotools/core/ONT.py ADDED Viewed

@@ -0,0 +1,174 @@
+# load internal module
+from .biobookshelf import *
+from . import biobookshelf as bk
+from typing import Union, List, Literal, Dict, Callable, Set, Iterable, Tuple
+def Minimap2_Align(
+    path_file_fastq,
+    path_file_minimap2_index="/node210data/shared/ensembl/Mus_musculus/index/minimap2/Mus_musculus.GRCm38.dna.primary_assembly.k_14.idx",
+    path_folder_minimap2_output=None,
+    n_threads=20,
+    verbose=True,
+    drop_unaligned=False,
+    return_bash_shellscript=False,
+    n_threads_for_sort=10,
+    flag_use_split_prefix: bool = False,
+    path_file_junc_bed: Union[
+        None, str
+    ] = None,  # if given, the bed file will be used for prioritizing known splice sites.
+    path_file_gtf: Union[
+        None, str
+    ] = None,  # path to gene and exon annotation files, required if 'path_file_junc_bed' is given but the file does not exist
+):
+    """
+    # 2023-04-23 01:18:58
+    align given fastq file of nanopore reads using minimap2 and write an output as a bam file
+    'path_file_fastq' : input fastq or fasta file (gzipped or uncompressed file is accepted)
+    'path_file_minimap2_index' : minimap2 index file
+    'path_folder_minimap2_output' : minimap2 output folder
+    'drop_unaligned' : a flag indicating whether reads not aligned to the reference ('SAM flag == 4') are included in the output bam file
+    'return_bash_shellscript' : return shellscript instead of running minimap2 using the subprocess module
+    'flag_use_split_prefix' = False # for large index, split-prefix should be used
+    """
+    path_folder_fastq, name_file_fastq = path_file_fastq.rsplit("/", 1)
+    if (
+        path_folder_minimap2_output is None
+    ):  # default output folder is a subdirectory of the folder containing the input fastq file
+        path_folder_minimap2_output = f"{path_folder_fastq}/minimap2/"
+    if (
+        path_folder_minimap2_output[-1] != "/"
+    ):  # add '/' at the end of the output directory if it does not exist
+        path_folder_minimap2_output += "/"
+    os.makedirs(
+        path_folder_minimap2_output, exist_ok=True
+    )  # create folder if it does not exist
+    path_file_sam = (
+        f"{path_folder_minimap2_output}{name_file_fastq}.minimap2_aligned.sam"
+    )
+    path_file_bam = (
+        f"{path_folder_minimap2_output}{name_file_fastq}.minimap2_aligned.bam"
+    )
+    # if index file of the output BAM file exists, exit
+    if os.path.exists( f"{path_file_bam}.bai" ) :
+        return
+    l_bash_shellscript = []
+    """ perform minimap2 alignment """
+    l_arg = [
+        "minimap2",
+        "-t",
+        str(int(n_threads)),
+        "-ax",
+        "splice",
+        "-o",
+        path_file_sam,
+    ]
+    # for large index, split-prefix should be used
+    if flag_use_split_prefix:
+        l_arg += [f"--split-prefix={path_folder_minimap2_output}{UUID( )}"]
+    if path_file_junc_bed is not None:
+        if (
+            not os.path.exists(path_file_junc_bed) and path_file_gtf is not None
+        ):  # if the bed file does not exist, create the bed file using paftools.js, packaged with the minimap2 executable
+            l_args_for_creating_junc_bed = ["paftools.js", "gff2bed", path_file_gtf]
+            if (
+                return_bash_shellscript
+            ):  # perform minimap2 alignment using subprocess module
+                l_bash_shellscript.append(
+                    " ".join(l_args_for_creating_junc_bed + [">", path_file_junc_bed])
+                )
+            else:
+                bk.OS_Run(
+                    l_args_for_creating_junc_bed,
+                    path_file_stdout=path_file_junc_bed,
+                    stdout_binary=False,
+                )
+        if os.path.exists(path_file_junc_bed):
+            l_arg += ["--junc-bed", path_file_junc_bed]
+    if drop_unaligned:
+        l_arg += ["--sam-hit-only"]
+    l_arg += [path_file_minimap2_index, path_file_fastq]
+    if return_bash_shellscript:  # perform minimap2 alignment using subprocess module
+        l_bash_shellscript.append(" ".join(l_arg))
+    else:
+        run = subprocess.run(l_arg, capture_output=True)
+        with open(
+            f"{path_folder_minimap2_output}{name_file_fastq}.minimap2_aligned.out", "w"
+        ) as file:
+            file.write(run.stdout.decode())
+        if verbose:
+            print("minimap2 completed")
+    """ sort output SAM file """
+    l_arg = [
+        "samtools",
+        "sort",
+        "-@",
+        str(int(min(n_threads_for_sort, 10))),
+        "-O",
+        "BAM",
+        "-o",
+        path_file_bam,
+        path_file_sam,
+    ]
+    if return_bash_shellscript:  # perform minimap2 alignment using subprocess module
+        l_bash_shellscript.append(" ".join(l_arg))
+        l_bash_shellscript.append(" ".join(["rm", "-f", path_file_sam]))
+    else:
+        run = subprocess.run(l_arg, capture_output=False)
+        os.remove(path_file_sam)  # remove sam file
+    """ index resulting BAM file """
+    l_arg = ["samtools", "index", path_file_bam]
+    if return_bash_shellscript:  # perform minimap2 alignment using subprocess module
+        l_bash_shellscript.append(" ".join(l_arg))
+    else:
+        run = subprocess.run(l_arg, capture_output=False)
+        if verbose:
+            print("samtools bam file compressing and indexing completed")
+    if return_bash_shellscript:  # retrun bash shell scripts
+        return " && ".join(l_bash_shellscript)
+def Minimap2_Index(path_file_fasta, path_file_minimap2_index=None, verbose=False):
+    """
+    # 2021-03-24 00:44:51
+    index given fasta file for nanopore reads alignment
+    'path_file_fasta' : input reference fasta file
+    'path_file_minimap2_index' : minimap2 index file
+    """
+    path_folder_fastq, name_file_fasta = path_file_fasta.rsplit("/", 1)
+    if (
+        path_file_minimap2_index is None
+    ):  # set the default directory of the minimap index
+        path_file_minimap2_index = (
+            f"{path_folder_fastq}/index/minimap2/{name_file_fasta}.ont.mmi"
+        )
+    path_folder_minimap2_index, name_file_index = path_file_minimap2_index.rsplit(
+        "/", 1
+    )
+    path_folder_minimap2_index += "/"
+    os.makedirs(
+        path_folder_minimap2_index, exist_ok=True
+    )  # create folder if it does not exist
+    if os.path.exists(path_file_minimap2_index):  # exit if an index file already exists
+        return
+    # build minimap2 index
+    run = subprocess.run(
+        ["minimap2", "-x", "map-ont", "-d", path_file_minimap2_index, path_file_fasta],
+        capture_output=True,
+    )
+    with open(
+        f"{path_folder_minimap2_index}{name_file_index}.minimap2_index.out", "w"
+    ) as file:
+        file.write(run.stdout.decode())
+    if verbose:
+        print("minimap2 indexing completed")

ourotools/core/OT.py ADDED Viewed

@@ -0,0 +1,125 @@
+from . import biobookshelf as bk
+"""
+Implementing Ontology functions
+"""
+import owlready2 as ol
+class OntologyTerms :
+    def __init__( self, path_file_owl : str, name_prefix : str, name_root_term : str ) :
+        """
+        load an ontology file, given as 'path_file_owl'
+        name_prefix : str # prefix of the name
+        name_root_term : str # name of the root term
+        # 2024-02-29 22:50:55
+        """
+        self.path_file_owl = path_file_owl
+        self.name_prefix = name_prefix
+        self.name_root_term = name_root_term
+        self.onto = ol.get_ontology(f"file://{path_file_owl}").load( )
+        self._set_terms = set( self.onto.classes( ) )
+        self._root_term = self[ self.name_root_term ]
+    def __repr__( self ) :
+        return f"<{len( self._set_terms )} ontology terms stored at {self.path_file_owl}>"
+    def __contains__( self, term ) :
+        """
+        # 2024-03-01 21:35:55
+        """
+        return self[ term ] in self._set_terms
+    def __getitem__( self, ontology_id : str ) :
+        """
+        get ontology term using an ID
+        # 2024-02-29 22:51:49
+        """
+        # handle 'ontology_id' that is not a string
+        if not isinstance( ontology_id, str ) :
+            ontology_id
+        l = self.onto.search( iri = f"*{ontology_id}*")
+        if len( l ) == 0 :
+            return None
+        elif len( l ) == 1 :
+            return l[ 0 ]
+        else :
+            return l # if more then one terms are matched, return more than one elements
+    def __iter__( self ) :
+        """
+        return an iterater returning each class
+        # 2024-03-01 00:54:16
+        """
+        return self.onto.classes( )
+    def get_ancestor_chain( self, term ) :
+        """
+        get a chain of ancestors (excluding restriction objects), excluding self, from the most distant ancestor (owl.Thing) to the closest ancestor.
+        Note)
+        This function utilizes a recursive algorithm to explore the tree structure.
+        # 2024-03-01 21:45:16
+        """
+        # get the ontology term
+        term = self[ term ]
+        # initialize the 'l_ancestor'
+        def get_superclasses( term ) :
+            """
+            get filtered super classes of a term
+            # 2024-03-01 23:15:04
+            """
+            return list( e for e in term.is_a if hasattr( e, 'name' ) and e.name[ : len( self.name_prefix ) ] == self.name_prefix )
+        def get_ancestor_chains( term ) :
+            l_ancestor_chain = [ ]
+            l_term_super = get_superclasses( term )
+            # termination condition
+            if len( l_term_super ) == 0 :
+                if term == self._root_term :
+                    return [ [ ] ]
+                else : # if the chain terminate with a term that is not a root term, add the root term
+                    return [ [ self._root_term ] ]
+            # recursive condition
+            for e in l_term_super :
+                for l_ancestor in get_ancestor_chains( e ) :
+                    l_ancestor_chain.append( [ e ] + l_ancestor )
+            return l_ancestor_chain
+        # reverse the order (from the most distant ancestor (the root term) to the closest ancestor)
+        l_ancestor_chain_reverse_order = get_ancestor_chains( term )
+        l_ancestor_chain = [ ] # l_ancestor_chain
+        for ancestor_chain_reverse_order in l_ancestor_chain_reverse_order :
+            l_ancestor_chain.append( ancestor_chain_reverse_order[ : : -1 ] )
+        return l_ancestor_chain
+    def get_longest_shared_ancestor_chains( self, term_1, term_2 ) -> set :
+        """
+        return the ancestor chains to the most closest shared ancestors between the term1 and term2
+        # 2024-02-29 22:58:13
+        """
+        # retrieve ancestor chains
+        l_ancestor_chain_1 = self.get_ancestor_chain( term_1 )
+        l_ancestor_chain_2 = self.get_ancestor_chain( term_2 )
+        # collect the ancestor chains to the most closest shared ancestors
+        set_ancestor_chain_to_most_closest_shared_ancestor = set( ) # initialize 'set_ancestor_chain_to_most_closest_shared_ancestor'
+        for ancestor_chain_1 in l_ancestor_chain_1 : # iterate over chain list #1
+            # find the chain in the chain list # 2 that contains the longest shared chain with the chain in the chain list #1
+            l_index_most_closest_shared_ancestor = [ ]
+            for ancestor_chain_2 in l_ancestor_chain_2 : # iterate over chain list #2
+                index_most_closest_shared_ancestor = 0 # initialize the index that indicate the location of the most closest shared ancestor between the two chains # initialize with the index of the root term
+                for ancestor_1, ancestor_2 in zip( ancestor_chain_1[ 1 : ], ancestor_chain_2[ 1 : ] ) : # retrieve ancester from chain #1 and chain #2 (from the most distant ancestor (excluding the root term) to the closest ancestor)
+                    if ancestor_1 != ancestor_2 : # if the ancestors diverged between chain_1 and chain_2
+                        break
+                    index_most_closest_shared_ancestor += 1 # increase the pointer (take into account the current shared ancestor)
+                l_index_most_closest_shared_ancestor.append( index_most_closest_shared_ancestor )
+            index_chain_2_with_most_closest_shared_ancestor = np.argmax( l_index_most_closest_shared_ancestor ) # define 'most closest shared ancestor' as the ancestor that has the the largest number of ancestors between itself and the root term.
+            ancestor_chain_to_most_closest_shared_ancestor = tuple( l_ancestor_chain_2[ index_chain_2_with_most_closest_shared_ancestor ][ : l_index_most_closest_shared_ancestor[ index_chain_2_with_most_closest_shared_ancestor ] + 1 ] ) # including the most_closest_shared_ancestor in the chain
+            set_ancestor_chain_to_most_closest_shared_ancestor.add( ancestor_chain_to_most_closest_shared_ancestor )
+        return set_ancestor_chain_to_most_closest_shared_ancestor
+    def get_properties( self, term ) :
+        """
+        return the properties of the termk
+        # 2024-03-01 12:51:18
+        """
+        # get ontology terms
+        term = self[ term ]
+        # retrieve properties
+        l_label, l_comment, l_broadsynonym, l_exactsynonym = list( set( term.label ) ), list( set( term.comment ) ), list( set( term.hasBroadSynonym ) ), list( set( term.hasExactSynonym ) )
+        def _parse_property( l ) :
+            return l[ 0 ] if len( l ) > 0 else None
+        dict_property = { 'label' : _parse_property( l_label ), 'comment' : _parse_property( l_comment ), 'broad_synonym' : l_broadsynonym, 'exact_synonym' : l_exactsynonym }
+        return dict_property