PyPI - split3c - Versions diffs - 0.0.1__py3-none-any.whl - Mend

split3c 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

split3c/__init__.py +0 -0
split3c/cli.py +336 -0
split3c/nssite/__init__.py +0 -0
split3c/nssite/auxiliary.py +190 -0
split3c/nssite/bam.py +299 -0
split3c/nssite/fastq.py +148 -0
split3c/nssite/main.py +368 -0
split3c/nssite/processmanager.py +51 -0
split3c/nssite/split.py +849 -0
split3c/resite/__init__.py +33 -0
split3c/resite/frag.py +576 -0
split3c/resite/header.py +91 -0
split3c/resite/index.py +236 -0
split3c/resite/main.py +506 -0
split3c/resite/pretreatment.py +299 -0
split3c/resite/read.py +91 -0
split3c/resite/write_control.py +111 -0
split3c/resolve/__init__.py +0 -0
split3c/resolve/bam.py +129 -0
split3c/resolve/io_utils.py +77 -0
split3c/resolve/main.py +506 -0
split3c/resolve/pairs.py +56 -0
split3c/resolve/parse.py +1218 -0
split3c-0.0.1.dist-info/METADATA +100 -0
split3c-0.0.1.dist-info/RECORD +29 -0
split3c-0.0.1.dist-info/WHEEL +5 -0
split3c-0.0.1.dist-info/entry_points.txt +5 -0
split3c-0.0.1.dist-info/licenses/LICENSE +235 -0
split3c-0.0.1.dist-info/top_level.txt +1 -0

split3c/resite/index.py ADDED Viewed

@@ -0,0 +1,236 @@
+"""
+This script is part of the split3c project, designed to process paired-end FASTQ files by fragmenting DNA sequences at specified restriction enzyme sites.
+Copyright © 2024 Samir Bertache
+SPDX-License-Identifier: AGPL-3.0-or-later
+===============================================================================
+This program is free software: you can redistribute it and/or modify it under
+the terms of the GNU Affero General Public License as published by the
+Free Software Foundation, either version 3 of the License, or (at your option)
+any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU Affero General Public License for more details.
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <https://www.gnu.org/licenses/>.
+"""
+import logging
+import re
+from typing import List, Tuple
+logging.basicConfig(level=logging.INFO)
+#################### Specific part of Borderless option #####################
def find_positions_for_one_site_borderless(
    text: str, Enzyme: Tuple[re.Pattern, int]
) -> List[List[int]]:
    """
    Locate every match of a single site pattern in ``text``.

    Matches come from ``finditer``, so they are reported left to right and
    never overlap one another.

    Parameters:
        text (str): The text to scan.
        Enzyme (Tuple[re.Pattern, int]): A compiled regex and the offset
            value to attach to each match.
    Returns:
        One ``[match_start, offset]`` pair per match.
    Examples:
        >>> find_positions_for_one_site_borderless("", (re.compile("A"), 1))
        []
        >>> find_positions_for_one_site_borderless("AAAA", (re.compile("AA"), 2))
        [[0, 2], [2, 2]]
        >>> find_positions_for_one_site_borderless("XYZ", (re.compile("A"), 1))
        []
        >>> find_positions_for_one_site_borderless("GATCGATC", (re.compile("GATC"), 4))
        [[0, 4], [4, 4]]
    """
    pattern, shift = Enzyme
    match_starts = [hit.start() for hit in pattern.finditer(text)]
    return [[begin, shift] for begin in match_starts]
def find_all_pos_borderless(
    text: str,
    ligation_site_list: List[Tuple[re.Pattern, int]],
) -> List[List[int]]:
    """
    Collect the ``[position, offset]`` hits of every enzyme, add an
    end-of-text entry ``[len(text), 0]``, and return them ordered by position.

    The sort is stable and keyed on position only, so entries sharing a
    position keep the order in which they were collected.

    Examples:
        >>> find_all_pos_borderless("", [])
        [[0, 0]]
        >>> find_all_pos_borderless("AAAA", [])
        [[4, 0]]
        >>> find_all_pos_borderless("GAATTC", [(re.compile("GAATTC"), 6)])
        [[0, 6], [6, 0]]
        >>> find_all_pos_borderless("XXGAATTCYYGATCZZ", [(re.compile("GAATTC"), 6), (re.compile("GATC"), 4)])
        [[2, 6], [10, 4], [16, 0]]
    """
    collected: List[List[int]] = []
    for site in ligation_site_list:
        collected.extend(find_positions_for_one_site_borderless(text, site))
    # The end of the sequence is always recorded, with a zero offset.
    collected.append([len(text), 0])
    collected.sort(key=lambda entry: entry[0])
    return collected
def IndexFragList_borderless(
    all_index_list: List[List[int]], seed_size: int
) -> List[List[int]]:
    """
    Turn ``[[position, offset], ...]`` entries into ``[start, end]`` fragments
    that exclude the site borders.

    A fragment ends at an entry's position and starts at the previous
    entry's ``position + offset`` (0 for the first entry). Only fragments
    whose length ``end - start`` is at least ``seed_size`` are kept.

    Parameters:
        all_index_list (List[List[int]]): ``[position, offset]`` pairs, read
            in the order given.
        seed_size (int): Minimum fragment length (inclusive).
    Returns:
        List of ``[start, end]`` pairs.
    Examples:
        >>> # no fragment (text too short)
        >>> IndexFragList_borderless([[3,2],[5,2]], 10)
        []
        >>> # exact seed_size
        >>> IndexFragList_borderless([[0,1],[5,1],[10,0]], 4)
        [[1, 5], [6, 10]]
    """
    fragments: List[List[int]] = []
    # Start of the fragment currently being measured: just past the
    # previous site (its position plus its offset).
    start = 0
    for position, offset in all_index_list:
        if position - start >= seed_size:
            fragments.append([start, position])
        start = position + offset
    return fragments
def index_list_single_borderless(
    sequence: str,
    ligation_site_list: List[Tuple[re.Pattern, int]],
    seed_size: int,
) -> List[List[int]]:
    """
    Return the borderless fragment indices of a single sequence.

    Chains ``find_all_pos_borderless`` (site positions plus end marker) into
    ``IndexFragList_borderless`` (fragment extraction and length filter).

    Examples:
        >>> # no enzyme sites => single fragment from 0 to len
        >>> index_list_single_borderless("AAAA", [], 0)
        [[0, 4]]
    """
    return IndexFragList_borderless(
        find_all_pos_borderless(sequence, ligation_site_list),
        seed_size,
    )
def index_list_borderless(
    Sequences: List[str],
    ligation_site_list: List[Tuple[re.Pattern, int]],
    seed_size: int,
) -> Tuple[List[List[int]], List[List[int]]]:
    """
    Run the borderless fragment computation on ``Sequences[0]`` (forward)
    and ``Sequences[1]`` (reverse) and return both results as a pair.

    Examples:
        >>> f,r = index_list_borderless(["AAAA","BBBB"], [], 0)
        >>> f, r
        ([[0, 4]], [[0, 4]])
    """
    return (
        index_list_single_borderless(Sequences[0], ligation_site_list, seed_size),
        index_list_single_borderless(Sequences[1], ligation_site_list, seed_size),
    )
+# ------------------- Classic part (simple positions) -----------------------
def find_positions_for_one_site(text: str, Enzyme: Tuple[re.Pattern, int]) -> List[int]:
    """
    Return ``match_start + offset`` for each match of the pattern in ``text``.

    Parameters:
        text (str): The text to scan.
        Enzyme (Tuple[re.Pattern, int]): A compiled regex and the offset
            added to every match start.
    Examples:
        >>> find_positions_for_one_site("", (re.compile("A"),1))
        []
        >>> find_positions_for_one_site("AZAA", (re.compile("ZAA"),3))
        [4]
        >>> find_positions_for_one_site("XYZ", (re.compile("A"),1))
        []
    """
    pattern, shift = Enzyme
    match_starts = (hit.start() for hit in pattern.finditer(text))
    return [begin + shift for begin in match_starts]
def find_all_pos(
    text: str, ligation_site_list: List[Tuple[re.Pattern, int]]
) -> List[int]:
    """
    Return the distinct cut points of ``text`` in ascending order: 0, every
    position reported for each enzyme, and ``len(text)``.

    Examples:
        >>> find_all_pos("AAAA", [])
        [0, 4]
        >>> find_all_pos("AXA", [(re.compile("XA"),2)])
        [0, 3]
    """
    # A set removes duplicates, e.g. a site ending exactly at len(text).
    cut_points = {0, len(text)}
    for site in ligation_site_list:
        cut_points.update(find_positions_for_one_site(text, site))
    return sorted(cut_points)
def IndexFragList(index_list: List[int], seed_size: int) -> List[List[int]]:
    """
    Pair up consecutive positions into ``[prev, curr]`` fragments and keep
    those whose length ``curr - prev`` is at least ``seed_size``.

    The threshold is inclusive: a fragment exactly ``seed_size`` long is kept.

    Parameters:
        index_list (List[int]): Positions, read in the order given.
        seed_size (int): Minimum fragment length (inclusive).
    Returns:
        List of ``[prev, curr]`` pairs; empty when fewer than two positions
        are given.
    Examples:
        >>> IndexFragList([0,5,10], 0)
        [[0, 5], [5, 10]]
        >>> IndexFragList([0,5,10], 5)
        [[0, 5], [5, 10]]
        >>> IndexFragList([0,3,5], 4)
        []
    """
    return [
        [prev, curr]
        for prev, curr in zip(index_list, index_list[1:])
        if curr - prev >= seed_size
    ]
def index_list_single(
    sequence: str,
    ligation_site_list: List[Tuple[re.Pattern, int]],
    seed_size: int,
) -> List[List[int]]:
    """
    Return the fragment indices of a single sequence.

    Feeds the cut points from ``find_all_pos`` into ``IndexFragList``.

    Examples:
        >>> index_list_single("AAAA", [], 0)
        [[0, 4]]
        >>> index_list_single("AXA", [(re.compile("XA"),2)], 0)
        [[0, 3]]
    """
    return IndexFragList(find_all_pos(sequence, ligation_site_list), seed_size)
def index_list(
    Sequences: List[str],
    ligation_site_list: List[Tuple[re.Pattern, int]],
    seed_size: int,
) -> Tuple[List[List[int]], List[List[int]]]:
    """
    Compute fragment indices for ``Sequences[0]`` (forward) and
    ``Sequences[1]`` (reverse) and return them as a pair.

    Examples:
        >>> index_list(["AAAA","BBBB"], [], 0)
        ([[0, 4]], [[0, 4]])
    """
    forward_fragments = index_list_single(Sequences[0], ligation_site_list, seed_size)
    reverse_fragments = index_list_single(Sequences[1], ligation_site_list, seed_size)
    return forward_fragments, reverse_fragments