PyPI - RiboParser - Versions diffs - 0.2.2__tar.gz → 0.2.3__tar.gz - Mend

RiboParser 0.2.2tar.gz → 0.2.3tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (150) hide show

{riboparser-0.2.2 → riboparser-0.2.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: RiboParser
-Version: 0.2.2
+Version: 0.2.3
 Summary: A pipeline for ribosome profiling data analysis
 Author-email: Ren Shuchao <rensc0718@163.com>
 License-Expression: GPL-3.0-or-later

{riboparser-0.2.2 → riboparser-0.2.3}/RiboParser.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: RiboParser
-Version: 0.2.2
+Version: 0.2.3
 Summary: A pipeline for ribosome profiling data analysis
 Author-email: Ren Shuchao <rensc0718@163.com>
 License-Expression: GPL-3.0-or-later

{riboparser-0.2.2 → riboparser-0.2.3}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "RiboParser"
-version = "0.2.2"
+version = "0.2.3"
 authors = [{ name = "Ren Shuchao", email = "rensc0718@163.com" }]
 description = "A pipeline for ribosome profiling data analysis"
 readme = "README.md"

riboparser-0.2.3/utils/smorf/overlap.py ADDED Viewed

@@ -0,0 +1,234 @@
+# Author: Rensc
+# date: 2026-05-21
+"""
+ORF overlap marker.
+This module marks ORF overlap types and assigns priority labels.
+Priority rule:
+1. annotated_mORF is preferred.
+2. complete ORF is preferred over partial ORF.
+3. ATG start codon is preferred over non-ATG start codons.
+4. Stronger Kozak context is preferred.
+5. Longer ORF is preferred.
+6. More upstream start site is preferred.
+Important:
+Different-frame overlapping ORFs are not suppressed by default.
+"""
+from typing import List, Dict, Tuple
+from .models import ORFRecord
+# Preference rank of each supported start codon; a lower number is preferred.
+# ORFOverlapMarker._priority_key looks codons up here and falls back to
+# rank 99 for any codon that is not listed.
+START_CODON_RANK = {
+    "ATG": 1,
+    "CTG": 2,
+    "GTG": 3,
+    "TTG": 4,
+    "ACG": 5,
+    "ATA": 6,
+    "ATT": 7,
+    "ATC": 8,
+}
+class ORFOverlapMarker:
+    """
+    Mark ORF overlap type and priority.
+    All methods are static. Records are modified in place through their
+    ``overlap_type`` and ``priority`` attributes; nothing is returned and
+    no record is removed from the input list.
+    """
+    @staticmethod
+    def mark(records: List[ORFRecord]) -> None:
+        """
+        Mark ORF overlap relationships.
+        Different-frame overlaps are marked first, then same-frame overlaps.
+        A same-frame downgrade always overwrites ``overlap_type``, while the
+        record selected as best keeps a label that was already set.
+        Parameters
+        ----------
+        records : list
+            List of ORFRecord objects.
+        """
+        ORFOverlapMarker._mark_different_frame_overlap(records)
+        ORFOverlapMarker._mark_same_frame_overlap(records)
+    @staticmethod
+    def _mark_same_frame_overlap(records: List[ORFRecord]) -> None:
+        """
+        Mark overlaps among ORFs with the same transcript, strand, and frame.
+        For every record, the best ORF is selected among the record itself
+        and the records that directly overlap it. The record is labelled
+        "primary" if it is that best ORF and is downgraded otherwise.
+        Records without any overlapping partner are left untouched.
+        Parameters
+        ----------
+        records : list
+            List of ORFRecord objects.
+        """
+        grouped: Dict[Tuple[str, str, int], List[ORFRecord]] = {}
+        for rec in records:
+            key = (rec.transcript_id, rec.source_strand, rec.frame)
+            grouped.setdefault(key, []).append(rec)
+        for _, items in grouped.items():
+            # Sorts the per-group list only; the caller's list keeps its order.
+            items.sort(key=lambda x: (x.tx_orf_start, x.tx_orf_end))
+            for rec in items:
+                # Every record is compared against every other record of its group.
+                competitors = [
+                    other for other in items
+                    if other is not rec
+                    and ORFOverlapMarker._is_overlap(rec, other)
+                ]
+                if not competitors:
+                    continue
+                # "best" is local to this record's direct overlap partners,
+                # not to the whole chain of transitively overlapping ORFs.
+                best = ORFOverlapMarker._select_best_orf([rec] + competitors)
+                if rec is best:
+                    # Keep a label set earlier (e.g. by the different-frame pass).
+                    if rec.overlap_type == "none":
+                        rec.overlap_type = "same_frame_overlap"
+                    rec.priority = "primary"
+                else:
+                    ORFOverlapMarker._downgrade_orf(rec, best)
+    @staticmethod
+    def _mark_different_frame_overlap(records: List[ORFRecord]) -> None:
+        """
+        Mark different-frame overlaps without suppressing either ORF.
+        Only ``overlap_type`` is written, and only when it is still "none";
+        ``priority`` is never changed here.
+        Parameters
+        ----------
+        records : list
+            List of ORFRecord objects.
+        """
+        grouped: Dict[Tuple[str, str], List[ORFRecord]] = {}
+        for rec in records:
+            key = (rec.transcript_id, rec.source_strand)
+            grouped.setdefault(key, []).append(rec)
+        for _, items in grouped.items():
+            for i, rec in enumerate(items):
+                for j, other in enumerate(items):
+                    # Visit each unordered pair exactly once.
+                    if i >= j:
+                        continue
+                    # Same-frame pairs are handled by _mark_same_frame_overlap.
+                    if rec.frame == other.frame:
+                        continue
+                    if ORFOverlapMarker._is_overlap(rec, other):
+                        if rec.overlap_type == "none":
+                            rec.overlap_type = "overlap_different_frame"
+                        if other.overlap_type == "none":
+                            other.overlap_type = "overlap_different_frame"
+    @staticmethod
+    def _downgrade_orf(rec: ORFRecord, best: ORFRecord) -> None:
+        """
+        Downgrade an ORF according to its relationship with the selected best ORF.
+        Sets ``priority`` to "secondary" and overwrites ``overlap_type`` with
+        the first matching label, tested in this order: identical, nested,
+        same stop, same start, any other same-frame overlap.
+        Parameters
+        ----------
+        rec : ORFRecord
+            The ORF that lost the comparison.
+        best : ORFRecord
+            The ORF selected as best among rec and its overlap partners.
+        """
+        rec.priority = "secondary"
+        if ORFOverlapMarker._is_identical(rec, best):
+            rec.overlap_type = "identical_ORF"
+        elif ORFOverlapMarker._is_nested(rec, best):
+            # rec lies inside best; a non-ATG start inside an ATG ORF gets its own label.
+            if rec.start_codon != "ATG" and best.start_codon == "ATG":
+                rec.overlap_type = "secondary_noncanonical_start"
+            else:
+                rec.overlap_type = "nested"
+        elif rec.tx_orf_end == best.tx_orf_end:
+            # Same end but not nested, so rec starts upstream of best.
+            if rec.start_codon != best.start_codon:
+                rec.overlap_type = "alternative_start_same_stop"
+            else:
+                rec.overlap_type = "same_stop_overlap"
+        elif rec.tx_orf_start == best.tx_orf_start:
+            # Same start but not nested, so rec ends downstream of best.
+            rec.overlap_type = "same_start_different_stop"
+        else:
+            rec.overlap_type = "same_frame_overlap_different_stop"
+    @staticmethod
+    def _select_best_orf(records: List[ORFRecord]) -> ORFRecord:
+        """
+        Select the most reliable ORF from overlapping ORFs.
+        Parameters
+        ----------
+        records : list
+            Non-empty list of ORFRecord objects; an empty list raises IndexError.
+        Returns
+        -------
+        ORFRecord
+            The record with the smallest priority key.
+        """
+        # sorted() is stable, so records with equal keys keep their input order.
+        return sorted(records, key=ORFOverlapMarker._priority_key)[0]
+    @staticmethod
+    def _priority_key(rec: ORFRecord):
+        """
+        Build sorting key for ORF priority.
+        Lower value means higher priority.
+        Returns
+        -------
+        tuple
+            (annotated, completeness, start codon, Kozak, length, start
+            position) ranks, compared left to right.
+        """
+        annotated_rank = 0 if rec.category == "annotated_mORF" else 1
+        completeness_rank = 0 if rec.completeness == "complete" else 1
+        # Start codons missing from START_CODON_RANK sort after all listed ones.
+        start_rank = START_CODON_RANK.get(rec.start_codon, 99)
+        # Negated so that a higher Kozak score sorts first.
+        kozak_rank = -ORFOverlapMarker._kozak_score(rec.kozak_seq)
+        # Longer ORFs are preferred after biological confidence rules.
+        length_rank = -rec.aa_length
+        # More upstream start site is preferred if all other ranks are equal.
+        start_position_rank = rec.tx_orf_start
+        return (
+            annotated_rank,
+            completeness_rank,
+            start_rank,
+            kozak_rank,
+            length_rank,
+            start_position_rank,
+        )
+    @staticmethod
+    def _kozak_score(kozak_seq: str) -> int:
+        """
+        Calculate a simple Kozak score.
+        Rule:
+        - Position -3 is A/G: +1
+        - Position +4 is G: +1
+        The input sequence is expected to contain:
+        upstream sequence + start codon + downstream sequence.
+        Parameters
+        ----------
+        kozak_seq : str
+            Kozak context sequence; case-insensitive. Empty or None scores 0.
+        Returns
+        -------
+        int
+            Score between 0 and 2.
+        """
+        if not kozak_seq:
+            return 0
+        seq = kozak_seq.upper()
+        score = 0
+        # Default scanner extracts 6 nt upstream + 3 nt start codon + downstream.
+        # Therefore start codon begins at index 6 if full Kozak sequence exists.
+        # NOTE(review): the offset 6 is hard-coded; if the sequence was extracted
+        # with a different upstream length the scored positions shift - confirm.
+        # Sequences shorter than 9 nt fall back to a midpoint estimate.
+        start_index = 6 if len(seq) >= 9 else max(0, len(seq) // 2 - 1)
+        minus3_index = start_index - 3
+        plus4_index = start_index + 3
+        # Positions that fall outside the sequence are simply not scored.
+        if 0 <= minus3_index < len(seq) and seq[minus3_index] in {"A", "G"}:
+            score += 1
+        if 0 <= plus4_index < len(seq) and seq[plus4_index] == "G":
+            score += 1
+        return score
+    @staticmethod
+    def _is_overlap(a: ORFRecord, b: ORFRecord) -> bool:
+        """
+        Check whether two ORFs overlap in transcript coordinates.
+        The comparison is strict, so ORFs that only touch (one ends exactly
+        where the other starts) are not reported as overlapping.
+        """
+        return a.tx_orf_start < b.tx_orf_end and a.tx_orf_end > b.tx_orf_start
+    @staticmethod
+    def _is_nested(a: ORFRecord, b: ORFRecord) -> bool:
+        """
+        Check whether ORF a is fully contained within ORF b.
+        The bounds are inclusive, so identical ORFs also count as nested.
+        """
+        return b.tx_orf_start <= a.tx_orf_start and b.tx_orf_end >= a.tx_orf_end
+    @staticmethod
+    def _is_identical(a: ORFRecord, b: ORFRecord) -> bool:
+        """
+        Check whether two ORFs have identical transcript coordinates.
+        Only start and end are compared; no other attribute is considered.
+        """
+        return a.tx_orf_start == b.tx_orf_start and a.tx_orf_end == b.tx_orf_end

riboparser-0.2.3/utils/smorf/pipeline.py ADDED Viewed

@@ -0,0 +1,287 @@
+# Author: Rensc
+# date: 2026-05-21
+"""
+Main smORF pipeline.
+This pipeline connects all functional modules:
+1. Read genome FASTA.
+2. Read genePred annotation.
+3. Reconstruct transcript sequences.
+4. Scan ORFs.
+5. Classify ORFs.
+6. Mark ORF overlaps.
+7. Write output files.
+Parallel mode uses multiprocessing instead of threading because ORF scanning
+is CPU-intensive and Python threads are limited by the GIL.
+"""
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from .fasta import FastaParser
+from .genepred import GenePredParser
+from .coordinate import CoordinateMapper
+from .scanner import ORFScanner
+from .classifier import ORFClassifier
+from .overlap import ORFOverlapMarker
+from .writer import GenePredWriter, MessageWriter, FastaWriter
+# Per-process state: assigned by _init_worker and read by _scan_transcript_worker.
+_WORKER_GENOME = None
+_WORKER_CONFIG = None
+def _init_worker(genome, config):
+    """
+    Initialize worker-level global objects.
+    This avoids sending the genome dictionary to every single transcript task.
+    Parameters
+    ----------
+    genome : object
+        Genome data stored in _WORKER_GENOME for later transcript tasks.
+    config : dict
+        Scanner and filtering options stored in _WORKER_CONFIG.
+    """
+    global _WORKER_GENOME
+    global _WORKER_CONFIG
+    _WORKER_GENOME = genome
+    _WORKER_CONFIG = config
+def _scan_transcript_worker(task):
+    """
+    Worker function for scanning one transcript.
+    Reads its settings and genome from the module-level _WORKER_CONFIG and
+    _WORKER_GENOME, so _init_worker must have run in this process first.
+    Parameters
+    ----------
+    task : tuple
+        Tuple of transcript index and Transcript object.
+    Returns
+    -------
+    tuple
+        Transcript index, transcript ID, gene ID, and ORF records.
+    """
+    idx, tx = task
+    # A new scanner is built for every task from the shared worker config.
+    scanner = ORFScanner(
+        start_codons=_WORKER_CONFIG["start_codons"],
+        min_aa=_WORKER_CONFIG["min_aa"],
+        max_aa=_WORKER_CONFIG["max_aa"],
+        scan_strand=_WORKER_CONFIG["scan_strand"],
+        kozak_up=_WORKER_CONFIG["kozak_up"],
+        kozak_down=_WORKER_CONFIG["kozak_down"],
+        include_stop=_WORKER_CONFIG["include_stop"],
+    )
+    # Reconstruct spliced transcript sequence before ORF scanning.
+    # The return value is ignored; presumably the sequence is stored on tx - TODO confirm.
+    CoordinateMapper.build_transcript_sequence(tx, _WORKER_GENOME)
+    # Scan candidate ORFs from the transcript sequence.
+    tx_records = scanner.scan_transcript(tx)
+    # Assign ORF category labels.
+    ORFClassifier.classify(tx, tx_records)
+    # Mark nested or overlapping ORFs if requested.
+    if _WORKER_CONFIG["mark_overlap"]:
+        ORFOverlapMarker.mark(tx_records)
+    # Drop records whose priority is "discarded" if requested.
+    # NOTE(review): ORFOverlapMarker.mark only assigns "primary"/"secondary";
+    # confirm that some other step actually sets "discarded".
+    if _WORKER_CONFIG["remove_discarded"]:
+        tx_records = [x for x in tx_records if x.priority != "discarded"]
+    return idx, tx.transcript_id, tx.gene_id, tx_records
+class SmORFPipeline:
+    """
+    High-level pipeline for transcript-centric smORF detection.
+    Collected ORF records accumulate in ``self.records`` and are written to
+    four output files by ``write_outputs``.
+    """
+    def __init__(
+        self,
+        genome: str,
+        annotation: str,
+        out_prefix: str = "ORF",
+        orf_prefix: str = "ORF",
+        start_codons: str = "ATG",
+        min_aa: int = 8,
+        max_aa: int = 10000,
+        scan_strand: str = "sense",
+        kozak_up: int = 6,
+        kozak_down: int = 6,
+        mark_overlap: bool = False,
+        remove_discarded: bool = False,
+        include_stop: bool = False,
+        threads: int = 1,
+    ):
+        """
+        Initialize smORF pipeline.
+        Parameters
+        ----------
+        genome : str
+            Path passed to FastaParser.read_fasta.
+        annotation : str
+            Path passed to GenePredParser.read_genepred.
+        out_prefix : str
+            Prefix of the output file names.
+        orf_prefix : str
+            Prefix of the generated ORF IDs.
+        start_codons : str
+            Comma-separated start codons; each item is stripped and upper-cased.
+        min_aa, max_aa : int
+            Length limits forwarded to ORFScanner.
+        scan_strand : str
+            Strand mode forwarded to ORFScanner.
+        kozak_up, kozak_down : int
+            Kozak context sizes forwarded to ORFScanner.
+        mark_overlap : bool
+            Run ORFOverlapMarker.mark on the records of each transcript.
+        remove_discarded : bool
+            Drop records whose priority is "discarded".
+        include_stop : bool
+            Forwarded to ORFScanner.
+        threads : int
+            Number of worker processes; values below 1 are treated as 1.
+        """
+        self.genome_path = genome
+        self.annotation_path = annotation
+        self.out_prefix = out_prefix
+        self.orf_prefix = orf_prefix
+        self.start_codons = [x.strip().upper() for x in start_codons.split(",")]
+        self.min_aa = min_aa
+        self.max_aa = max_aa
+        self.scan_strand = scan_strand
+        self.kozak_up = kozak_up
+        self.kozak_down = kozak_down
+        self.mark_overlap = mark_overlap
+        self.remove_discarded = remove_discarded
+        self.include_stop = include_stop
+        self.threads = max(1, int(threads))
+        self.records = []
+    def run(self) -> None:
+        """
+        Run the complete smORF scanning pipeline.
+        Reads the genome and the annotation, scans all transcripts in
+        single-process or multiprocessing mode depending on ``threads``,
+        then writes the output files.
+        """
+        genome = FastaParser.read_fasta(self.genome_path)
+        transcripts = GenePredParser.read_genepred(self.annotation_path)
+        if self.threads == 1:
+            self._run_single_process(genome, transcripts)
+        else:
+            self._run_multi_process(genome, transcripts)
+        self.write_outputs()
+    def _run_single_process(self, genome, transcripts) -> None:
+        """
+        Run smORF scanning in single-process mode.
+        Transcripts are processed in input order and their records are
+        appended to ``self.records`` with consecutive ORF IDs.
+        """
+        scanner = ORFScanner(
+            start_codons=self.start_codons,
+            min_aa=self.min_aa,
+            max_aa=self.max_aa,
+            scan_strand=self.scan_strand,
+            kozak_up=self.kozak_up,
+            kozak_down=self.kozak_down,
+            include_stop=self.include_stop,
+        )
+        total_tx = len(transcripts)
+        orf_index = 1
+        for idx, tx in enumerate(transcripts, start=1):
+            # Print scanning progress.
+            print(
+                "[smORFScanner] [{}/{}] Scanning gene={}, transcript={}, chrom={}, strand={}".format(
+                    idx,
+                    total_tx,
+                    tx.gene_id,
+                    tx.transcript_id,
+                    tx.chrom,
+                    tx.strand,
+                ),
+                flush=True,
+            )
+            # Reconstruct spliced transcript sequence before ORF scanning.
+            CoordinateMapper.build_transcript_sequence(tx, genome)
+            # Scan candidate ORFs from the transcript sequence.
+            tx_records = scanner.scan_transcript(tx)
+            # Assign ORF category labels.
+            ORFClassifier.classify(tx, tx_records)
+            # Mark nested or overlapping ORFs if requested.
+            if self.mark_overlap:
+                ORFOverlapMarker.mark(tx_records)
+            # Drop records whose priority is "discarded" if requested.
+            # NOTE(review): ORFOverlapMarker.mark only assigns "primary"/"secondary";
+            # confirm that some other step actually sets "discarded".
+            if self.remove_discarded:
+                tx_records = [x for x in tx_records if x.priority != "discarded"]
+            # Assign stable ORF IDs.
+            # IDs are the prefix followed by a zero-padded 8-digit running counter.
+            for rec in tx_records:
+                rec.orf_id = "{}{:08d}".format(self.orf_prefix, orf_index)
+                orf_index += 1
+            self.records.extend(tx_records)
+    def _run_multi_process(self, genome, transcripts) -> None:
+        """
+        Run smORF scanning in multiprocessing mode.
+        One task per transcript is submitted to a process pool. Results
+        arrive in completion order, are stored by transcript index, and are
+        then replayed in the original transcript order before ORF IDs are
+        assigned.
+        """
+        total_tx = len(transcripts)
+        # Options read by _scan_transcript_worker through _WORKER_CONFIG.
+        config = {
+            "start_codons": self.start_codons,
+            "min_aa": self.min_aa,
+            "max_aa": self.max_aa,
+            "scan_strand": self.scan_strand,
+            "kozak_up": self.kozak_up,
+            "kozak_down": self.kozak_down,
+            "include_stop": self.include_stop,
+            "mark_overlap": self.mark_overlap,
+            "remove_discarded": self.remove_discarded,
+        }
+        print(
+            "[smORFScanner] Running in multiprocessing mode with {} workers.".format(
+                self.threads
+            ),
+            flush=True,
+        )
+        results_by_index = {}
+        # genome and config reach each worker once, via the pool initializer.
+        with ProcessPoolExecutor(
+            max_workers=self.threads,
+            initializer=_init_worker,
+            initargs=(genome, config),
+        ) as executor:
+            # All tasks are submitted up front. The mapped index is never read
+            # back, because the worker returns its own idx.
+            future_to_index = {
+                executor.submit(_scan_transcript_worker, (idx, tx)): idx
+                for idx, tx in enumerate(transcripts, start=1)
+            }
+            finished = 0
+            for future in as_completed(future_to_index):
+                # result() re-raises any exception raised inside the worker.
+                idx, transcript_id, gene_id, tx_records = future.result()
+                results_by_index[idx] = tx_records
+                finished += 1
+                # Print completed transcript progress.
+                print(
+                    "[smORFScanner] [{}/{}] Finished gene={}, transcript={}, ORFs={}".format(
+                        finished,
+                        total_tx,
+                        gene_id,
+                        transcript_id,
+                        len(tx_records),
+                    ),
+                    flush=True,
+                )
+        # Rebuild records in original transcript order and assign stable ORF IDs.
+        orf_index = 1
+        for idx in range(1, total_tx + 1):
+            tx_records = results_by_index.get(idx, [])
+            for rec in tx_records:
+                rec.orf_id = "{}{:08d}".format(self.orf_prefix, orf_index)
+                orf_index += 1
+            self.records.extend(tx_records)
+    def write_outputs(self) -> None:
+        """
+        Write all output files.
+        Four files named after ``out_prefix`` are written from
+        ``self.records``: ``.genePred``, ``.message.txt``, ``.nt.fa`` and
+        ``.pep.fa``.
+        """
+        GenePredWriter.write("{}.genePred".format(self.out_prefix), self.records)
+        MessageWriter.write("{}.message.txt".format(self.out_prefix), self.records)
+        FastaWriter.write_nt("{}.nt.fa".format(self.out_prefix), self.records)
+        FastaWriter.write_pep("{}.pep.fa".format(self.out_prefix), self.records)

{riboparser-0.2.2 → riboparser-0.2.3}/utils/smorf_scanner.py RENAMED Viewed

@@ -106,6 +106,14 @@ def parse_args():
         help="Number of downstream nucleotides after start codon for Kozak sequence."
     )
+    parser.add_argument(
+        "-t",
+        "--threads",
+        type=int,
+        default=1,
+        help="Number of worker processes for parallel ORF scanning."
+    )
     parser.add_argument(
         "--mark-overlap",
         action="store_true",
@@ -148,6 +156,7 @@ def main():
         mark_overlap=args.mark_overlap,
         remove_discarded=args.remove_discarded,
         include_stop=args.include_stop,
+        threads=args.threads,
     )
     pipeline.run()

riboparser-0.2.2/utils/smorf/overlap.py DELETED Viewed

@@ -1,76 +0,0 @@
-# Author: Rensc
-# date: 2026-05-21
-"""
-ORF overlap marker.
-This module marks nested or partially overlapping ORFs within the same:
-1. transcript
-2. source strand
-3. reading frame
-"""
-from typing import List
-from .models import ORFRecord
-class ORFOverlapMarker:
-    """
-    Mark nested and overlapping ORFs.
-    """
-    @staticmethod
-    def mark(records: List[ORFRecord]) -> None:
-        """
-        Mark ORF overlap type and priority.
-        A shorter ORF fully contained in a longer ORF with the same frame
-        is marked as secondary.
-        Parameters
-        ----------
-        records : list
-            List of ORFRecord objects.
-        """
-        grouped = {}
-        # Group ORFs by transcript, strand, and frame.
-        for rec in records:
-            key = (rec.transcript_id, rec.source_strand, rec.frame)
-            grouped.setdefault(key, []).append(rec)
-        for _, items in grouped.items():
-            items.sort(key=lambda x: (x.tx_orf_start, -(x.tx_orf_end - x.tx_orf_start)))
-            for i, rec in enumerate(items):
-                for j, other in enumerate(items):
-                    if i == j:
-                        continue
-                    is_nested = (
-                        other.tx_orf_start <= rec.tx_orf_start
-                        and other.tx_orf_end >= rec.tx_orf_end
-                        and (other.tx_orf_end - other.tx_orf_start)
-                        > (rec.tx_orf_end - rec.tx_orf_start)
-                    )
-                    if is_nested:
-                        rec.priority = "secondary"
-                        rec.overlap_type = "nested"
-                        break
-                # If the ORF is not nested, check partial overlap.
-                if rec.priority == "primary":
-                    for other in items:
-                        if rec is other:
-                            continue
-                        is_overlap = (
-                            rec.tx_orf_start < other.tx_orf_end
-                            and rec.tx_orf_end > other.tx_orf_start
-                        )
-                        if is_overlap:
-                            rec.overlap_type = "partial_overlap"
-                            break

RiboParser 0.2.2__tar.gz → 0.2.3__tar.gz

RiboParser 0.2.2tar.gz → 0.2.3tar.gz