PyPI - RiboParser - Versions diffs - 0.2.1__tar.gz → 0.2.3__tar.gz - Mend

RiboParser 0.2.1__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (153) hide show

{riboparser-0.2.1 → riboparser-0.2.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: RiboParser
-Version: 0.2.1
+Version: 0.2.3
 Summary: A pipeline for ribosome profiling data analysis
 Author-email: Ren Shuchao <rensc0718@163.com>
 License-Expression: GPL-3.0-or-later

{riboparser-0.2.1 → riboparser-0.2.3}/RiboParser.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: RiboParser
-Version: 0.2.1
+Version: 0.2.3
 Summary: A pipeline for ribosome profiling data analysis
 Author-email: Ren Shuchao <rensc0718@163.com>
 License-Expression: GPL-3.0-or-later

{riboparser-0.2.1 → riboparser-0.2.3}/RiboParser.egg-info/SOURCES.txt RENAMED Viewed

@@ -62,8 +62,6 @@ scripts/rsem/merge_rsem.py
 scripts/unix/__init__.py
 scripts/unix/dos2unix.py
 utils/__init__.py
-utils/make_ensb_ref.py
-utils/make_ribo_ref.py
 utils/riboparser.py
 utils/rna_Density.py
 utils/rna_Offset.py

{riboparser-0.2.1 → riboparser-0.2.3}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "RiboParser"
-version = "0.2.1"
+version = "0.2.3"
 authors = [{ name = "Ren Shuchao", email = "rensc0718@163.com" }]
 description = "A pipeline for ribosome profiling data analysis"
 readme = "README.md"

riboparser-0.2.3/utils/data/RiboParser.py ADDED Viewed

@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Project : riboParser
+# @Script  : riboparser.py
+import pkg_resources
+class RiboParserInfo:
+    """
+    Package metadata and self-diagnostic helpers for RiboParser.
+    The class is used as a namespace: every helper is a classmethod and
+    prints its report to stdout.
+    """
+    # Resolve the installed version once, when the class body is executed.
+    # Any failure while reading the distribution metadata falls back to
+    # the literal "unknown" instead of breaking the import of this module.
+    try:
+        version = pkg_resources.get_distribution("RiboParser").version
+    except Exception:
+        version = "unknown"
+    update_date = "2026-05-21"
+    citation = (
+        '''
+        Shuchao Ren, Yinan Li, Zhipeng Zhou.
+        RiboParser/RiboShiny: An integrated platform for comprehensive analysis and visualization of ribo-seq data.
+        Journal of Genetics and Genomics (2025)
+        doi:10.1016/j.jgg.2025.04.010.
+        '''
+    )
+    # Distribution names looked up by check_dependencies().
+    required_packages = ["pandas", "polars", "numpy", "matplotlib-venn", "seqlogo",
+                         "matplotlib", "seaborn", "biopython",
+                         "scipy", "scikit-learn", "statsmodels",
+                         "pysam", "joblib"]
+    @classmethod
+    def show_version(cls):
+        """
+        Print the package version and the last update date.
+        """
+        print(f"RiboParser version: {cls.version}")
+        print(f"Last update: {cls.update_date}")
+    @classmethod
+    def show_citation(cls):
+        """
+        Print the citation text for RiboParser.
+        """
+        print("Please cite:")
+        print(cls.citation)
+    @classmethod
+    def check_dependencies(cls):
+        """
+        Check that every distribution in ``required_packages`` resolves.
+        Returns
+        -------
+        bool
+            True if all distributions were found, False otherwise.
+        """
+        missing = []
+        for pkg in cls.required_packages:
+            try:
+                pkg_resources.get_distribution(pkg)
+            except pkg_resources.ResolutionError:
+                # ResolutionError is the shared base class of
+                # DistributionNotFound and VersionConflict, so a conflicting
+                # install is reported here instead of aborting the check.
+                missing.append(pkg)
+        if missing:
+            print(f"Missing dependencies: {', '.join(missing)}")
+            return False
+        print(cls.required_packages)
+        print("All required dependencies are installed.")
+        return True
+    @classmethod
+    def check_package_modules(cls, module_type: str = "all"):
+        """
+        List the project modules by group and report whether each one imports.
+        The project root is searched upwards from this file. It is added to
+        ``sys.path`` if absent, and every listed module is really imported,
+        so module top-level code is executed.
+        Parameters
+        ----------
+        module_type : str
+            "all" to show every group, or one of "ribo", "serp", "smorf",
+            "scripts". Any other value prints an empty group.
+        """
+        from pathlib import Path
+        import sys
+        import importlib
+        script_path = Path(__file__).resolve()
+        # Walk up at most 10 levels until a directory holding one of the
+        # marker entries is found, or the filesystem root is reached.
+        markers = ("pyproject.toml", "README.md", ".git", "utils", "scripts")
+        root = script_path.parent
+        for _ in range(10):
+            if any((root / name).exists() for name in markers):
+                break
+            if root.parent == root:
+                break
+            root = root.parent
+        # Make local modules importable
+        if str(root) not in sys.path:
+            sys.path.insert(0, str(root))
+        modules = {
+            "ribo": [],
+            "serp": [],
+            "smorf": [],
+            "scripts": []
+        }
+        def add_module(p: Path):
+            # Skip private files; this also covers "__init__.py".
+            if p.name.startswith("_"):
+                return
+            rel = p.relative_to(root)
+            parts = rel.parts
+            mod = ".".join(rel.with_suffix("").parts)
+            stem = p.stem
+            # The first matching rule wins, so the order of the tests matters.
+            if "smorf" in parts or stem.startswith("smorf_"):
+                modules["smorf"].append(mod)
+            elif "serp" in parts or stem.startswith("serp_"):
+                modules["serp"].append(mod)
+            elif "ribo" in parts or stem.startswith(("rpf_", "rna_")):
+                modules["ribo"].append(mod)
+            elif "scripts" in parts:
+                modules["scripts"].append(mod)
+        for source_dir in (root / "utils", root / "scripts"):
+            if source_dir.exists():
+                for p in source_dir.rglob("*.py"):
+                    add_module(p)
+        for key in modules:
+            modules[key] = sorted(set(modules[key]))
+        def import_error(module_name: str) -> str:
+            # Return "" on success, otherwise a short reason for the failure,
+            # so the report explains why a module could not be imported.
+            try:
+                importlib.import_module(module_name)
+            except Exception as exc:
+                return f"{type(exc).__name__}: {exc}"
+            return ""
+        show_keys = modules.keys() if module_type == "all" else [module_type]
+        for key in show_keys:
+            print(f"{key} modules:")
+            if not modules.get(key):
+                print(" - (not found)")
+                continue
+            for mod in modules[key]:
+                reason = import_error(mod)
+                if reason:
+                    print(f" - {mod} [import FAILED] ({reason})")
+                else:
+                    print(f" - {mod} [import OK]")

riboparser-0.2.3/utils/smorf/overlap.py ADDED Viewed

@@ -0,0 +1,234 @@
+# Author: Rensc
+# date: 2026-05-21
+"""
+ORF overlap marker.
+This module marks ORF overlap types and assigns priority labels.
+Priority rule:
+1. annotated_mORF is preferred.
+2. complete ORF is preferred over partial ORF.
+3. ATG start codon is preferred over non-ATG start codons.
+4. Stronger Kozak context is preferred.
+5. Longer ORF is preferred.
+6. More upstream start site is preferred.
+Important:
+Different-frame overlapping ORFs are not suppressed by default.
+"""
+from typing import List, Dict, Tuple
+from .models import ORFRecord
+# Start codon -> rank, numbered from 1 in the order listed below.
+START_CODON_RANK = {
+    codon: rank
+    for rank, codon in enumerate(
+        ("ATG", "CTG", "GTG", "TTG", "ACG", "ATA", "ATT", "ATC"),
+        start=1,
+    )
+}
+class ORFOverlapMarker:
+    """
+    Mark ORF overlap type and priority.
+    All methods are static. Records are updated in place through their
+    ``overlap_type`` and ``priority`` attributes; nothing is removed.
+    """
+    @staticmethod
+    def mark(records: List[ORFRecord]) -> None:
+        """
+        Mark ORF overlap relationships.
+        Different-frame overlaps are marked first. The same-frame pass runs
+        second and overwrites ``overlap_type`` of every downgraded ORF.
+        Parameters
+        ----------
+        records : list
+            List of ORFRecord objects.
+        """
+        ORFOverlapMarker._mark_different_frame_overlap(records)
+        ORFOverlapMarker._mark_same_frame_overlap(records)
+    @staticmethod
+    def _mark_same_frame_overlap(records: List[ORFRecord]) -> None:
+        """
+        Mark overlaps among ORFs with the same transcript, strand, and frame.
+        Within each group an ORF that wins against all ORFs it overlaps
+        becomes "primary"; every other overlapping ORF is downgraded.
+        ORFs without any overlap in their group are left untouched.
+        """
+        grouped: Dict[Tuple[str, str, int], List[ORFRecord]] = {}
+        for rec in records:
+            key = (rec.transcript_id, rec.source_strand, rec.frame)
+            grouped.setdefault(key, []).append(rec)
+        for items in grouped.values():
+            # The order fixes which ORF wins when priority keys are equal.
+            items.sort(key=lambda x: (x.tx_orf_start, x.tx_orf_end))
+            for rec in items:
+                competitors = [
+                    other for other in items
+                    if other is not rec
+                    and ORFOverlapMarker._is_overlap(rec, other)
+                ]
+                if not competitors:
+                    continue
+                best = ORFOverlapMarker._select_best_orf([rec] + competitors)
+                if rec is not best:
+                    ORFOverlapMarker._downgrade_orf(rec, best)
+                    continue
+                # Keep a label assigned by the different-frame pass.
+                if rec.overlap_type == "none":
+                    rec.overlap_type = "same_frame_overlap"
+                rec.priority = "primary"
+    @staticmethod
+    def _mark_different_frame_overlap(records: List[ORFRecord]) -> None:
+        """
+        Mark different-frame overlaps without suppressing either ORF.
+        Only ``overlap_type`` is written, and only while it is still "none".
+        """
+        grouped: Dict[Tuple[str, str], List[ORFRecord]] = {}
+        for rec in records:
+            key = (rec.transcript_id, rec.source_strand)
+            grouped.setdefault(key, []).append(rec)
+        for items in grouped.values():
+            # Visit every unordered pair exactly once.
+            for i, rec in enumerate(items):
+                for other in items[i + 1:]:
+                    if rec.frame == other.frame:
+                        continue
+                    if not ORFOverlapMarker._is_overlap(rec, other):
+                        continue
+                    if rec.overlap_type == "none":
+                        rec.overlap_type = "overlap_different_frame"
+                    if other.overlap_type == "none":
+                        other.overlap_type = "overlap_different_frame"
+    @staticmethod
+    def _downgrade_orf(rec: ORFRecord, best: ORFRecord) -> None:
+        """
+        Downgrade an ORF according to its relationship with the selected best ORF.
+        Sets ``priority`` to "secondary" and replaces ``overlap_type`` with
+        the first matching relationship: identical, nested, same stop,
+        same start, or a generic same-frame overlap.
+        """
+        rec.priority = "secondary"
+        if ORFOverlapMarker._is_identical(rec, best):
+            rec.overlap_type = "identical_ORF"
+        elif ORFOverlapMarker._is_nested(rec, best):
+            if rec.start_codon != "ATG" and best.start_codon == "ATG":
+                rec.overlap_type = "secondary_noncanonical_start"
+            else:
+                rec.overlap_type = "nested"
+        elif rec.tx_orf_end == best.tx_orf_end:
+            if rec.start_codon != best.start_codon:
+                rec.overlap_type = "alternative_start_same_stop"
+            else:
+                rec.overlap_type = "same_stop_overlap"
+        elif rec.tx_orf_start == best.tx_orf_start:
+            rec.overlap_type = "same_start_different_stop"
+        else:
+            rec.overlap_type = "same_frame_overlap_different_stop"
+    @staticmethod
+    def _select_best_orf(records: List[ORFRecord]) -> ORFRecord:
+        """
+        Select the most reliable ORF from overlapping ORFs.
+        On equal priority keys the earliest record in ``records`` wins.
+        """
+        # min() returns the first minimal item, the same element a stable
+        # sort would place first, without sorting the whole list.
+        return min(records, key=ORFOverlapMarker._priority_key)
+    @staticmethod
+    def _priority_key(rec: ORFRecord):
+        """
+        Build sorting key for ORF priority.
+        Lower value means higher priority.
+        """
+        annotated_rank = 0 if rec.category == "annotated_mORF" else 1
+        completeness_rank = 0 if rec.completeness == "complete" else 1
+        # Start codons missing from the table rank after all listed ones.
+        start_rank = START_CODON_RANK.get(rec.start_codon, 99)
+        # Negated so that a higher Kozak score sorts first.
+        kozak_rank = -ORFOverlapMarker._kozak_score(rec.kozak_seq)
+        # Longer ORFs are preferred after biological confidence rules.
+        length_rank = -rec.aa_length
+        # More upstream start site is preferred if all other ranks are equal.
+        start_position_rank = rec.tx_orf_start
+        return (
+            annotated_rank,
+            completeness_rank,
+            start_rank,
+            kozak_rank,
+            length_rank,
+            start_position_rank,
+        )
+    @staticmethod
+    def _kozak_score(kozak_seq: str, upstream_len: int = 6) -> int:
+        """
+        Calculate a simple Kozak score.
+        Rule:
+        - Position -3 is A/G: +1
+        - Position +4 is G: +1
+        The input sequence is expected to contain:
+        upstream sequence + start codon + downstream sequence.
+        Parameters
+        ----------
+        kozak_seq : str
+            Sequence context around the start codon. Empty or None gives 0.
+        upstream_len : int
+            Number of nucleotides before the start codon. The default of 6
+            keeps the previous fixed behaviour.
+        Returns
+        -------
+        int
+            Score between 0 and 2.
+        """
+        if not kozak_seq:
+            return 0
+        seq = kozak_seq.upper()
+        score = 0
+        # The start codon begins right after the upstream part when the
+        # sequence is long enough to hold upstream + start codon. Shorter
+        # input falls back to a position estimated from the middle.
+        # NOTE(review): _priority_key still calls this with the default, so
+        # records scanned with another upstream length are scored as if it
+        # were 6 - confirm how the scanner builds kozak_seq.
+        if len(seq) >= upstream_len + 3:
+            start_index = upstream_len
+        else:
+            start_index = max(0, len(seq) // 2 - 1)
+        minus3_index = start_index - 3
+        plus4_index = start_index + 3
+        if 0 <= minus3_index < len(seq) and seq[minus3_index] in {"A", "G"}:
+            score += 1
+        if 0 <= plus4_index < len(seq) and seq[plus4_index] == "G":
+            score += 1
+        return score
+    @staticmethod
+    def _is_overlap(a: ORFRecord, b: ORFRecord) -> bool:
+        """
+        Check whether two ORFs overlap in transcript coordinates.
+        Intervals that only touch at an end point do not count as overlap.
+        """
+        return a.tx_orf_start < b.tx_orf_end and a.tx_orf_end > b.tx_orf_start
+    @staticmethod
+    def _is_nested(a: ORFRecord, b: ORFRecord) -> bool:
+        """
+        Check whether ORF a is fully contained within ORF b.
+        Identical coordinates also count as contained.
+        """
+        return b.tx_orf_start <= a.tx_orf_start and b.tx_orf_end >= a.tx_orf_end
+    @staticmethod
+    def _is_identical(a: ORFRecord, b: ORFRecord) -> bool:
+        """
+        Check whether two ORFs have identical transcript coordinates.
+        """
+        return a.tx_orf_start == b.tx_orf_start and a.tx_orf_end == b.tx_orf_end

riboparser-0.2.3/utils/smorf/pipeline.py ADDED Viewed

@@ -0,0 +1,287 @@
+# Author: Rensc
+# date: 2026-05-21
+"""
+Main smORF pipeline.
+This pipeline connects all functional modules:
+1. Read genome FASTA.
+2. Read genePred annotation.
+3. Reconstruct transcript sequences.
+4. Scan ORFs.
+5. Classify ORFs.
+6. Mark ORF overlaps.
+7. Write output files.
+Parallel mode uses multiprocessing instead of threading because ORF scanning
+is CPU-intensive and Python threads are limited by the GIL.
+"""
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from .fasta import FastaParser
+from .genepred import GenePredParser
+from .coordinate import CoordinateMapper
+from .scanner import ORFScanner
+from .classifier import ORFClassifier
+from .overlap import ORFOverlapMarker
+from .writer import GenePredWriter, MessageWriter, FastaWriter
+# Per-process state, filled in by _init_worker() and read by
+# _scan_transcript_worker(). Both stay None until a worker is initialized.
+_WORKER_GENOME = _WORKER_CONFIG = None
+def _init_worker(genome, config):
+    """
+    Store the genome and the scan settings in module-level globals.
+    Passed as the pool initializer so that each task only carries its
+    transcript instead of the whole genome dictionary.
+    Parameters
+    ----------
+    genome : object
+        Genome handed to CoordinateMapper by the worker function.
+    config : dict
+        Scan settings read by the worker function.
+    """
+    global _WORKER_GENOME, _WORKER_CONFIG
+    _WORKER_GENOME, _WORKER_CONFIG = genome, config
+def _scan_transcript_worker(task):
+    """
+    Scan a single transcript inside a worker process.
+    Reads the genome and settings stored by _init_worker().
+    Parameters
+    ----------
+    task : tuple
+        Pair of transcript index and Transcript object.
+    Returns
+    -------
+    tuple
+        Transcript index, transcript ID, gene ID, and ORF records.
+    """
+    tx_index, transcript = task
+    cfg = _WORKER_CONFIG
+    # Forward the scanner settings under their own names.
+    scanner_options = {
+        name: cfg[name]
+        for name in (
+            "start_codons",
+            "min_aa",
+            "max_aa",
+            "scan_strand",
+            "kozak_up",
+            "kozak_down",
+            "include_stop",
+        )
+    }
+    scanner = ORFScanner(**scanner_options)
+    # The spliced transcript sequence must exist before scanning.
+    CoordinateMapper.build_transcript_sequence(transcript, _WORKER_GENOME)
+    found = scanner.scan_transcript(transcript)
+    # Category labels are assigned before the optional overlap marking.
+    ORFClassifier.classify(transcript, found)
+    if cfg["mark_overlap"]:
+        ORFOverlapMarker.mark(found)
+    # Optionally drop records whose priority is "discarded".
+    # NOTE(review): which step assigns "discarded" is not visible here - confirm.
+    if cfg["remove_discarded"]:
+        found = [orf for orf in found if orf.priority != "discarded"]
+    return tx_index, transcript.transcript_id, transcript.gene_id, found
+class SmORFPipeline:
+    """
+    High-level pipeline for transcript-centric smORF detection.
+    Scanned records are collected in ``self.records`` and written to four
+    files named after ``out_prefix``.
+    """
+    def __init__(
+        self,
+        genome: str,
+        annotation: str,
+        out_prefix: str = "ORF",
+        orf_prefix: str = "ORF",
+        start_codons: str = "ATG",
+        min_aa: int = 8,
+        max_aa: int = 10000,
+        scan_strand: str = "sense",
+        kozak_up: int = 6,
+        kozak_down: int = 6,
+        mark_overlap: bool = False,
+        remove_discarded: bool = False,
+        include_stop: bool = False,
+        threads: int = 1,
+    ):
+        """
+        Initialize smORF pipeline.
+        Parameters
+        ----------
+        genome : str
+            Path passed to FastaParser.read_fasta.
+        annotation : str
+            Path passed to GenePredParser.read_genepred.
+        out_prefix : str
+            Prefix of the output file names.
+        orf_prefix : str
+            Prefix of the generated ORF IDs.
+        start_codons : str
+            Comma-separated start codons; case and spaces are normalized.
+        threads : int
+            Number of worker processes; values below 1 are raised to 1.
+        The remaining arguments are stored unchanged and forwarded to
+        ORFScanner or used as switches for the optional steps.
+        """
+        self.genome_path = genome
+        self.annotation_path = annotation
+        self.out_prefix = out_prefix
+        self.orf_prefix = orf_prefix
+        # Drop empty entries so that input such as "ATG," or "ATG,,CTG"
+        # does not hand an empty start codon to the scanner.
+        cleaned = (x.strip().upper() for x in start_codons.split(","))
+        self.start_codons = [codon for codon in cleaned if codon]
+        self.min_aa = min_aa
+        self.max_aa = max_aa
+        self.scan_strand = scan_strand
+        self.kozak_up = kozak_up
+        self.kozak_down = kozak_down
+        self.mark_overlap = mark_overlap
+        self.remove_discarded = remove_discarded
+        self.include_stop = include_stop
+        self.threads = max(1, int(threads))
+        self.records = []
+    def run(self) -> None:
+        """
+        Run the complete smORF scanning pipeline.
+        Reads the inputs, scans all transcripts in single- or multi-process
+        mode depending on ``threads``, and writes the output files.
+        """
+        # Start from an empty list so that calling run() again does not
+        # append a second copy of every record.
+        self.records = []
+        genome = FastaParser.read_fasta(self.genome_path)
+        transcripts = GenePredParser.read_genepred(self.annotation_path)
+        if self.threads == 1:
+            self._run_single_process(genome, transcripts)
+        else:
+            self._run_multi_process(genome, transcripts)
+        self.write_outputs()
+    def _assign_orf_ids(self, tx_records, next_index: int) -> int:
+        """
+        Number the records consecutively and return the next free index.
+        IDs are ``orf_prefix`` followed by the index zero-padded to 8 digits.
+        """
+        for rec in tx_records:
+            rec.orf_id = "{}{:08d}".format(self.orf_prefix, next_index)
+            next_index += 1
+        return next_index
+    def _run_single_process(self, genome, transcripts) -> None:
+        """
+        Run smORF scanning in single-process mode.
+        Transcripts are handled in input order and a progress line is
+        printed before each one.
+        """
+        scanner = ORFScanner(
+            start_codons=self.start_codons,
+            min_aa=self.min_aa,
+            max_aa=self.max_aa,
+            scan_strand=self.scan_strand,
+            kozak_up=self.kozak_up,
+            kozak_down=self.kozak_down,
+            include_stop=self.include_stop,
+        )
+        total_tx = len(transcripts)
+        orf_index = 1
+        for idx, tx in enumerate(transcripts, start=1):
+            # Print scanning progress.
+            print(
+                "[smORFScanner] [{}/{}] Scanning gene={}, transcript={}, chrom={}, strand={}".format(
+                    idx,
+                    total_tx,
+                    tx.gene_id,
+                    tx.transcript_id,
+                    tx.chrom,
+                    tx.strand,
+                ),
+                flush=True,
+            )
+            # Reconstruct spliced transcript sequence before ORF scanning.
+            CoordinateMapper.build_transcript_sequence(tx, genome)
+            # Scan candidate ORFs from the transcript sequence.
+            tx_records = scanner.scan_transcript(tx)
+            # Assign ORF category labels.
+            ORFClassifier.classify(tx, tx_records)
+            # Mark nested or overlapping ORFs if requested.
+            if self.mark_overlap:
+                ORFOverlapMarker.mark(tx_records)
+            # Drop records whose priority is "discarded" if requested.
+            # NOTE(review): which step assigns "discarded" is not visible
+            # here - confirm.
+            if self.remove_discarded:
+                tx_records = [x for x in tx_records if x.priority != "discarded"]
+            # Assign stable ORF IDs.
+            orf_index = self._assign_orf_ids(tx_records, orf_index)
+            self.records.extend(tx_records)
+    def _run_multi_process(self, genome, transcripts) -> None:
+        """
+        Run smORF scanning in multiprocessing mode.
+        One task per transcript is submitted to a process pool. Results
+        arrive in completion order and are put back into input order
+        before ORF IDs are assigned.
+        """
+        total_tx = len(transcripts)
+        config = {
+            "start_codons": self.start_codons,
+            "min_aa": self.min_aa,
+            "max_aa": self.max_aa,
+            "scan_strand": self.scan_strand,
+            "kozak_up": self.kozak_up,
+            "kozak_down": self.kozak_down,
+            "include_stop": self.include_stop,
+            "mark_overlap": self.mark_overlap,
+            "remove_discarded": self.remove_discarded,
+        }
+        print(
+            "[smORFScanner] Running in multiprocessing mode with {} workers.".format(
+                self.threads
+            ),
+            flush=True,
+        )
+        results_by_index = {}
+        with ProcessPoolExecutor(
+            max_workers=self.threads,
+            initializer=_init_worker,
+            initargs=(genome, config),
+        ) as executor:
+            futures = [
+                executor.submit(_scan_transcript_worker, (idx, tx))
+                for idx, tx in enumerate(transcripts, start=1)
+            ]
+            # An exception raised in a worker is re-raised by result().
+            for finished, future in enumerate(as_completed(futures), start=1):
+                idx, transcript_id, gene_id, tx_records = future.result()
+                results_by_index[idx] = tx_records
+                # Print completed transcript progress.
+                print(
+                    "[smORFScanner] [{}/{}] Finished gene={}, transcript={}, ORFs={}".format(
+                        finished,
+                        total_tx,
+                        gene_id,
+                        transcript_id,
+                        len(tx_records),
+                    ),
+                    flush=True,
+                )
+        # Rebuild records in original transcript order and assign stable ORF IDs.
+        orf_index = 1
+        for idx in range(1, total_tx + 1):
+            tx_records = results_by_index.get(idx, [])
+            orf_index = self._assign_orf_ids(tx_records, orf_index)
+            self.records.extend(tx_records)
+    def write_outputs(self) -> None:
+        """
+        Write all output files.
+        Produces ``<out_prefix>.genePred``, ``.message.txt``, ``.nt.fa``
+        and ``.pep.fa`` from ``self.records``.
+        """
+        GenePredWriter.write("{}.genePred".format(self.out_prefix), self.records)
+        MessageWriter.write("{}.message.txt".format(self.out_prefix), self.records)
+        FastaWriter.write_nt("{}.nt.fa".format(self.out_prefix), self.records)
+        FastaWriter.write_pep("{}.pep.fa".format(self.out_prefix), self.records)

RiboParser 0.2.1__tar.gz → 0.2.3__tar.gz

RiboParser 0.2.1__tar.gz → 0.2.3__tar.gz