RiboParser 0.2.1__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. {riboparser-0.2.1 → riboparser-0.2.3}/PKG-INFO +1 -1
  2. {riboparser-0.2.1 → riboparser-0.2.3}/RiboParser.egg-info/PKG-INFO +1 -1
  3. {riboparser-0.2.1 → riboparser-0.2.3}/RiboParser.egg-info/SOURCES.txt +0 -2
  4. {riboparser-0.2.1 → riboparser-0.2.3}/pyproject.toml +1 -1
  5. riboparser-0.2.3/utils/data/RiboParser.py +135 -0
  6. riboparser-0.2.3/utils/smorf/overlap.py +234 -0
  7. riboparser-0.2.3/utils/smorf/pipeline.py +287 -0
  8. {riboparser-0.2.1 → riboparser-0.2.3}/utils/smorf_scanner.py +9 -0
  9. riboparser-0.2.1/utils/data/RiboParser.py +0 -184
  10. riboparser-0.2.1/utils/make_ensb_ref.py +0 -308
  11. riboparser-0.2.1/utils/make_ribo_ref.py +0 -39
  12. riboparser-0.2.1/utils/smorf/overlap.py +0 -76
  13. riboparser-0.2.1/utils/smorf/pipeline.py +0 -158
  14. {riboparser-0.2.1 → riboparser-0.2.3}/README.md +0 -0
  15. {riboparser-0.2.1 → riboparser-0.2.3}/RiboParser.egg-info/dependency_links.txt +0 -0
  16. {riboparser-0.2.1 → riboparser-0.2.3}/RiboParser.egg-info/entry_points.txt +0 -0
  17. {riboparser-0.2.1 → riboparser-0.2.3}/RiboParser.egg-info/requires.txt +0 -0
  18. {riboparser-0.2.1 → riboparser-0.2.3}/RiboParser.egg-info/top_level.txt +0 -0
  19. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/__init__.py +0 -0
  20. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/bedgraph/__init__.py +0 -0
  21. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/bedgraph/bg2meta.py +0 -0
  22. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/bedgraph/rpm_smooth.py +0 -0
  23. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/bowtie/__init__.py +0 -0
  24. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/bowtie/merge_bwt_log.py +0 -0
  25. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/fasta/__init__.py +0 -0
  26. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/fasta/fa_gc_sum.py +0 -0
  27. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/fasta/fa_len_flt.py +0 -0
  28. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/fasta/fa_len_sum.py +0 -0
  29. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/fasta/fa_split.py +0 -0
  30. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/fasta/line_feed.py +0 -0
  31. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/fasta/nt2aa.py +0 -0
  32. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/fasta/rand_seq.py +0 -0
  33. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/fasta/retrieve_seq.py +0 -0
  34. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/fasta/revs.py +0 -0
  35. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/fastq/__init__.py +0 -0
  36. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/fastq/fq2fa.py +0 -0
  37. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/fastq/fq2txt.py +0 -0
  38. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/fastq/fq_len_flt.py +0 -0
  39. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/fastq/fq_len_sum.py +0 -0
  40. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/fastq/fq_length.py +0 -0
  41. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/fastq/fq_split.py +0 -0
  42. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/fastq/fq_trim.py +0 -0
  43. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/fastq/phred_quality.py +0 -0
  44. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/fastq/simulate_fastq.py +0 -0
  45. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/merge_ribo/__init__.py +0 -0
  46. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/merge_ribo/merge_cdt.py +0 -0
  47. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/merge_ribo/merge_coverage.py +0 -0
  48. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/merge_ribo/merge_cst.py +0 -0
  49. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/merge_ribo/merge_digestion.py +0 -0
  50. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/merge_ribo/merge_dst_list.py +0 -0
  51. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/merge_ribo/merge_length.py +0 -0
  52. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/merge_ribo/merge_metagene.py +0 -0
  53. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/merge_ribo/merge_occupancy.py +0 -0
  54. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/merge_ribo/merge_odd_ratio.py +0 -0
  55. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/merge_ribo/merge_offset.py +0 -0
  56. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/merge_ribo/merge_offset_detail.py +0 -0
  57. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/merge_ribo/merge_offset_end.py +0 -0
  58. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/merge_ribo/merge_pausing.py +0 -0
  59. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/merge_ribo/merge_period.py +0 -0
  60. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/merge_ribo/merge_quant.py +0 -0
  61. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/merge_ribo/merge_saturation.py +0 -0
  62. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/oligo/__init__.py +0 -0
  63. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/oligo/get_overlap_seq.py +0 -0
  64. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/oligo/get_tissue_freq.py +0 -0
  65. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/oligo/get_win_seq.py +0 -0
  66. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/ribocode/__init__.py +0 -0
  67. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/ribocode/ribocode_bed_format.py +0 -0
  68. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/ribotish/__init__.py +0 -0
  69. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/ribotish/ribotish_format.py +0 -0
  70. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/rsem/__init__.py +0 -0
  71. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/rsem/merge_rsem.py +0 -0
  72. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/unix/__init__.py +0 -0
  73. {riboparser-0.2.1 → riboparser-0.2.3}/scripts/unix/dos2unix.py +0 -0
  74. {riboparser-0.2.1 → riboparser-0.2.3}/setup.cfg +0 -0
  75. {riboparser-0.2.1 → riboparser-0.2.3}/utils/__init__.py +0 -0
  76. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/ArgsParser.py +0 -0
  77. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/Bam2Wig.py +0 -0
  78. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/BamFilter.py +0 -0
  79. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/CDT.py +0 -0
  80. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/CST.py +0 -0
  81. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/Codon.py +0 -0
  82. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/Coefficient_of_Variation.py +0 -0
  83. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/Coverage.py +0 -0
  84. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/Cumulative_CoV.py +0 -0
  85. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/Density.py +0 -0
  86. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/Digestion.py +0 -0
  87. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/EndSite.py +0 -0
  88. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/Ensembl_Ref.py +0 -0
  89. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/GenePred.py +0 -0
  90. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/MetaCodon.py +0 -0
  91. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/Metaplot.py +0 -0
  92. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/Occupancy.py +0 -0
  93. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/Odd_Ratio.py +0 -0
  94. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/Offset.py +0 -0
  95. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/Offset_RSBM.py +0 -0
  96. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/Pausing.py +0 -0
  97. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/Percentage.py +0 -0
  98. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/Periodicity.py +0 -0
  99. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/Quality.py +0 -0
  100. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/Quant.py +0 -0
  101. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/RNA.py +0 -0
  102. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/RPFs.py +0 -0
  103. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/Retrieve.py +0 -0
  104. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/Ribo.py +0 -0
  105. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/Shift.py +0 -0
  106. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/Shuffle.py +0 -0
  107. {riboparser-0.2.1 → riboparser-0.2.3}/utils/ribo/__init__.py +0 -0
  108. {riboparser-0.2.1 → riboparser-0.2.3}/utils/riboparser.py +0 -0
  109. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rna_Density.py +0 -0
  110. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rna_Offset.py +0 -0
  111. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Bam2bw.py +0 -0
  112. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Bam_Filter.py +0 -0
  113. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_CDT.py +0 -0
  114. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_CST.py +0 -0
  115. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Check.py +0 -0
  116. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_CoV.py +0 -0
  117. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Corr.py +0 -0
  118. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Coverage.py +0 -0
  119. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Cumulative_CoV.py +0 -0
  120. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Density.py +0 -0
  121. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Digest.py +0 -0
  122. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Geneplot.py +0 -0
  123. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Merge.py +0 -0
  124. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Meta_Codon.py +0 -0
  125. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Metaplot.py +0 -0
  126. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Occupancy.py +0 -0
  127. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Odd_Ratio.py +0 -0
  128. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Offset.py +0 -0
  129. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Offset_RSBM.py +0 -0
  130. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Pausing.py +0 -0
  131. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Percent.py +0 -0
  132. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Periodicity.py +0 -0
  133. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Quant.py +0 -0
  134. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Reference.py +0 -0
  135. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Retrieve.py +0 -0
  136. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Shift.py +0 -0
  137. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_Shuffle.py +0 -0
  138. {riboparser-0.2.1 → riboparser-0.2.3}/utils/rpf_end.py +0 -0
  139. {riboparser-0.2.1 → riboparser-0.2.3}/utils/serp/Properties.py +0 -0
  140. {riboparser-0.2.1 → riboparser-0.2.3}/utils/serp/SeRP.py +0 -0
  141. {riboparser-0.2.1 → riboparser-0.2.3}/utils/serp/__init__.py +0 -0
  142. {riboparser-0.2.1 → riboparser-0.2.3}/utils/serp_overlap.py +0 -0
  143. {riboparser-0.2.1 → riboparser-0.2.3}/utils/serp_peak.py +0 -0
  144. {riboparser-0.2.1 → riboparser-0.2.3}/utils/serp_properties.py +0 -0
  145. {riboparser-0.2.1 → riboparser-0.2.3}/utils/smorf/__init__.py +0 -0
  146. {riboparser-0.2.1 → riboparser-0.2.3}/utils/smorf/classifier.py +0 -0
  147. {riboparser-0.2.1 → riboparser-0.2.3}/utils/smorf/coordinate.py +0 -0
  148. {riboparser-0.2.1 → riboparser-0.2.3}/utils/smorf/fasta.py +0 -0
  149. {riboparser-0.2.1 → riboparser-0.2.3}/utils/smorf/genepred.py +0 -0
  150. {riboparser-0.2.1 → riboparser-0.2.3}/utils/smorf/models.py +0 -0
  151. {riboparser-0.2.1 → riboparser-0.2.3}/utils/smorf/scanner.py +0 -0
  152. {riboparser-0.2.1 → riboparser-0.2.3}/utils/smorf/sequence.py +0 -0
  153. {riboparser-0.2.1 → riboparser-0.2.3}/utils/smorf/writer.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: RiboParser
3
- Version: 0.2.1
3
+ Version: 0.2.3
4
4
  Summary: A pipeline for ribosome profiling data analysis
5
5
  Author-email: Ren Shuchao <rensc0718@163.com>
6
6
  License-Expression: GPL-3.0-or-later
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: RiboParser
3
- Version: 0.2.1
3
+ Version: 0.2.3
4
4
  Summary: A pipeline for ribosome profiling data analysis
5
5
  Author-email: Ren Shuchao <rensc0718@163.com>
6
6
  License-Expression: GPL-3.0-or-later
@@ -62,8 +62,6 @@ scripts/rsem/merge_rsem.py
62
62
  scripts/unix/__init__.py
63
63
  scripts/unix/dos2unix.py
64
64
  utils/__init__.py
65
- utils/make_ensb_ref.py
66
- utils/make_ribo_ref.py
67
65
  utils/riboparser.py
68
66
  utils/rna_Density.py
69
67
  utils/rna_Offset.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "RiboParser"
7
- version = "0.2.1"
7
+ version = "0.2.3"
8
8
  authors = [{ name = "Ren Shuchao", email = "rensc0718@163.com" }]
9
9
  description = "A pipeline for ribosome profiling data analysis"
10
10
  readme = "README.md"
@@ -0,0 +1,135 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ # @Project : riboParser
4
+ # @Script : riboparser.py
5
+
6
+
7
+ import pkg_resources
8
+
9
+
10
class RiboParserInfo:
    """Package metadata and small self-diagnostic helpers for RiboParser.

    Every helper is a classmethod, so the class can be used directly as a
    namespace (``RiboParserInfo.show_version()``) without creating an instance.
    """

    # Resolve the installed distribution version once, when the class is built.
    # importlib.metadata is the standard-library replacement for the deprecated
    # pkg_resources API.
    try:
        from importlib import metadata as _metadata

        version = _metadata.version("RiboParser")
    except Exception:
        # Best effort: an uninstalled source checkout must still be importable.
        version = "unknown"

    update_date = "2026-05-21"
    citation = (
        "\n"
        "Shuchao Ren, Yinan Li, Zhipeng Zhou.\n"
        "RiboParser/RiboShiny: An integrated platform for comprehensive analysis and visualization of ribo-seq data.\n"
        "Journal of Genetics and Genomics (2025)\n"
        "doi:10.1016/j.jgg.2025.04.010.\n"
    )
    # Distribution names (as published on the package index), not import names.
    required_packages = ["pandas", "polars", "numpy", "matplotlib-venn", "seqlogo",
                         "matplotlib", "seaborn", "biopython",
                         "scipy", "scikit-learn", "statsmodels",
                         "pysam", "joblib"]

    @classmethod
    def show_version(cls):
        """Print the package version and the date of the last update."""
        print(f"RiboParser version: {cls.version}")
        print(f"Last update: {cls.update_date}")

    @classmethod
    def show_citation(cls):
        """Print the reference that users are asked to cite."""
        print("Please cite:")
        print(cls.citation)

    @classmethod
    def check_dependencies(cls):
        """Report whether every distribution in ``required_packages`` is installed.

        Returns
        -------
        bool
            True when all distributions are found, False otherwise. The
            outcome is also printed.
        """
        from importlib import metadata

        missing = []
        for pkg in cls.required_packages:
            try:
                metadata.distribution(pkg)
            except metadata.PackageNotFoundError:
                missing.append(pkg)

        if missing:
            print(f"Missing dependencies: {', '.join(missing)}")
            return False

        print(cls.required_packages)
        print("All required dependencies are installed.")
        return True

    @classmethod
    def check_package_modules(cls, module_type: str = "all"):
        """List the package's modules by group and report whether each imports.

        Parameters
        ----------
        module_type : str
            "all" to show every group, or one of "ribo", "serp", "smorf",
            "scripts". Any other value prints a single "(not found)" entry.
        """
        from pathlib import Path
        import sys
        import importlib

        script_path = Path(__file__).resolve()

        # Walk upwards (at most 10 levels) until a directory that contains one
        # of the marker entries is found; that directory is treated as the root.
        markers = ("pyproject.toml", "README.md", ".git", "utils", "scripts")
        root = script_path.parent
        for _ in range(10):
            if any((root / name).exists() for name in markers):
                break
            if root.parent == root:
                break
            root = root.parent

        # Make the discovered modules importable by their dotted path.
        if str(root) not in sys.path:
            sys.path.insert(0, str(root))

        modules = {
            "ribo": [],
            "serp": [],
            "smorf": [],
            "scripts": []
        }

        def add_module(p: Path):
            # Skip private files; this also covers __init__.py.
            if p.name.startswith("_"):
                return

            rel = p.relative_to(root)
            parts = rel.parts
            mod = ".".join(rel.with_suffix("").parts)
            stem = p.stem

            # First matching rule wins, so the order of the checks matters.
            if "smorf" in parts or stem.startswith("smorf_"):
                modules["smorf"].append(mod)
            elif "serp" in parts or stem.startswith("serp_"):
                modules["serp"].append(mod)
            elif "ribo" in parts or stem.startswith(("rpf_", "rna_")):
                modules["ribo"].append(mod)
            elif "scripts" in parts:
                modules["scripts"].append(mod)

        for base_dir in (root / "utils", root / "scripts"):
            if base_dir.exists():
                for p in base_dir.rglob("*.py"):
                    add_module(p)

        for key in modules:
            modules[key] = sorted(set(modules[key]))

        def try_import(module_name: str) -> bool:
            try:
                importlib.import_module(module_name)
            except (Exception, SystemExit):
                # Importing executes module-level code; a script that exits at
                # import time must count as a failed import instead of
                # terminating the whole report.
                return False
            return True

        show_keys = modules.keys() if module_type == "all" else [module_type]

        for key in show_keys:
            print(f"{key} modules:")
            if modules.get(key):
                for mod in modules[key]:
                    status = "[import OK]" if try_import(mod) else "[import FAILED]"
                    print(f" - {mod} {status}")
            else:
                print(" - (not found)")
@@ -0,0 +1,234 @@
1
+ # Author: Rensc
2
+ # date: 2026-05-21
3
+
4
+ """
5
+ ORF overlap marker.
6
+
7
+ This module marks ORF overlap types and assigns priority labels.
8
+
9
+ Priority rule:
10
+ 1. annotated_mORF is preferred.
11
+ 2. complete ORF is preferred over partial ORF.
12
+ 3. ATG start codon is preferred over non-ATG start codons.
13
+ 4. Stronger Kozak context is preferred.
14
+ 5. Longer ORF is preferred.
15
+ 6. More upstream start site is preferred.
16
+
17
+ Important:
18
+ Different-frame overlapping ORFs are not suppressed by default.
19
+ """
20
+
21
+ from typing import List, Dict, Tuple
22
+ from .models import ORFRecord
23
+
24
+
25
# Start codons listed from most to least preferred; the rank is the 1-based
# position in this tuple, so a lower rank means a higher priority.
START_CODON_RANK = {
    codon: rank
    for rank, codon in enumerate(
        ("ATG", "CTG", "GTG", "TTG", "ACG", "ATA", "ATT", "ATC"),
        start=1,
    )
}
35
+
36
+
37
class ORFOverlapMarker:
    """
    Mark ORF overlap type and priority.

    All methods are static and update the ``overlap_type`` and ``priority``
    attributes of the records they are given in place.
    """

    @staticmethod
    def mark(records: List["ORFRecord"]) -> None:
        """
        Mark ORF overlap relationships.

        Different-frame overlaps are labelled first; same-frame overlaps are
        processed afterwards and may overwrite the label of a downgraded ORF.

        Parameters
        ----------
        records : list
            List of ORFRecord objects.
        """

        ORFOverlapMarker._mark_different_frame_overlap(records)
        ORFOverlapMarker._mark_same_frame_overlap(records)

    @staticmethod
    def _mark_same_frame_overlap(records: List["ORFRecord"]) -> None:
        """
        Mark overlaps among ORFs with the same transcript, strand, and frame.

        For every ORF that overlaps at least one other ORF of its group, the
        best ORF among it and its competitors is selected. The best ORF
        becomes "primary"; every other ORF is downgraded relative to it.
        """

        grouped: Dict[Tuple[str, str, int], List["ORFRecord"]] = {}

        for rec in records:
            key = (rec.transcript_id, rec.source_strand, rec.frame)
            grouped.setdefault(key, []).append(rec)

        for items in grouped.values():
            # Sorting fixes the candidate order, and therefore the winner
            # when two ORFs have equal priority keys.
            items.sort(key=lambda x: (x.tx_orf_start, x.tx_orf_end))

            for rec in items:
                competitors = [
                    other for other in items
                    if other is not rec
                    and ORFOverlapMarker._is_overlap(rec, other)
                ]

                if not competitors:
                    continue

                best = ORFOverlapMarker._select_best_orf([rec] + competitors)

                if rec is best:
                    # Keep a label that an earlier step already assigned.
                    if rec.overlap_type == "none":
                        rec.overlap_type = "same_frame_overlap"
                    rec.priority = "primary"
                else:
                    ORFOverlapMarker._downgrade_orf(rec, best)

    @staticmethod
    def _mark_different_frame_overlap(records: List["ORFRecord"]) -> None:
        """
        Mark different-frame overlaps without suppressing either ORF.

        Only ``overlap_type`` is touched, and only while it is still "none".
        """

        grouped: Dict[Tuple[str, str], List["ORFRecord"]] = {}

        for rec in records:
            key = (rec.transcript_id, rec.source_strand)
            grouped.setdefault(key, []).append(rec)

        for items in grouped.values():
            # Visit each unordered pair exactly once.
            for i, rec in enumerate(items):
                for other in items[i + 1:]:
                    if rec.frame == other.frame:
                        continue

                    if ORFOverlapMarker._is_overlap(rec, other):
                        if rec.overlap_type == "none":
                            rec.overlap_type = "overlap_different_frame"
                        if other.overlap_type == "none":
                            other.overlap_type = "overlap_different_frame"

    @staticmethod
    def _downgrade_orf(rec: "ORFRecord", best: "ORFRecord") -> None:
        """
        Downgrade an ORF according to its relationship with the selected best ORF.

        Sets ``rec.priority`` to "secondary" and overwrites ``rec.overlap_type``
        with the first matching relationship: identical, nested, same stop,
        same start, or any other same-frame overlap.
        """

        rec.priority = "secondary"

        if ORFOverlapMarker._is_identical(rec, best):
            rec.overlap_type = "identical_ORF"
        elif ORFOverlapMarker._is_nested(rec, best):
            if rec.start_codon != "ATG" and best.start_codon == "ATG":
                rec.overlap_type = "secondary_noncanonical_start"
            else:
                rec.overlap_type = "nested"
        elif rec.tx_orf_end == best.tx_orf_end:
            if rec.start_codon != best.start_codon:
                rec.overlap_type = "alternative_start_same_stop"
            else:
                rec.overlap_type = "same_stop_overlap"
        elif rec.tx_orf_start == best.tx_orf_start:
            rec.overlap_type = "same_start_different_stop"
        else:
            rec.overlap_type = "same_frame_overlap_different_stop"

    @staticmethod
    def _select_best_orf(records: List["ORFRecord"]) -> "ORFRecord":
        """
        Select the most reliable ORF from overlapping ORFs.

        Returns the record with the smallest priority key; on ties the record
        that appears first in ``records`` wins.
        """

        return min(records, key=ORFOverlapMarker._priority_key)

    @staticmethod
    def _priority_key(rec: "ORFRecord"):
        """
        Build sorting key for ORF priority.

        Lower value means higher priority.
        """

        annotated_rank = 0 if rec.category == "annotated_mORF" else 1
        completeness_rank = 0 if rec.completeness == "complete" else 1
        # Start codons missing from the table rank after every listed codon.
        start_rank = START_CODON_RANK.get(rec.start_codon, 99)
        kozak_rank = -ORFOverlapMarker._kozak_score(rec.kozak_seq)

        # Longer ORFs are preferred after biological confidence rules.
        length_rank = -rec.aa_length

        # More upstream start site is preferred if all other ranks are equal.
        start_position_rank = rec.tx_orf_start

        return (
            annotated_rank,
            completeness_rank,
            start_rank,
            kozak_rank,
            length_rank,
            start_position_rank,
        )

    @staticmethod
    def _kozak_score(kozak_seq: str, upstream_len: int = 6) -> int:
        """
        Calculate a simple Kozak score.

        Rule:
        - Position -3 is A/G: +1
        - Position +4 is G: +1

        The input sequence is expected to contain:
        upstream sequence + start codon + downstream sequence.

        Parameters
        ----------
        kozak_seq : str
            Kozak context sequence; an empty or missing value scores 0.
        upstream_len : int
            Number of nucleotides that precede the start codon in
            ``kozak_seq``. The default of 6 matches the default scanner
            setting; pass the value used for extraction when it differs.

        Returns
        -------
        int
            Score between 0 and 2.
        """

        if not kozak_seq:
            return 0

        seq = kozak_seq.upper()
        score = 0

        # With a full context the start codon begins right after the upstream
        # part. For a shorter sequence the position is only estimated.
        if len(seq) >= upstream_len + 3:
            start_index = upstream_len
        else:
            start_index = max(0, len(seq) // 2 - 1)

        minus3_index = start_index - 3
        plus4_index = start_index + 3

        if 0 <= minus3_index < len(seq) and seq[minus3_index] in {"A", "G"}:
            score += 1

        if 0 <= plus4_index < len(seq) and seq[plus4_index] == "G":
            score += 1

        return score

    @staticmethod
    def _is_overlap(a: "ORFRecord", b: "ORFRecord") -> bool:
        """
        Check whether two ORFs overlap in transcript coordinates.

        The comparison is strict, so intervals that merely touch do not overlap.
        """

        return a.tx_orf_start < b.tx_orf_end and a.tx_orf_end > b.tx_orf_start

    @staticmethod
    def _is_nested(a: "ORFRecord", b: "ORFRecord") -> bool:
        """
        Check whether ORF a is fully contained within ORF b.

        Identical coordinates also count as nested.
        """

        return b.tx_orf_start <= a.tx_orf_start and b.tx_orf_end >= a.tx_orf_end

    @staticmethod
    def _is_identical(a: "ORFRecord", b: "ORFRecord") -> bool:
        """
        Check whether two ORFs have identical transcript coordinates.
        """

        return a.tx_orf_start == b.tx_orf_start and a.tx_orf_end == b.tx_orf_end
@@ -0,0 +1,287 @@
1
+ # Author: Rensc
2
+ # date: 2026-05-21
3
+
4
+ """
5
+ Main smORF pipeline.
6
+
7
+ This pipeline connects all functional modules:
8
+ 1. Read genome FASTA.
9
+ 2. Read genePred annotation.
10
+ 3. Reconstruct transcript sequences.
11
+ 4. Scan ORFs.
12
+ 5. Classify ORFs.
13
+ 6. Mark ORF overlaps.
14
+ 7. Write output files.
15
+
16
+ Parallel mode uses multiprocessing instead of threading because ORF scanning
17
+ is CPU-intensive and Python threads are limited by the GIL.
18
+ """
19
+
20
+ from concurrent.futures import ProcessPoolExecutor, as_completed
21
+
22
+ from .fasta import FastaParser
23
+ from .genepred import GenePredParser
24
+ from .coordinate import CoordinateMapper
25
+ from .scanner import ORFScanner
26
+ from .classifier import ORFClassifier
27
+ from .overlap import ORFOverlapMarker
28
+ from .writer import GenePredWriter, MessageWriter, FastaWriter
29
+
30
+
31
# Per-process state for pool workers; both names stay None until
# _init_worker() assigns them.
_WORKER_GENOME = _WORKER_CONFIG = None
33
+
34
+
35
def _init_worker(genome, config):
    """
    Store the shared inputs in module-level globals of the worker process.

    Used as the pool initializer, so the genome and the configuration are
    handed to each worker once instead of with every transcript task.

    Parameters
    ----------
    genome
        Genome object later passed to CoordinateMapper.build_transcript_sequence.
    config
        Mapping of scanner and filtering options read by _scan_transcript_worker.
    """

    global _WORKER_GENOME, _WORKER_CONFIG

    _WORKER_GENOME, _WORKER_CONFIG = genome, config
47
+
48
+
49
def _scan_transcript_worker(task):
    """
    Scan a single transcript inside a pool worker.

    Relies on the module-level genome and configuration that _init_worker
    stored in this process.

    Parameters
    ----------
    task : tuple
        ``(index, transcript)`` pair; the index is returned unchanged so the
        caller can restore the original transcript order.

    Returns
    -------
    tuple
        ``(index, transcript_id, gene_id, orf_records)``.
    """

    tx_index, transcript = task
    cfg = _WORKER_CONFIG

    # Options forwarded unchanged to the ORFScanner constructor.
    scanner_options = (
        "start_codons",
        "min_aa",
        "max_aa",
        "scan_strand",
        "kozak_up",
        "kozak_down",
        "include_stop",
    )
    scanner = ORFScanner(**{name: cfg[name] for name in scanner_options})

    # The spliced transcript sequence has to exist before scanning.
    CoordinateMapper.build_transcript_sequence(transcript, _WORKER_GENOME)

    found = scanner.scan_transcript(transcript)

    # Attach category labels to the scanned ORFs.
    ORFClassifier.classify(transcript, found)

    # Optional: label nested or overlapping ORFs.
    if cfg["mark_overlap"]:
        ORFOverlapMarker.mark(found)

    # Optional: drop ORFs whose priority is "discarded".
    # NOTE(review): confirm that some stage actually assigns the "discarded"
    # priority; otherwise this filter keeps every record.
    if cfg["remove_discarded"]:
        found = [orf for orf in found if orf.priority != "discarded"]

    return tx_index, transcript.transcript_id, transcript.gene_id, found
94
+
95
+
96
class SmORFPipeline:
    """
    High-level pipeline for transcript-centric smORF detection.

    Reads a genome FASTA and a genePred annotation, scans every transcript
    for ORFs (in one process or in a process pool), and writes the results
    to four files that share ``out_prefix``.
    """

    def __init__(
        self,
        genome: str,
        annotation: str,
        out_prefix: str = "ORF",
        orf_prefix: str = "ORF",
        start_codons: str = "ATG",
        min_aa: int = 8,
        max_aa: int = 10000,
        scan_strand: str = "sense",
        kozak_up: int = 6,
        kozak_down: int = 6,
        mark_overlap: bool = False,
        remove_discarded: bool = False,
        include_stop: bool = False,
        threads: int = 1,
    ):
        """
        Initialize smORF pipeline.

        Parameters
        ----------
        genome : str
            Path of the genome FASTA file.
        annotation : str
            Path of the genePred annotation file.
        out_prefix : str
            Prefix of every output file name.
        orf_prefix : str
            Prefix of the generated ORF identifiers.
        start_codons : str
            Comma-separated start codons; surrounding whitespace, letter case
            and empty entries (for example from a trailing comma) are ignored.
        min_aa, max_aa, scan_strand, kozak_up, kozak_down, include_stop
            Forwarded unchanged to ORFScanner.
        mark_overlap : bool
            Run ORFOverlapMarker on the ORFs of each transcript.
        remove_discarded : bool
            Drop records whose priority is "discarded".
        threads : int
            Number of worker processes; values below 1 are raised to 1.
        """

        self.genome_path = genome
        self.annotation_path = annotation
        self.out_prefix = out_prefix
        self.orf_prefix = orf_prefix
        # Skip empty tokens so that "ATG," does not yield an empty start codon.
        self.start_codons = [
            codon.strip().upper()
            for codon in start_codons.split(",")
            if codon.strip()
        ]
        self.min_aa = min_aa
        self.max_aa = max_aa
        self.scan_strand = scan_strand
        self.kozak_up = kozak_up
        self.kozak_down = kozak_down
        self.mark_overlap = mark_overlap
        self.remove_discarded = remove_discarded
        self.include_stop = include_stop
        self.threads = max(1, int(threads))
        self.records = []

    def run(self) -> None:
        """
        Run the complete smORF scanning pipeline.
        """

        genome = FastaParser.read_fasta(self.genome_path)
        transcripts = GenePredParser.read_genepred(self.annotation_path)

        # Start from an empty result list so that calling run() again does not
        # append a second copy of every record with repeated ORF IDs.
        self.records = []

        if self.threads == 1:
            self._run_single_process(genome, transcripts)
        else:
            self._run_multi_process(genome, transcripts)

        self.write_outputs()

    def _scanner_options(self) -> dict:
        """
        Return the keyword arguments used to build an ORFScanner.
        """

        return {
            "start_codons": self.start_codons,
            "min_aa": self.min_aa,
            "max_aa": self.max_aa,
            "scan_strand": self.scan_strand,
            "kozak_up": self.kozak_up,
            "kozak_down": self.kozak_down,
            "include_stop": self.include_stop,
        }

    def _register_records(self, tx_records, next_index: int) -> int:
        """
        Assign consecutive ORF IDs to ``tx_records`` and store the records.

        Returns the index to use for the next ORF.
        """

        for rec in tx_records:
            rec.orf_id = "{}{:08d}".format(self.orf_prefix, next_index)
            next_index += 1

        self.records.extend(tx_records)
        return next_index

    def _run_single_process(self, genome, transcripts) -> None:
        """
        Run smORF scanning in single-process mode.
        """

        scanner = ORFScanner(**self._scanner_options())

        total_tx = len(transcripts)
        orf_index = 1

        for idx, tx in enumerate(transcripts, start=1):
            # Print scanning progress.
            print(
                "[smORFScanner] [{}/{}] Scanning gene={}, transcript={}, chrom={}, strand={}".format(
                    idx,
                    total_tx,
                    tx.gene_id,
                    tx.transcript_id,
                    tx.chrom,
                    tx.strand,
                ),
                flush=True,
            )

            # Reconstruct spliced transcript sequence before ORF scanning.
            CoordinateMapper.build_transcript_sequence(tx, genome)

            # Scan candidate ORFs from the transcript sequence.
            tx_records = scanner.scan_transcript(tx)

            # Assign ORF category labels.
            ORFClassifier.classify(tx, tx_records)

            # Mark nested or overlapping ORFs if requested.
            if self.mark_overlap:
                ORFOverlapMarker.mark(tx_records)

            # Drop ORFs whose priority is "discarded" if requested.
            # NOTE(review): confirm that some stage actually assigns the
            # "discarded" priority; otherwise this filter keeps every record.
            if self.remove_discarded:
                tx_records = [x for x in tx_records if x.priority != "discarded"]

            # Assign stable ORF IDs.
            orf_index = self._register_records(tx_records, orf_index)

    def _run_multi_process(self, genome, transcripts) -> None:
        """
        Run smORF scanning in multiprocessing mode.

        One task is submitted per transcript. Results arrive in completion
        order and are re-ordered by transcript index before IDs are assigned.
        """

        total_tx = len(transcripts)

        config = dict(
            self._scanner_options(),
            mark_overlap=self.mark_overlap,
            remove_discarded=self.remove_discarded,
        )

        print(
            "[smORFScanner] Running in multiprocessing mode with {} workers.".format(
                self.threads
            ),
            flush=True,
        )

        results_by_index = {}

        with ProcessPoolExecutor(
            max_workers=self.threads,
            initializer=_init_worker,
            initargs=(genome, config),
        ) as executor:
            futures = [
                executor.submit(_scan_transcript_worker, (idx, tx))
                for idx, tx in enumerate(transcripts, start=1)
            ]

            for finished, future in enumerate(as_completed(futures), start=1):
                idx, transcript_id, gene_id, tx_records = future.result()
                results_by_index[idx] = tx_records

                # Print completed transcript progress.
                print(
                    "[smORFScanner] [{}/{}] Finished gene={}, transcript={}, ORFs={}".format(
                        finished,
                        total_tx,
                        gene_id,
                        transcript_id,
                        len(tx_records),
                    ),
                    flush=True,
                )

        # Rebuild records in original transcript order and assign stable ORF IDs.
        orf_index = 1

        for idx in range(1, total_tx + 1):
            orf_index = self._register_records(results_by_index.get(idx, []), orf_index)

    def write_outputs(self) -> None:
        """
        Write all output files.

        Produces ``<out_prefix>.genePred``, ``.message.txt``, ``.nt.fa`` and
        ``.pep.fa`` from the collected records.
        """

        GenePredWriter.write("{}.genePred".format(self.out_prefix), self.records)
        MessageWriter.write("{}.message.txt".format(self.out_prefix), self.records)
        FastaWriter.write_nt("{}.nt.fa".format(self.out_prefix), self.records)
        FastaWriter.write_pep("{}.pep.fa".format(self.out_prefix), self.records)