rectify-rna 2.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. rectify_rna-2.1.0/LICENSE +21 -0
  2. rectify_rna-2.1.0/PKG-INFO +259 -0
  3. rectify_rna-2.1.0/README.md +220 -0
  4. rectify_rna-2.1.0/pyproject.toml +78 -0
  5. rectify_rna-2.1.0/rectify/__init__.py +36 -0
  6. rectify_rna-2.1.0/rectify/__main__.py +9 -0
  7. rectify_rna-2.1.0/rectify/cli.py +376 -0
  8. rectify_rna-2.1.0/rectify/config.py +208 -0
  9. rectify_rna-2.1.0/rectify/core/__init__.py +32 -0
  10. rectify_rna-2.1.0/rectify/core/ag_mispriming.py +280 -0
  11. rectify_rna-2.1.0/rectify/core/atract_detector.py +440 -0
  12. rectify_rna-2.1.0/rectify/core/bam_processor.py +907 -0
  13. rectify_rna-2.1.0/rectify/core/correct_command.py +240 -0
  14. rectify_rna-2.1.0/rectify/core/indel_corrector.py +588 -0
  15. rectify_rna-2.1.0/rectify/core/netseq_refiner.py +487 -0
  16. rectify_rna-2.1.0/rectify/core/polya_model.py +377 -0
  17. rectify_rna-2.1.0/rectify/core/polya_trimmer.py +396 -0
  18. rectify_rna-2.1.0/rectify/core/spikein_filter.py +786 -0
  19. rectify_rna-2.1.0/rectify/core/train_polya_command.py +632 -0
  20. rectify_rna-2.1.0/rectify/core/validate_command.py +961 -0
  21. rectify_rna-2.1.0/rectify/slurm.py +111 -0
  22. rectify_rna-2.1.0/rectify/utils/__init__.py +14 -0
  23. rectify_rna-2.1.0/rectify/utils/alignment.py +470 -0
  24. rectify_rna-2.1.0/rectify/utils/genome.py +391 -0
  25. rectify_rna-2.1.0/rectify/utils/stats.py +281 -0
  26. rectify_rna-2.1.0/rectify_rna.egg-info/PKG-INFO +259 -0
  27. rectify_rna-2.1.0/rectify_rna.egg-info/SOURCES.txt +38 -0
  28. rectify_rna-2.1.0/rectify_rna.egg-info/dependency_links.txt +1 -0
  29. rectify_rna-2.1.0/rectify_rna.egg-info/entry_points.txt +2 -0
  30. rectify_rna-2.1.0/rectify_rna.egg-info/requires.txt +17 -0
  31. rectify_rna-2.1.0/rectify_rna.egg-info/top_level.txt +1 -0
  32. rectify_rna-2.1.0/setup.cfg +4 -0
  33. rectify_rna-2.1.0/tests/test_ag_mispriming.py +337 -0
  34. rectify_rna-2.1.0/tests/test_atract.py +252 -0
  35. rectify_rna-2.1.0/tests/test_config.py +57 -0
  36. rectify_rna-2.1.0/tests/test_indel_correction.py +421 -0
  37. rectify_rna-2.1.0/tests/test_netseq_refiner.py +426 -0
  38. rectify_rna-2.1.0/tests/test_parallel_processing.py +189 -0
  39. rectify_rna-2.1.0/tests/test_polya_trimming.py +355 -0
  40. rectify_rna-2.1.0/tests/test_slurm.py +119 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2019-2026 Kevin R. Roy
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,259 @@
1
+ Metadata-Version: 2.4
2
+ Name: rectify-rna
3
+ Version: 2.1.0
4
+ Summary: Unified RNA 3' End Correction Framework for poly(A)-tailed RNA sequencing
5
+ Author-email: "Kevin R. Roy" <kevinrjroy@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/k-roy/RECTIFY
8
+ Project-URL: Bug Tracker, https://github.com/k-roy/RECTIFY/issues
9
+ Project-URL: Publication, https://pubmed.ncbi.nlm.nih.gov/31128237/
10
+ Keywords: RNA,3-prime-end,polyadenylation,RNA-seq,genomics
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
20
+ Requires-Python: >=3.8
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: pysam>=0.19.0
24
+ Requires-Dist: numpy>=1.20.0
25
+ Requires-Dist: pandas>=1.3.0
26
+ Requires-Dist: pyBigWig>=0.3.18
27
+ Requires-Dist: biopython>=1.79
28
+ Requires-Dist: tqdm>=4.60.0
29
+ Provides-Extra: dev
30
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
31
+ Requires-Dist: pytest-cov>=3.0.0; extra == "dev"
32
+ Requires-Dist: black>=22.0.0; extra == "dev"
33
+ Requires-Dist: flake8>=4.0.0; extra == "dev"
34
+ Requires-Dist: mypy>=0.950; extra == "dev"
35
+ Provides-Extra: docs
36
+ Requires-Dist: sphinx>=4.5.0; extra == "docs"
37
+ Requires-Dist: sphinx-rtd-theme>=1.0.0; extra == "docs"
38
+ Dynamic: license-file
39
+
40
+ # RECTIFY: Unified RNA 3' End Correction Framework
41
+
42
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
43
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
44
+ [![Implementation Status](https://img.shields.io/badge/status-complete-brightgreen.svg)](https://github.com/k-roy/RECTIFY)
45
+ [![Tests](https://img.shields.io/badge/tests-147%20passing-brightgreen.svg)](tests/)
46
+
47
+ **RECTIFY** (**R**NA 3' **E**nd **C**orrection **T**ool **I**ntegrating **F**alse-priming and pol**y**(A) ambiguity) is a unified framework for correcting 3' end mapping artifacts in poly(A)-tailed RNA sequencing data.
48
+
49
+ ## Overview
50
+
51
+ RECTIFY addresses two fundamental problems affecting RNA 3' end mapping:
52
+
53
+ 1. **A-tract Ambiguity (Universal)**: Genomic A-tracts near true 3' ends create positional uncertainty affecting ALL poly(A)-tailed RNA-seq technologies
54
+ 2. **Technology-Specific Artifacts**:
55
+ - **AG mispriming**: Internal priming on A/G-rich regions (oligo-dT methods)
56
+ - **Poly(A) tail alignment**: Tail bases align to genomic A-tracts creating systematic shifts (when poly(A) is sequenced)
57
+
58
+ ### Key Features
59
+
60
+ - **Modular correction strategies** that apply based on sequencing technology
61
+ - **Universal A-tract ambiguity detection** for all poly(A)-tailed RNA-seq
62
+ - **AG mispriming screening** (from original RECTIFY, Roy & Chanfreau 2019)
63
+ - **Poly(A) tail trimming and indel artifact correction** (for direct RNA-seq: nanopore, Helicos, QuantSeq)
64
+ - **NET-seq refinement** (optional, technology-independent)
65
+ - **Unified output format** with confidence scores and QC flags
66
+
67
+ ## How It Works
68
+
69
+ RECTIFY corrects common 3' end mapping artifacts through a series of modular steps:
70
+
71
+ ```
72
+ ┌─────────────────────────────────────────────────────────────────────────────┐
73
+ │ EXAMPLE 1: Homopolymer Deletion Artifact (Nanopore) │
74
+ │ ═══════════════════════════════════════════════════ │
75
+ │ │
76
+ │ True RNA: 5'...GCTAAGCTTAAAAAA-3' + AAAAAAAAAA (poly(A) tail) │
77
+ │ └────┘ │
78
+ │ 6A genomic tract │
79
+ │ │
80
+ │ Genome: ...GCTAAGCTTAAAAAA|GTCACC... (| = true CPA site) │
81
+ │ │
82
+ │ Nanopore read: ...GCTAAGCTT--AAAA|GTCACC (2bp deletion in A-tract)│
83
+ │ ↑↑ │
84
+ │ systematic homopolymer error │
85
+ │ │
86
+ │ Problem: Aligner maps 3' end 2bp upstream of true position │
87
+ │ (deletion consumes genomic bases that should be in transcript) │
88
+ │ │
89
+ │ RECTIFY: Detects A-tract deletion, adjusts position +2bp │
90
+ │ Result: Correct 3' end position restored │
91
+ └─────────────────────────────────────────────────────────────────────────────┘
92
+
93
+ ┌─────────────────────────────────────────────────────────────────────────────┐
94
+ │ EXAMPLE 2: Multiple Indels Near 3' End │
95
+ │ ═══════════════════════════════════════ │
96
+ │ │
97
+ │ Genome: ...TACGTTTTTTAAAAAA|GTCA... │
98
+ │ └────┘└────┘ │
99
+ │ T-tract A-tract │
100
+ │ │
101
+ │ Nanopore read: ...TACGT---TTAA-AAA|GTCA │
102
+ │ ↑↑↑ ↑ │
103
+ │ 3bp del 1bp del │
104
+ │ │
105
+ │ RECTIFY logic: │
106
+ │ • T-tract deletion (3bp): TRUE artifact → correct +3bp │
107
+ │ • A-tract deletion (1bp): TRUE artifact → correct +1bp │
108
+ │ • Total correction: +4bp │
109
+ │ │
110
+ │ Note: Insertions do NOT shift reference coordinates (no correction needed) │
111
+ └─────────────────────────────────────────────────────────────────────────────┘
112
+
113
+ ┌─────────────────────────────────────────────────────────────────────────────┐
114
+ │ EXAMPLE 3: A-tract Ambiguity Window │
115
+ │ ═══════════════════════════════════ │
116
+ │ │
117
+ │ Genome: ...CGTACAAAAAAAA|GTCACC... │
118
+ │ └───────┘ │
119
+ │ 8bp A-tract │
120
+ │ │
121
+ │ Problem: Any position within the A-tract could be the true 3' end │
122
+ │ (indistinguishable from poly(A) tail) │
123
+ │ │
124
+ │ ...CGTACAAAAAAAA| ← could be here │
125
+ │ ...CGTACAAAAAAA|A ← or here │
126
+ │ ...CGTACAAAAAA|AA ← or here │
127
+ │ ...CGTACAAAAA|AAA ← etc. │
128
+ │ │
129
+ │ RECTIFY: Reports ambiguity window [pos-7, pos] with range=8 │
130
+ │ Confidence score reflects uncertainty │
131
+ └─────────────────────────────────────────────────────────────────────────────┘
132
+
133
+ ┌─────────────────────────────────────────────────────────────────────────────┐
134
+ │ EXAMPLE 4: NET-seq Refinement │
135
+ │ ═════════════════════════════ │
136
+ │ │
137
+ │ Genome: ...CGTACAAAAAAAA|GTCACC... │
138
+ │ └───────┘ │
139
+ │ ambiguity window │
140
+ │ │
141
+ │ NET-seq: ▁▂▃█▇▅▂▁ │
142
+ │ signal: ↑ │
143
+ │ peak at -3 │
144
+ │ │
145
+ │ RECTIFY: Uses NET-seq Pol II occupancy to identify most likely │
146
+ │ termination site within the ambiguity window │
147
+ │ │
148
+ │ Result: Position refined to NET-seq peak, confidence = HIGH │
149
+ └─────────────────────────────────────────────────────────────────────────────┘
150
+
151
+ Pipeline Flow:
152
+ ══════════════
153
+
154
+ ┌──────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐
155
+ │ Input │ │ Module 1 │ │ Module 2A/B │ │ Module 3 │
156
+ │ BAM │───▶│ A-tract │───▶│ Poly(A) & │───▶│ NET-seq │
157
+ │ │ │ Ambiguity │ │ Indels │ │ Refinement │
158
+ └──────────┘ └─────────────┘ └─────────────┘ └─────────────┘
159
+ │ │ │
160
+ ▼ ▼ ▼
161
+ ┌───────────────────────────────────────────────┐
162
+ │ Corrected 3' Ends │
163
+ │ (position, ambiguity range, confidence) │
164
+ └───────────────────────────────────────────────┘
165
+ ```
166
+
167
+ ## Installation
168
+
169
+ ### From source (development)
170
+
171
+ ```bash
172
+ git clone https://github.com/k-roy/RECTIFY.git
173
+ cd RECTIFY
174
+ pip install -e .
175
+ ```
176
+
177
+ ### From PyPI (future release)
178
+
179
+ ```bash
180
+ pip install rectify-rna
181
+ ```
182
+
183
+ ## Quick Start
184
+
185
+ ### QuantSeq (oligo-dT short-read)
186
+
187
+ ```bash
188
+ rectify correct quantseq.bam \
189
+ --genome sacCer3.fa \
190
+ --annotation genes.gtf \
191
+ --polya-sequenced \
192
+ --output corrected_3ends.tsv
193
+ ```
194
+
195
+ ### Nanopore direct RNA-seq with NET-seq refinement
196
+
197
+ ```bash
198
+ rectify correct nanopore.bam \
199
+ --genome sacCer3.fa \
200
+ --annotation genes.gtf \
201
+ --polya-sequenced \
202
+ --aligner minimap2 \
203
+ --netseq-dir churchman_bigwigs/ \
204
+ --output corrected_3ends.tsv
205
+ ```
206
+
207
+ ## Output Format
208
+
209
+ RECTIFY produces a TSV file with corrected 3' end positions and QC metrics:
210
+
211
+ ```
212
+ read_id chrom strand raw_position corrected_position ambiguity_min ambiguity_max ambiguity_range correction_type confidence qc_flags
213
+ read001 chrI + 147588 147585 147583 147588 5 polya_trim high PASS
214
+ read002 chrI + 147593 147591 147591 147593 2 ag_mispriming medium AG_RICH
215
+ ```
216
+
217
+ ## Module Architecture
218
+
219
+ RECTIFY applies corrections modularly based on your data:
220
+
221
+ 1. **Module 1: A-tract Ambiguity** (always applied)
222
+ - Identifies genomic A-tracts near 3' ends
223
+ - Calculates ambiguity windows
224
+
225
+ 2. **Module 2A: AG Mispriming** (when oligo-dT priming used)
226
+ - Screens for downstream AG-richness
227
+ - Flags likely misprimed reads
228
+
229
+ 3. **Module 2B+2C: Poly(A) Corrections** (when poly(A) IS sequenced)
230
+ - Models and trims poly(A) tails
231
+ - Detects and removes indel artifacts
232
+
233
+ 4. **Module 3: NET-seq Refinement** (optional)
234
+ - Resolves ambiguity using NET-seq data
235
+ - Assigns confidence scores
236
+
237
+ ## Citation
238
+
239
+ If you use RECTIFY, please cite:
240
+
241
+ **Original RECTIFY (AG mispriming correction):**
242
+ > Roy KR, Chanfreau GF. Robust mapping of polyadenylated and non-polyadenylated RNA 3' ends at nucleotide resolution by 3'-end sequencing. *Methods*. 2020 Apr 1;176:4-13. doi: 10.1016/j.ymeth.2019.05.016. [PMID: 31128237](https://pubmed.ncbi.nlm.nih.gov/31128237/)
243
+
244
+ **RECTIFY 2.0 (unified framework):**
245
+ > Manuscript in preparation
246
+
247
+ ## License
248
+
249
+ MIT License - See [LICENSE](LICENSE) for details
250
+
251
+ ## Contact
252
+
253
+ - Kevin R. Roy - [kevinrjroy@gmail.com](mailto:kevinrjroy@gmail.com)
254
+ - GitHub: [k-roy/RECTIFY](https://github.com/k-roy/RECTIFY)
255
+
256
+ ## Acknowledgments
257
+
258
+ - Original RECTIFY development supported by Chanfreau Lab, UCLA
259
+ - NET-seq data from Churchman Lab, Harvard Medical School
@@ -0,0 +1,220 @@
1
+ # RECTIFY: Unified RNA 3' End Correction Framework
2
+
3
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
4
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
5
+ [![Implementation Status](https://img.shields.io/badge/status-complete-brightgreen.svg)](https://github.com/k-roy/RECTIFY)
6
+ [![Tests](https://img.shields.io/badge/tests-147%20passing-brightgreen.svg)](tests/)
7
+
8
+ **RECTIFY** (**R**NA 3' **E**nd **C**orrection **T**ool **I**ntegrating **F**alse-priming and pol**y**(A) ambiguity) is a unified framework for correcting 3' end mapping artifacts in poly(A)-tailed RNA sequencing data.
9
+
10
+ ## Overview
11
+
12
+ RECTIFY addresses two fundamental problems affecting RNA 3' end mapping:
13
+
14
+ 1. **A-tract Ambiguity (Universal)**: Genomic A-tracts near true 3' ends create positional uncertainty affecting ALL poly(A)-tailed RNA-seq technologies
15
+ 2. **Technology-Specific Artifacts**:
16
+ - **AG mispriming**: Internal priming on A/G-rich regions (oligo-dT methods)
17
+ - **Poly(A) tail alignment**: Tail bases align to genomic A-tracts creating systematic shifts (when poly(A) is sequenced)
18
+
19
+ ### Key Features
20
+
21
+ - **Modular correction strategies** that apply based on sequencing technology
22
+ - **Universal A-tract ambiguity detection** for all poly(A)-tailed RNA-seq
23
+ - **AG mispriming screening** (from original RECTIFY, Roy & Chanfreau 2019)
24
+ - **Poly(A) tail trimming and indel artifact correction** (for direct RNA-seq: nanopore, Helicos, QuantSeq)
25
+ - **NET-seq refinement** (optional, technology-independent)
26
+ - **Unified output format** with confidence scores and QC flags
27
+
28
+ ## How It Works
29
+
30
+ RECTIFY corrects common 3' end mapping artifacts through a series of modular steps:
31
+
32
+ ```
33
+ ┌─────────────────────────────────────────────────────────────────────────────┐
34
+ │ EXAMPLE 1: Homopolymer Deletion Artifact (Nanopore) │
35
+ │ ═══════════════════════════════════════════════════ │
36
+ │ │
37
+ │ True RNA: 5'...GCTAAGCTTAAAAAA-3' + AAAAAAAAAA (poly(A) tail) │
38
+ │ └────┘ │
39
+ │ 6A genomic tract │
40
+ │ │
41
+ │ Genome: ...GCTAAGCTTAAAAAA|GTCACC... (| = true CPA site) │
42
+ │ │
43
+ │ Nanopore read: ...GCTAAGCTT--AAAA|GTCACC (2bp deletion in A-tract)│
44
+ │ ↑↑ │
45
+ │ systematic homopolymer error │
46
+ │ │
47
+ │ Problem: Aligner maps 3' end 2bp upstream of true position │
48
+ │ (deletion consumes genomic bases that should be in transcript) │
49
+ │ │
50
+ │ RECTIFY: Detects A-tract deletion, adjusts position +2bp │
51
+ │ Result: Correct 3' end position restored │
52
+ └─────────────────────────────────────────────────────────────────────────────┘
53
+
54
+ ┌─────────────────────────────────────────────────────────────────────────────┐
55
+ │ EXAMPLE 2: Multiple Indels Near 3' End │
56
+ │ ═══════════════════════════════════════ │
57
+ │ │
58
+ │ Genome: ...TACGTTTTTTAAAAAA|GTCA... │
59
+ │ └────┘└────┘ │
60
+ │ T-tract A-tract │
61
+ │ │
62
+ │ Nanopore read: ...TACGT---TTAA-AAA|GTCA │
63
+ │ ↑↑↑ ↑ │
64
+ │ 3bp del 1bp del │
65
+ │ │
66
+ │ RECTIFY logic: │
67
+ │ • T-tract deletion (3bp): TRUE artifact → correct +3bp │
68
+ │ • A-tract deletion (1bp): TRUE artifact → correct +1bp │
69
+ │ • Total correction: +4bp │
70
+ │ │
71
+ │ Note: Insertions do NOT shift reference coordinates (no correction needed) │
72
+ └─────────────────────────────────────────────────────────────────────────────┘
73
+
74
+ ┌─────────────────────────────────────────────────────────────────────────────┐
75
+ │ EXAMPLE 3: A-tract Ambiguity Window │
76
+ │ ═══════════════════════════════════ │
77
+ │ │
78
+ │ Genome: ...CGTACAAAAAAAA|GTCACC... │
79
+ │ └───────┘ │
80
+ │ 8bp A-tract │
81
+ │ │
82
+ │ Problem: Any position within the A-tract could be the true 3' end │
83
+ │ (indistinguishable from poly(A) tail) │
84
+ │ │
85
+ │ ...CGTACAAAAAAAA| ← could be here │
86
+ │ ...CGTACAAAAAAA|A ← or here │
87
+ │ ...CGTACAAAAAA|AA ← or here │
88
+ │ ...CGTACAAAAA|AAA ← etc. │
89
+ │ │
90
+ │ RECTIFY: Reports ambiguity window [pos-7, pos] with range=8 │
91
+ │ Confidence score reflects uncertainty │
92
+ └─────────────────────────────────────────────────────────────────────────────┘
93
+
94
+ ┌─────────────────────────────────────────────────────────────────────────────┐
95
+ │ EXAMPLE 4: NET-seq Refinement │
96
+ │ ═════════════════════════════ │
97
+ │ │
98
+ │ Genome: ...CGTACAAAAAAAA|GTCACC... │
99
+ │ └───────┘ │
100
+ │ ambiguity window │
101
+ │ │
102
+ │ NET-seq: ▁▂▃█▇▅▂▁ │
103
+ │ signal: ↑ │
104
+ │ peak at -3 │
105
+ │ │
106
+ │ RECTIFY: Uses NET-seq Pol II occupancy to identify most likely │
107
+ │ termination site within the ambiguity window │
108
+ │ │
109
+ │ Result: Position refined to NET-seq peak, confidence = HIGH │
110
+ └─────────────────────────────────────────────────────────────────────────────┘
111
+
112
+ Pipeline Flow:
113
+ ══════════════
114
+
115
+ ┌──────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐
116
+ │ Input │ │ Module 1 │ │ Module 2A/B │ │ Module 3 │
117
+ │ BAM │───▶│ A-tract │───▶│ Poly(A) & │───▶│ NET-seq │
118
+ │ │ │ Ambiguity │ │ Indels │ │ Refinement │
119
+ └──────────┘ └─────────────┘ └─────────────┘ └─────────────┘
120
+ │ │ │
121
+ ▼ ▼ ▼
122
+ ┌───────────────────────────────────────────────┐
123
+ │ Corrected 3' Ends │
124
+ │ (position, ambiguity range, confidence) │
125
+ └───────────────────────────────────────────────┘
126
+ ```
127
+
128
+ ## Installation
129
+
130
+ ### From source (development)
131
+
132
+ ```bash
133
+ git clone https://github.com/k-roy/RECTIFY.git
134
+ cd RECTIFY
135
+ pip install -e .
136
+ ```
137
+
138
+ ### From PyPI (future release)
139
+
140
+ ```bash
141
+ pip install rectify-rna
142
+ ```
143
+
144
+ ## Quick Start
145
+
146
+ ### QuantSeq (oligo-dT short-read)
147
+
148
+ ```bash
149
+ rectify correct quantseq.bam \
150
+ --genome sacCer3.fa \
151
+ --annotation genes.gtf \
152
+ --polya-sequenced \
153
+ --output corrected_3ends.tsv
154
+ ```
155
+
156
+ ### Nanopore direct RNA-seq with NET-seq refinement
157
+
158
+ ```bash
159
+ rectify correct nanopore.bam \
160
+ --genome sacCer3.fa \
161
+ --annotation genes.gtf \
162
+ --polya-sequenced \
163
+ --aligner minimap2 \
164
+ --netseq-dir churchman_bigwigs/ \
165
+ --output corrected_3ends.tsv
166
+ ```
167
+
168
+ ## Output Format
169
+
170
+ RECTIFY produces a TSV file with corrected 3' end positions and QC metrics:
171
+
172
+ ```
173
+ read_id chrom strand raw_position corrected_position ambiguity_min ambiguity_max ambiguity_range correction_type confidence qc_flags
174
+ read001 chrI + 147588 147585 147583 147588 5 polya_trim high PASS
175
+ read002 chrI + 147593 147591 147591 147593 2 ag_mispriming medium AG_RICH
176
+ ```
177
+
178
+ ## Module Architecture
179
+
180
+ RECTIFY applies corrections modularly based on your data:
181
+
182
+ 1. **Module 1: A-tract Ambiguity** (always applied)
183
+ - Identifies genomic A-tracts near 3' ends
184
+ - Calculates ambiguity windows
185
+
186
+ 2. **Module 2A: AG Mispriming** (when oligo-dT priming used)
187
+ - Screens for downstream AG-richness
188
+ - Flags likely misprimed reads
189
+
190
+ 3. **Module 2B+2C: Poly(A) Corrections** (when poly(A) IS sequenced)
191
+ - Models and trims poly(A) tails
192
+ - Detects and removes indel artifacts
193
+
194
+ 4. **Module 3: NET-seq Refinement** (optional)
195
+ - Resolves ambiguity using NET-seq data
196
+ - Assigns confidence scores
197
+
198
+ ## Citation
199
+
200
+ If you use RECTIFY, please cite:
201
+
202
+ **Original RECTIFY (AG mispriming correction):**
203
+ > Roy KR, Chanfreau GF. Robust mapping of polyadenylated and non-polyadenylated RNA 3' ends at nucleotide resolution by 3'-end sequencing. *Methods*. 2020 Apr 1;176:4-13. doi: 10.1016/j.ymeth.2019.05.016. [PMID: 31128237](https://pubmed.ncbi.nlm.nih.gov/31128237/)
204
+
205
+ **RECTIFY 2.0 (unified framework):**
206
+ > Manuscript in preparation
207
+
208
+ ## License
209
+
210
+ MIT License - See [LICENSE](LICENSE) for details
211
+
212
+ ## Contact
213
+
214
+ - Kevin R. Roy - [kevinrjroy@gmail.com](mailto:kevinrjroy@gmail.com)
215
+ - GitHub: [k-roy/RECTIFY](https://github.com/k-roy/RECTIFY)
216
+
217
+ ## Acknowledgments
218
+
219
+ - Original RECTIFY development supported by Chanfreau Lab, UCLA
220
+ - NET-seq data from Churchman Lab, Harvard Medical School
@@ -0,0 +1,78 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "rectify-rna"
7
+ version = "2.1.0"
8
+ description = "Unified RNA 3' End Correction Framework for poly(A)-tailed RNA sequencing"
9
+ authors = [
10
+ {name = "Kevin R. Roy", email = "kevinrjroy@gmail.com"}
11
+ ]
12
+ readme = "README.md"
13
+ requires-python = ">=3.8"
14
+ license = {text = "MIT"}
15
+ keywords = ["RNA", "3-prime-end", "polyadenylation", "RNA-seq", "genomics"]
16
+ classifiers = [
17
+ "Development Status :: 4 - Beta",
18
+ "Intended Audience :: Science/Research",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.8",
22
+ "Programming Language :: Python :: 3.9",
23
+ "Programming Language :: Python :: 3.10",
24
+ "Programming Language :: Python :: 3.11",
25
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
26
+ ]
27
+
28
+ dependencies = [
29
+ "pysam>=0.19.0",
30
+ "numpy>=1.20.0",
31
+ "pandas>=1.3.0",
32
+ "pyBigWig>=0.3.18",
33
+ "biopython>=1.79",
34
+ "tqdm>=4.60.0",
35
+ ]
36
+
37
+ [project.optional-dependencies]
38
+ dev = [
39
+ "pytest>=7.0.0",
40
+ "pytest-cov>=3.0.0",
41
+ "black>=22.0.0",
42
+ "flake8>=4.0.0",
43
+ "mypy>=0.950",
44
+ ]
45
+ docs = [
46
+ "sphinx>=4.5.0",
47
+ "sphinx-rtd-theme>=1.0.0",
48
+ ]
49
+
50
+ [project.urls]
51
+ "Homepage" = "https://github.com/k-roy/RECTIFY"
52
+ "Bug Tracker" = "https://github.com/k-roy/RECTIFY/issues"
53
+ "Publication" = "https://pubmed.ncbi.nlm.nih.gov/31128237/"
54
+
55
+ [project.scripts]
56
+ rectify = "rectify.cli:main"
57
+
58
+ [tool.setuptools]
59
+ packages = ["rectify", "rectify.core", "rectify.utils"]
60
+
61
+ [tool.setuptools.package-data]
62
+ rectify = ["data/models/*.json"]
63
+
64
+ [tool.pytest.ini_options]
65
+ testpaths = ["tests"]
66
+ python_files = ["test_*.py"]
67
+ python_functions = ["test_*"]
68
+ addopts = "-v --cov=rectify --cov-report=html --cov-report=term"
69
+
70
+ [tool.black]
71
+ line-length = 100
72
+ target-version = ['py38', 'py39', 'py310', 'py311']
73
+
74
+ [tool.mypy]
75
+ python_version = "3.8"
76
+ warn_return_any = true
77
+ warn_unused_configs = true
78
+ disallow_untyped_defs = false
@@ -0,0 +1,36 @@
1
+ """
2
+ RECTIFY: Unified RNA 3' End Correction Framework
3
+
4
+ A modular framework for correcting 3' end mapping artifacts in poly(A)-tailed RNA sequencing data.
5
+
6
+ Modules:
7
+ - A-tract ambiguity detection (universal)
8
+ - AG mispriming screening (oligo-dT methods)
9
+ - Poly(A) tail trimming and indel correction (direct RNA-seq)
10
+ - NET-seq refinement (optional)
11
+
12
+ Features (v2.1.0):
13
+ - Region-based parallel BAM processing with coverage gap splitting
14
+ - SLURM-aware CPU detection to prevent oversubscription
15
+ - Streaming output mode for large BAM files
16
+
17
+ Author: Kevin R. Roy
18
+ License: MIT
19
+ """
20
+
21
+ __version__ = "2.1.0"
22
+ __author__ = "Kevin R. Roy"
23
+ __email__ = "kevinroy@stanford.edu"
24
+
25
+ from . import core, utils, slurm
26
+ from .slurm import get_available_cpus, set_thread_limits, is_slurm_job
27
+
28
+ __all__ = [
29
+ "core",
30
+ "utils",
31
+ "slurm",
32
+ "get_available_cpus",
33
+ "set_thread_limits",
34
+ "is_slurm_job",
35
+ "__version__",
36
+ ]
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Entry point for running RECTIFY as a module: python -m rectify
4
+ """
5
+
6
+ from .cli import main
7
+
8
+ if __name__ == '__main__':
9
+ main()