rectify-rna 2.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rectify_rna-2.1.0/LICENSE +21 -0
- rectify_rna-2.1.0/PKG-INFO +259 -0
- rectify_rna-2.1.0/README.md +220 -0
- rectify_rna-2.1.0/pyproject.toml +78 -0
- rectify_rna-2.1.0/rectify/__init__.py +36 -0
- rectify_rna-2.1.0/rectify/__main__.py +9 -0
- rectify_rna-2.1.0/rectify/cli.py +376 -0
- rectify_rna-2.1.0/rectify/config.py +208 -0
- rectify_rna-2.1.0/rectify/core/__init__.py +32 -0
- rectify_rna-2.1.0/rectify/core/ag_mispriming.py +280 -0
- rectify_rna-2.1.0/rectify/core/atract_detector.py +440 -0
- rectify_rna-2.1.0/rectify/core/bam_processor.py +907 -0
- rectify_rna-2.1.0/rectify/core/correct_command.py +240 -0
- rectify_rna-2.1.0/rectify/core/indel_corrector.py +588 -0
- rectify_rna-2.1.0/rectify/core/netseq_refiner.py +487 -0
- rectify_rna-2.1.0/rectify/core/polya_model.py +377 -0
- rectify_rna-2.1.0/rectify/core/polya_trimmer.py +396 -0
- rectify_rna-2.1.0/rectify/core/spikein_filter.py +786 -0
- rectify_rna-2.1.0/rectify/core/train_polya_command.py +632 -0
- rectify_rna-2.1.0/rectify/core/validate_command.py +961 -0
- rectify_rna-2.1.0/rectify/slurm.py +111 -0
- rectify_rna-2.1.0/rectify/utils/__init__.py +14 -0
- rectify_rna-2.1.0/rectify/utils/alignment.py +470 -0
- rectify_rna-2.1.0/rectify/utils/genome.py +391 -0
- rectify_rna-2.1.0/rectify/utils/stats.py +281 -0
- rectify_rna-2.1.0/rectify_rna.egg-info/PKG-INFO +259 -0
- rectify_rna-2.1.0/rectify_rna.egg-info/SOURCES.txt +38 -0
- rectify_rna-2.1.0/rectify_rna.egg-info/dependency_links.txt +1 -0
- rectify_rna-2.1.0/rectify_rna.egg-info/entry_points.txt +2 -0
- rectify_rna-2.1.0/rectify_rna.egg-info/requires.txt +17 -0
- rectify_rna-2.1.0/rectify_rna.egg-info/top_level.txt +1 -0
- rectify_rna-2.1.0/setup.cfg +4 -0
- rectify_rna-2.1.0/tests/test_ag_mispriming.py +337 -0
- rectify_rna-2.1.0/tests/test_atract.py +252 -0
- rectify_rna-2.1.0/tests/test_config.py +57 -0
- rectify_rna-2.1.0/tests/test_indel_correction.py +421 -0
- rectify_rna-2.1.0/tests/test_netseq_refiner.py +426 -0
- rectify_rna-2.1.0/tests/test_parallel_processing.py +189 -0
- rectify_rna-2.1.0/tests/test_polya_trimming.py +355 -0
- rectify_rna-2.1.0/tests/test_slurm.py +119 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2019-2026 Kevin R. Roy
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: rectify-rna
|
|
3
|
+
Version: 2.1.0
|
|
4
|
+
Summary: Unified RNA 3' End Correction Framework for poly(A)-tailed RNA sequencing
|
|
5
|
+
Author-email: "Kevin R. Roy" <kevinrjroy@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/k-roy/RECTIFY
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/k-roy/RECTIFY/issues
|
|
9
|
+
Project-URL: Publication, https://pubmed.ncbi.nlm.nih.gov/31128237/
|
|
10
|
+
Keywords: RNA,3-prime-end,polyadenylation,RNA-seq,genomics
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
20
|
+
Requires-Python: >=3.8
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: pysam>=0.19.0
|
|
24
|
+
Requires-Dist: numpy>=1.20.0
|
|
25
|
+
Requires-Dist: pandas>=1.3.0
|
|
26
|
+
Requires-Dist: pyBigWig>=0.3.18
|
|
27
|
+
Requires-Dist: biopython>=1.79
|
|
28
|
+
Requires-Dist: tqdm>=4.60.0
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
31
|
+
Requires-Dist: pytest-cov>=3.0.0; extra == "dev"
|
|
32
|
+
Requires-Dist: black>=22.0.0; extra == "dev"
|
|
33
|
+
Requires-Dist: flake8>=4.0.0; extra == "dev"
|
|
34
|
+
Requires-Dist: mypy>=0.950; extra == "dev"
|
|
35
|
+
Provides-Extra: docs
|
|
36
|
+
Requires-Dist: sphinx>=4.5.0; extra == "docs"
|
|
37
|
+
Requires-Dist: sphinx-rtd-theme>=1.0.0; extra == "docs"
|
|
38
|
+
Dynamic: license-file
|
|
39
|
+
|
|
40
|
+
# RECTIFY: Unified RNA 3' End Correction Framework
|
|
41
|
+
|
|
42
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
43
|
+
[Python 3.8+](https://www.python.org/downloads/)
|
|
44
|
+
[GitHub: k-roy/RECTIFY](https://github.com/k-roy/RECTIFY)
|
|
45
|
+
[Tests](tests/)
|
|
46
|
+
|
|
47
|
+
**RECTIFY** (**R**NA 3' **E**nd **C**orrection **T**ool **I**ntegrating **F**alse-priming and pol**y**(A) ambiguity) is a unified framework for correcting 3' end mapping artifacts in poly(A)-tailed RNA sequencing data.
|
|
48
|
+
|
|
49
|
+
## Overview
|
|
50
|
+
|
|
51
|
+
RECTIFY addresses two fundamental problems affecting RNA 3' end mapping:
|
|
52
|
+
|
|
53
|
+
1. **A-tract Ambiguity (Universal)**: Genomic A-tracts near true 3' ends create positional uncertainty affecting ALL poly(A)-tailed RNA-seq technologies
|
|
54
|
+
2. **Technology-Specific Artifacts**:
|
|
55
|
+
- **AG mispriming**: Internal priming on A/G-rich regions (oligo-dT methods)
|
|
56
|
+
- **Poly(A) tail alignment**: Tail bases align to genomic A-tracts creating systematic shifts (when poly(A) is sequenced)
|
|
57
|
+
|
|
58
|
+
### Key Features
|
|
59
|
+
|
|
60
|
+
- **Modular correction strategies** that apply based on sequencing technology
|
|
61
|
+
- **Universal A-tract ambiguity detection** for all poly(A)-tailed RNA-seq
|
|
62
|
+
- **AG mispriming screening** (from original RECTIFY, Roy & Chanfreau 2019)
|
|
63
|
+
- **Poly(A) tail trimming and indel artifact correction** (for technologies that sequence the poly(A) tail: nanopore direct RNA-seq, Helicos, QuantSeq)
|
|
64
|
+
- **NET-seq refinement** (optional, technology-independent)
|
|
65
|
+
- **Unified output format** with confidence scores and QC flags
|
|
66
|
+
|
|
67
|
+
## How It Works
|
|
68
|
+
|
|
69
|
+
RECTIFY corrects common 3' end mapping artifacts through a series of modular steps:
|
|
70
|
+
|
|
71
|
+
```
|
|
72
|
+
┌─────────────────────────────────────────────────────────────────────────────┐
|
|
73
|
+
│ EXAMPLE 1: Homopolymer Deletion Artifact (Nanopore) │
|
|
74
|
+
│ ═══════════════════════════════════════════════════ │
|
|
75
|
+
│ │
|
|
76
|
+
│ True RNA: 5'...GCTAAGCTTAAAAAA-3' + AAAAAAAAAA (poly(A) tail) │
|
|
77
|
+
│ └────┘ │
|
|
78
|
+
│ 6A genomic tract │
|
|
79
|
+
│ │
|
|
80
|
+
│ Genome: ...GCTAAGCTTAAAAAA|GTCACC... (| = true CPA site) │
|
|
81
|
+
│ │
|
|
82
|
+
│ Nanopore read: ...GCTAAGCTT--AAAA|GTCACC (2bp deletion in A-tract)│
|
|
83
|
+
│ ↑↑ │
|
|
84
|
+
│ systematic homopolymer error │
|
|
85
|
+
│ │
|
|
86
|
+
│ Problem: Aligner maps 3' end 2bp upstream of true position │
|
|
87
|
+
│ (deletion consumes genomic bases that should be in transcript) │
|
|
88
|
+
│ │
|
|
89
|
+
│ RECTIFY: Detects A-tract deletion, adjusts position +2bp │
|
|
90
|
+
│ Result: Correct 3' end position restored │
|
|
91
|
+
└─────────────────────────────────────────────────────────────────────────────┘
|
|
92
|
+
|
|
93
|
+
┌─────────────────────────────────────────────────────────────────────────────┐
|
|
94
|
+
│ EXAMPLE 2: Multiple Indels Near 3' End │
|
|
95
|
+
│ ═══════════════════════════════════════ │
|
|
96
|
+
│ │
|
|
97
|
+
│ Genome: ...TACGTTTTTTAAAAAA|GTCA... │
|
|
98
|
+
│ └────┘└────┘ │
|
|
99
|
+
│ T-tract A-tract │
|
|
100
|
+
│ │
|
|
101
|
+
│ Nanopore read: ...TACGT---TTAA-AAA|GTCA │
|
|
102
|
+
│ ↑↑↑ ↑ │
|
|
103
|
+
│ 3bp del 1bp del │
|
|
104
|
+
│ │
|
|
105
|
+
│ RECTIFY logic: │
|
|
106
|
+
│ • T-tract deletion (3bp): TRUE artifact → correct +3bp │
|
|
107
|
+
│ • A-tract deletion (1bp): TRUE artifact → correct +1bp │
|
|
108
|
+
│ • Total correction: +4bp │
|
|
109
|
+
│ │
|
|
110
|
+
│ Note: Insertions do NOT shift reference coordinates (no correction needed) │
|
|
111
|
+
└─────────────────────────────────────────────────────────────────────────────┘
|
|
112
|
+
|
|
113
|
+
┌─────────────────────────────────────────────────────────────────────────────┐
|
|
114
|
+
│ EXAMPLE 3: A-tract Ambiguity Window │
|
|
115
|
+
│ ═══════════════════════════════════ │
|
|
116
|
+
│ │
|
|
117
|
+
│ Genome: ...CGTACAAAAAAAA|GTCACC... │
|
|
118
|
+
│ └───────┘ │
|
|
119
|
+
│ 8bp A-tract │
|
|
120
|
+
│ │
|
|
121
|
+
│ Problem: Any position within the A-tract could be the true 3' end │
|
|
122
|
+
│ (indistinguishable from poly(A) tail) │
|
|
123
|
+
│ │
|
|
124
|
+
│ ...CGTACAAAAAAAA| ← could be here │
|
|
125
|
+
│ ...CGTACAAAAAAA|A ← or here │
|
|
126
|
+
│ ...CGTACAAAAAA|AA ← or here │
|
|
127
|
+
│ ...CGTACAAAAA|AAA ← etc. │
|
|
128
|
+
│ │
|
|
129
|
+
│ RECTIFY: Reports ambiguity window [pos-7, pos] with range=8 │
|
|
130
|
+
│ Confidence score reflects uncertainty │
|
|
131
|
+
└─────────────────────────────────────────────────────────────────────────────┘
|
|
132
|
+
|
|
133
|
+
┌─────────────────────────────────────────────────────────────────────────────┐
|
|
134
|
+
│ EXAMPLE 4: NET-seq Refinement │
|
|
135
|
+
│ ═════════════════════════════ │
|
|
136
|
+
│ │
|
|
137
|
+
│ Genome: ...CGTACAAAAAAAA|GTCACC... │
|
|
138
|
+
│ └───────┘ │
|
|
139
|
+
│ ambiguity window │
|
|
140
|
+
│ │
|
|
141
|
+
│ NET-seq: ▁▂▃█▇▅▂▁ │
|
|
142
|
+
│ signal: ↑ │
|
|
143
|
+
│ peak at -3 │
|
|
144
|
+
│ │
|
|
145
|
+
│ RECTIFY: Uses NET-seq Pol II occupancy to identify most likely │
|
|
146
|
+
│ termination site within the ambiguity window │
|
|
147
|
+
│ │
|
|
148
|
+
│ Result: Position refined to NET-seq peak, confidence = HIGH │
|
|
149
|
+
└─────────────────────────────────────────────────────────────────────────────┘
|
|
150
|
+
|
|
151
|
+
Pipeline Flow:
|
|
152
|
+
══════════════
|
|
153
|
+
|
|
154
|
+
┌──────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐
|
|
155
|
+
│ Input │ │ Module 1 │ │ Module 2B/C │ │ Module 3 │
|
|
156
|
+
│ BAM │───▶│ A-tract │───▶│ Poly(A) & │───▶│ NET-seq │
|
|
157
|
+
│ │ │ Ambiguity │ │ Indels │ │ Refinement │
|
|
158
|
+
└──────────┘ └─────────────┘ └─────────────┘ └─────────────┘
|
|
159
|
+
│ │ │
|
|
160
|
+
▼ ▼ ▼
|
|
161
|
+
┌───────────────────────────────────────────────┐
|
|
162
|
+
│ Corrected 3' Ends │
|
|
163
|
+
│ (position, ambiguity range, confidence) │
|
|
164
|
+
└───────────────────────────────────────────────┘
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
## Installation
|
|
168
|
+
|
|
169
|
+
### From source (development)
|
|
170
|
+
|
|
171
|
+
```bash
|
|
172
|
+
git clone https://github.com/k-roy/RECTIFY.git
|
|
173
|
+
cd RECTIFY
|
|
174
|
+
pip install -e .
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### From PyPI
|
|
178
|
+
|
|
179
|
+
```bash
|
|
180
|
+
pip install rectify-rna
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## Quick Start
|
|
184
|
+
|
|
185
|
+
### QuantSeq (oligo-dT short-read)
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
rectify correct quantseq.bam \
|
|
189
|
+
--genome sacCer3.fa \
|
|
190
|
+
--annotation genes.gtf \
|
|
191
|
+
--polya-sequenced \
|
|
192
|
+
--output corrected_3ends.tsv
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
### Nanopore direct RNA-seq with NET-seq refinement
|
|
196
|
+
|
|
197
|
+
```bash
|
|
198
|
+
rectify correct nanopore.bam \
|
|
199
|
+
--genome sacCer3.fa \
|
|
200
|
+
--annotation genes.gtf \
|
|
201
|
+
--polya-sequenced \
|
|
202
|
+
--aligner minimap2 \
|
|
203
|
+
--netseq-dir churchman_bigwigs/ \
|
|
204
|
+
--output corrected_3ends.tsv
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
## Output Format
|
|
208
|
+
|
|
209
|
+
RECTIFY produces a TSV file with corrected 3' end positions and QC metrics:
|
|
210
|
+
|
|
211
|
+
```
|
|
212
|
+
read_id chrom strand raw_position corrected_position ambiguity_min ambiguity_max ambiguity_range correction_type confidence qc_flags
|
|
213
|
+
read001 chrI + 147588 147585 147583 147588 5 polya_trim high PASS
|
|
214
|
+
read002 chrI + 147593 147591 147591 147593 2 ag_mispriming medium AG_RICH
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
## Module Architecture
|
|
218
|
+
|
|
219
|
+
RECTIFY applies corrections modularly based on your data:
|
|
220
|
+
|
|
221
|
+
1. **Module 1: A-tract Ambiguity** (always applied)
|
|
222
|
+
- Identifies genomic A-tracts near 3' ends
|
|
223
|
+
- Calculates ambiguity windows
|
|
224
|
+
|
|
225
|
+
2. **Module 2A: AG Mispriming** (when oligo-dT priming used)
|
|
226
|
+
- Screens for downstream AG-richness
|
|
227
|
+
- Flags likely misprimed reads
|
|
228
|
+
|
|
229
|
+
3. **Module 2B+2C: Poly(A) Corrections** (when poly(A) IS sequenced)
|
|
230
|
+
- Models and trims poly(A) tails
|
|
231
|
+
- Detects and removes indel artifacts
|
|
232
|
+
|
|
233
|
+
4. **Module 3: NET-seq Refinement** (optional)
|
|
234
|
+
- Resolves ambiguity using NET-seq data
|
|
235
|
+
- Assigns confidence scores
|
|
236
|
+
|
|
237
|
+
## Citation
|
|
238
|
+
|
|
239
|
+
If you use RECTIFY, please cite:
|
|
240
|
+
|
|
241
|
+
**Original RECTIFY (AG mispriming correction):**
|
|
242
|
+
> Roy KR, Chanfreau GF. Robust mapping of polyadenylated and non-polyadenylated RNA 3' ends at nucleotide resolution by 3'-end sequencing. *Methods*. 2020 Apr 1;176:4-13. doi: 10.1016/j.ymeth.2019.05.016. [PMID: 31128237](https://pubmed.ncbi.nlm.nih.gov/31128237/)
|
|
243
|
+
|
|
244
|
+
**RECTIFY 2.0 (unified framework):**
|
|
245
|
+
> Manuscript in preparation
|
|
246
|
+
|
|
247
|
+
## License
|
|
248
|
+
|
|
249
|
+
MIT License - See [LICENSE](LICENSE) for details
|
|
250
|
+
|
|
251
|
+
## Contact
|
|
252
|
+
|
|
253
|
+
- Kevin R. Roy - [kevinrjroy@gmail.com](mailto:kevinrjroy@gmail.com)
|
|
254
|
+
- GitHub: [k-roy/RECTIFY](https://github.com/k-roy/RECTIFY)
|
|
255
|
+
|
|
256
|
+
## Acknowledgments
|
|
257
|
+
|
|
258
|
+
- Original RECTIFY development supported by Chanfreau Lab, UCLA
|
|
259
|
+
- NET-seq data from Churchman Lab, Harvard Medical School
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
# RECTIFY: Unified RNA 3' End Correction Framework
|
|
2
|
+
|
|
3
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
4
|
+
[Python 3.8+](https://www.python.org/downloads/)
|
|
5
|
+
[GitHub: k-roy/RECTIFY](https://github.com/k-roy/RECTIFY)
|
|
6
|
+
[Tests](tests/)
|
|
7
|
+
|
|
8
|
+
**RECTIFY** (**R**NA 3' **E**nd **C**orrection **T**ool **I**ntegrating **F**alse-priming and pol**y**(A) ambiguity) is a unified framework for correcting 3' end mapping artifacts in poly(A)-tailed RNA sequencing data.
|
|
9
|
+
|
|
10
|
+
## Overview
|
|
11
|
+
|
|
12
|
+
RECTIFY addresses two fundamental problems affecting RNA 3' end mapping:
|
|
13
|
+
|
|
14
|
+
1. **A-tract Ambiguity (Universal)**: Genomic A-tracts near true 3' ends create positional uncertainty affecting ALL poly(A)-tailed RNA-seq technologies
|
|
15
|
+
2. **Technology-Specific Artifacts**:
|
|
16
|
+
- **AG mispriming**: Internal priming on A/G-rich regions (oligo-dT methods)
|
|
17
|
+
- **Poly(A) tail alignment**: Tail bases align to genomic A-tracts creating systematic shifts (when poly(A) is sequenced)
|
|
18
|
+
|
|
19
|
+
### Key Features
|
|
20
|
+
|
|
21
|
+
- **Modular correction strategies** that apply based on sequencing technology
|
|
22
|
+
- **Universal A-tract ambiguity detection** for all poly(A)-tailed RNA-seq
|
|
23
|
+
- **AG mispriming screening** (from original RECTIFY, Roy & Chanfreau 2019)
|
|
24
|
+
- **Poly(A) tail trimming and indel artifact correction** (for technologies that sequence the poly(A) tail: nanopore direct RNA-seq, Helicos, QuantSeq)
|
|
25
|
+
- **NET-seq refinement** (optional, technology-independent)
|
|
26
|
+
- **Unified output format** with confidence scores and QC flags
|
|
27
|
+
|
|
28
|
+
## How It Works
|
|
29
|
+
|
|
30
|
+
RECTIFY corrects common 3' end mapping artifacts through a series of modular steps:
|
|
31
|
+
|
|
32
|
+
```
|
|
33
|
+
┌─────────────────────────────────────────────────────────────────────────────┐
|
|
34
|
+
│ EXAMPLE 1: Homopolymer Deletion Artifact (Nanopore) │
|
|
35
|
+
│ ═══════════════════════════════════════════════════ │
|
|
36
|
+
│ │
|
|
37
|
+
│ True RNA: 5'...GCTAAGCTTAAAAAA-3' + AAAAAAAAAA (poly(A) tail) │
|
|
38
|
+
│ └────┘ │
|
|
39
|
+
│ 6A genomic tract │
|
|
40
|
+
│ │
|
|
41
|
+
│ Genome: ...GCTAAGCTTAAAAAA|GTCACC... (| = true CPA site) │
|
|
42
|
+
│ │
|
|
43
|
+
│ Nanopore read: ...GCTAAGCTT--AAAA|GTCACC (2bp deletion in A-tract)│
|
|
44
|
+
│ ↑↑ │
|
|
45
|
+
│ systematic homopolymer error │
|
|
46
|
+
│ │
|
|
47
|
+
│ Problem: Aligner maps 3' end 2bp upstream of true position │
|
|
48
|
+
│ (deletion consumes genomic bases that should be in transcript) │
|
|
49
|
+
│ │
|
|
50
|
+
│ RECTIFY: Detects A-tract deletion, adjusts position +2bp │
|
|
51
|
+
│ Result: Correct 3' end position restored │
|
|
52
|
+
└─────────────────────────────────────────────────────────────────────────────┘
|
|
53
|
+
|
|
54
|
+
┌─────────────────────────────────────────────────────────────────────────────┐
|
|
55
|
+
│ EXAMPLE 2: Multiple Indels Near 3' End │
|
|
56
|
+
│ ═══════════════════════════════════════ │
|
|
57
|
+
│ │
|
|
58
|
+
│ Genome: ...TACGTTTTTTAAAAAA|GTCA... │
|
|
59
|
+
│ └────┘└────┘ │
|
|
60
|
+
│ T-tract A-tract │
|
|
61
|
+
│ │
|
|
62
|
+
│ Nanopore read: ...TACGT---TTAA-AAA|GTCA │
|
|
63
|
+
│ ↑↑↑ ↑ │
|
|
64
|
+
│ 3bp del 1bp del │
|
|
65
|
+
│ │
|
|
66
|
+
│ RECTIFY logic: │
|
|
67
|
+
│ • T-tract deletion (3bp): TRUE artifact → correct +3bp │
|
|
68
|
+
│ • A-tract deletion (1bp): TRUE artifact → correct +1bp │
|
|
69
|
+
│ • Total correction: +4bp │
|
|
70
|
+
│ │
|
|
71
|
+
│ Note: Insertions do NOT shift reference coordinates (no correction needed) │
|
|
72
|
+
└─────────────────────────────────────────────────────────────────────────────┘
|
|
73
|
+
|
|
74
|
+
┌─────────────────────────────────────────────────────────────────────────────┐
|
|
75
|
+
│ EXAMPLE 3: A-tract Ambiguity Window │
|
|
76
|
+
│ ═══════════════════════════════════ │
|
|
77
|
+
│ │
|
|
78
|
+
│ Genome: ...CGTACAAAAAAAA|GTCACC... │
|
|
79
|
+
│ └───────┘ │
|
|
80
|
+
│ 8bp A-tract │
|
|
81
|
+
│ │
|
|
82
|
+
│ Problem: Any position within the A-tract could be the true 3' end │
|
|
83
|
+
│ (indistinguishable from poly(A) tail) │
|
|
84
|
+
│ │
|
|
85
|
+
│ ...CGTACAAAAAAAA| ← could be here │
|
|
86
|
+
│ ...CGTACAAAAAAA|A ← or here │
|
|
87
|
+
│ ...CGTACAAAAAA|AA ← or here │
|
|
88
|
+
│ ...CGTACAAAAA|AAA ← etc. │
|
|
89
|
+
│ │
|
|
90
|
+
│ RECTIFY: Reports ambiguity window [pos-7, pos] with range=8 │
|
|
91
|
+
│ Confidence score reflects uncertainty │
|
|
92
|
+
└─────────────────────────────────────────────────────────────────────────────┘
|
|
93
|
+
|
|
94
|
+
┌─────────────────────────────────────────────────────────────────────────────┐
|
|
95
|
+
│ EXAMPLE 4: NET-seq Refinement │
|
|
96
|
+
│ ═════════════════════════════ │
|
|
97
|
+
│ │
|
|
98
|
+
│ Genome: ...CGTACAAAAAAAA|GTCACC... │
|
|
99
|
+
│ └───────┘ │
|
|
100
|
+
│ ambiguity window │
|
|
101
|
+
│ │
|
|
102
|
+
│ NET-seq: ▁▂▃█▇▅▂▁ │
|
|
103
|
+
│ signal: ↑ │
|
|
104
|
+
│ peak at -3 │
|
|
105
|
+
│ │
|
|
106
|
+
│ RECTIFY: Uses NET-seq Pol II occupancy to identify most likely │
|
|
107
|
+
│ termination site within the ambiguity window │
|
|
108
|
+
│ │
|
|
109
|
+
│ Result: Position refined to NET-seq peak, confidence = HIGH │
|
|
110
|
+
└─────────────────────────────────────────────────────────────────────────────┘
|
|
111
|
+
|
|
112
|
+
Pipeline Flow:
|
|
113
|
+
══════════════
|
|
114
|
+
|
|
115
|
+
┌──────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐
|
|
116
|
+
│ Input │ │ Module 1 │ │ Module 2B/C │ │ Module 3 │
|
|
117
|
+
│ BAM │───▶│ A-tract │───▶│ Poly(A) & │───▶│ NET-seq │
|
|
118
|
+
│ │ │ Ambiguity │ │ Indels │ │ Refinement │
|
|
119
|
+
└──────────┘ └─────────────┘ └─────────────┘ └─────────────┘
|
|
120
|
+
│ │ │
|
|
121
|
+
▼ ▼ ▼
|
|
122
|
+
┌───────────────────────────────────────────────┐
|
|
123
|
+
│ Corrected 3' Ends │
|
|
124
|
+
│ (position, ambiguity range, confidence) │
|
|
125
|
+
└───────────────────────────────────────────────┘
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## Installation
|
|
129
|
+
|
|
130
|
+
### From source (development)
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
git clone https://github.com/k-roy/RECTIFY.git
|
|
134
|
+
cd RECTIFY
|
|
135
|
+
pip install -e .
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### From PyPI
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
pip install rectify-rna
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## Quick Start
|
|
145
|
+
|
|
146
|
+
### QuantSeq (oligo-dT short-read)
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
rectify correct quantseq.bam \
|
|
150
|
+
--genome sacCer3.fa \
|
|
151
|
+
--annotation genes.gtf \
|
|
152
|
+
--polya-sequenced \
|
|
153
|
+
--output corrected_3ends.tsv
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
### Nanopore direct RNA-seq with NET-seq refinement
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
rectify correct nanopore.bam \
|
|
160
|
+
--genome sacCer3.fa \
|
|
161
|
+
--annotation genes.gtf \
|
|
162
|
+
--polya-sequenced \
|
|
163
|
+
--aligner minimap2 \
|
|
164
|
+
--netseq-dir churchman_bigwigs/ \
|
|
165
|
+
--output corrected_3ends.tsv
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
## Output Format
|
|
169
|
+
|
|
170
|
+
RECTIFY produces a TSV file with corrected 3' end positions and QC metrics:
|
|
171
|
+
|
|
172
|
+
```
|
|
173
|
+
read_id chrom strand raw_position corrected_position ambiguity_min ambiguity_max ambiguity_range correction_type confidence qc_flags
|
|
174
|
+
read001 chrI + 147588 147585 147583 147588 5 polya_trim high PASS
|
|
175
|
+
read002 chrI + 147593 147591 147591 147593 2 ag_mispriming medium AG_RICH
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
## Module Architecture
|
|
179
|
+
|
|
180
|
+
RECTIFY applies corrections modularly based on your data:
|
|
181
|
+
|
|
182
|
+
1. **Module 1: A-tract Ambiguity** (always applied)
|
|
183
|
+
- Identifies genomic A-tracts near 3' ends
|
|
184
|
+
- Calculates ambiguity windows
|
|
185
|
+
|
|
186
|
+
2. **Module 2A: AG Mispriming** (when oligo-dT priming used)
|
|
187
|
+
- Screens for downstream AG-richness
|
|
188
|
+
- Flags likely misprimed reads
|
|
189
|
+
|
|
190
|
+
3. **Module 2B+2C: Poly(A) Corrections** (when poly(A) IS sequenced)
|
|
191
|
+
- Models and trims poly(A) tails
|
|
192
|
+
- Detects and removes indel artifacts
|
|
193
|
+
|
|
194
|
+
4. **Module 3: NET-seq Refinement** (optional)
|
|
195
|
+
- Resolves ambiguity using NET-seq data
|
|
196
|
+
- Assigns confidence scores
|
|
197
|
+
|
|
198
|
+
## Citation
|
|
199
|
+
|
|
200
|
+
If you use RECTIFY, please cite:
|
|
201
|
+
|
|
202
|
+
**Original RECTIFY (AG mispriming correction):**
|
|
203
|
+
> Roy KR, Chanfreau GF. Robust mapping of polyadenylated and non-polyadenylated RNA 3' ends at nucleotide resolution by 3'-end sequencing. *Methods*. 2020 Apr 1;176:4-13. doi: 10.1016/j.ymeth.2019.05.016. [PMID: 31128237](https://pubmed.ncbi.nlm.nih.gov/31128237/)
|
|
204
|
+
|
|
205
|
+
**RECTIFY 2.0 (unified framework):**
|
|
206
|
+
> Manuscript in preparation
|
|
207
|
+
|
|
208
|
+
## License
|
|
209
|
+
|
|
210
|
+
MIT License - See [LICENSE](LICENSE) for details
|
|
211
|
+
|
|
212
|
+
## Contact
|
|
213
|
+
|
|
214
|
+
- Kevin R. Roy - [kevinrjroy@gmail.com](mailto:kevinrjroy@gmail.com)
|
|
215
|
+
- GitHub: [k-roy/RECTIFY](https://github.com/k-roy/RECTIFY)
|
|
216
|
+
|
|
217
|
+
## Acknowledgments
|
|
218
|
+
|
|
219
|
+
- Original RECTIFY development supported by Chanfreau Lab, UCLA
|
|
220
|
+
- NET-seq data from Churchman Lab, Harvard Medical School
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "rectify-rna"
|
|
7
|
+
version = "2.1.0"
|
|
8
|
+
description = "Unified RNA 3' End Correction Framework for poly(A)-tailed RNA sequencing"
|
|
9
|
+
authors = [
|
|
10
|
+
{name = "Kevin R. Roy", email = "kevinrjroy@gmail.com"}
|
|
11
|
+
]
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.8"
|
|
14
|
+
license = {text = "MIT"}
|
|
15
|
+
keywords = ["RNA", "3-prime-end", "polyadenylation", "RNA-seq", "genomics"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Science/Research",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.8",
|
|
22
|
+
"Programming Language :: Python :: 3.9",
|
|
23
|
+
"Programming Language :: Python :: 3.10",
|
|
24
|
+
"Programming Language :: Python :: 3.11",
|
|
25
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
dependencies = [
|
|
29
|
+
"pysam>=0.19.0",
|
|
30
|
+
"numpy>=1.20.0",
|
|
31
|
+
"pandas>=1.3.0",
|
|
32
|
+
"pyBigWig>=0.3.18",
|
|
33
|
+
"biopython>=1.79",
|
|
34
|
+
"tqdm>=4.60.0",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.optional-dependencies]
|
|
38
|
+
dev = [
|
|
39
|
+
"pytest>=7.0.0",
|
|
40
|
+
"pytest-cov>=3.0.0",
|
|
41
|
+
"black>=22.0.0",
|
|
42
|
+
"flake8>=4.0.0",
|
|
43
|
+
"mypy>=0.950",
|
|
44
|
+
]
|
|
45
|
+
docs = [
|
|
46
|
+
"sphinx>=4.5.0",
|
|
47
|
+
"sphinx-rtd-theme>=1.0.0",
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
[project.urls]
|
|
51
|
+
"Homepage" = "https://github.com/k-roy/RECTIFY"
|
|
52
|
+
"Bug Tracker" = "https://github.com/k-roy/RECTIFY/issues"
|
|
53
|
+
"Publication" = "https://pubmed.ncbi.nlm.nih.gov/31128237/"
|
|
54
|
+
|
|
55
|
+
[project.scripts]
|
|
56
|
+
rectify = "rectify.cli:main"
|
|
57
|
+
|
|
58
|
+
[tool.setuptools]
|
|
59
|
+
packages = ["rectify", "rectify.core", "rectify.utils"]
|
|
60
|
+
|
|
61
|
+
[tool.setuptools.package-data]
|
|
62
|
+
rectify = ["data/models/*.json"]
|
|
63
|
+
|
|
64
|
+
[tool.pytest.ini_options]
|
|
65
|
+
testpaths = ["tests"]
|
|
66
|
+
python_files = ["test_*.py"]
|
|
67
|
+
python_functions = ["test_*"]
|
|
68
|
+
addopts = "-v --cov=rectify --cov-report=html --cov-report=term"
|
|
69
|
+
|
|
70
|
+
[tool.black]
|
|
71
|
+
line-length = 100
|
|
72
|
+
target-version = ['py38', 'py39', 'py310', 'py311']
|
|
73
|
+
|
|
74
|
+
[tool.mypy]
|
|
75
|
+
python_version = "3.8"
|
|
76
|
+
warn_return_any = true
|
|
77
|
+
warn_unused_configs = true
|
|
78
|
+
disallow_untyped_defs = false
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RECTIFY: Unified RNA 3' End Correction Framework
|
|
3
|
+
|
|
4
|
+
A modular framework for correcting 3' end mapping artifacts in poly(A)-tailed RNA sequencing data.
|
|
5
|
+
|
|
6
|
+
Modules:
|
|
7
|
+
- A-tract ambiguity detection (universal)
|
|
8
|
+
- AG mispriming screening (oligo-dT methods)
|
|
9
|
+
- Poly(A) tail trimming and indel correction (direct RNA-seq)
|
|
10
|
+
- NET-seq refinement (optional)
|
|
11
|
+
|
|
12
|
+
Features (v2.1.0):
|
|
13
|
+
- Region-based parallel BAM processing with coverage gap splitting
|
|
14
|
+
- SLURM-aware CPU detection to prevent oversubscription
|
|
15
|
+
- Streaming output mode for large BAM files
|
|
16
|
+
|
|
17
|
+
Author: Kevin R. Roy
|
|
18
|
+
License: MIT
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
__version__ = "2.1.0"
|
|
22
|
+
__author__ = "Kevin R. Roy"
|
|
23
|
+
__email__ = "kevinroy@stanford.edu"
|
|
24
|
+
|
|
25
|
+
from . import core, utils, slurm
|
|
26
|
+
from .slurm import get_available_cpus, set_thread_limits, is_slurm_job
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"core",
|
|
30
|
+
"utils",
|
|
31
|
+
"slurm",
|
|
32
|
+
"get_available_cpus",
|
|
33
|
+
"set_thread_limits",
|
|
34
|
+
"is_slurm_job",
|
|
35
|
+
"__version__",
|
|
36
|
+
]
|