rectify-rna 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rectify/__init__.py ADDED
@@ -0,0 +1,36 @@
1
+ """
2
+ RECTIFY: Unified RNA 3' End Correction Framework
3
+
4
+ A modular framework for correcting 3' end mapping artifacts in poly(A)-tailed RNA sequencing data.
5
+
6
+ Modules:
7
+ - A-tract ambiguity detection (universal)
8
+ - AG mispriming screening (oligo-dT methods)
9
+ - Poly(A) tail trimming and indel correction (direct RNA-seq)
10
+ - NET-seq refinement (optional)
11
+
12
+ Features (v2.1.0):
13
+ - Region-based parallel BAM processing with coverage gap splitting
14
+ - SLURM-aware CPU detection to prevent oversubscription
15
+ - Streaming output mode for large BAM files
16
+
17
+ Author: Kevin R. Roy
18
+ License: MIT
19
+ """
20
+
21
# Package metadata, exposed as module-level dunder attributes.
__version__ = "2.1.0"
__author__ = "Kevin R. Roy"
__email__ = "kevinroy@stanford.edu"
24
+
25
+ from . import core, utils, slurm
26
+ from .slurm import get_available_cpus, set_thread_limits, is_slurm_job
27
+
28
# Names exported by ``from rectify import *``: the three subpackages/modules,
# the SLURM helper functions imported above, and the version string.
__all__ = [
    "core", "utils", "slurm",
    "get_available_cpus", "set_thread_limits", "is_slurm_job",
    "__version__",
]
rectify/__main__.py ADDED
@@ -0,0 +1,9 @@
1
#!/usr/bin/env python3
"""Module entry point: lets ``python -m rectify`` start the RECTIFY CLI."""

from .cli import main

# Only launch the CLI when executed as a script/module, not when imported.
if __name__ == '__main__':
    main()
rectify/cli.py ADDED
@@ -0,0 +1,376 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ RECTIFY command-line interface.
4
+
5
+ Provides commands for:
6
+ - correct: Correct 3' end positions in BAM files
7
+ - train-polya: Train poly(A) tail model from control data
8
+ - validate: Validate corrections against NET-seq or other ground truth
9
+
10
+ Author: Kevin R. Roy
11
+ Date: 2026-03-09
12
+ """
13
+
14
+ import argparse
15
+ import sys
16
+ from pathlib import Path
17
+ from typing import Optional
18
+
19
+ from . import __version__
20
+
21
+
22
def create_parser() -> argparse.ArgumentParser:
    """Create main argument parser.

    Builds the top-level ``rectify`` parser with a ``--version`` flag and the
    three sub-commands ``correct``, ``train-polya`` and ``validate``. The name
    of the chosen sub-command is stored on the parsed namespace as ``command``.

    Returns:
        The fully configured parser.
    """
    parser = argparse.ArgumentParser(
        prog='rectify',
        description="RECTIFY: Unified RNA 3' End Correction Framework",
        # Raw formatter keeps the hand-formatted epilog layout intact.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # QuantSeq (oligo-dT short-read)
  rectify correct quantseq.bam --genome sacCer3.fa --annotation genes.gtf --polya-sequenced -o corrected.tsv

  # Nanopore direct RNA-seq with NET-seq refinement
  rectify correct nanopore.bam --genome sacCer3.fa --annotation genes.gtf --polya-sequenced \\
      --aligner minimap2 --netseq-dir bigwigs/ -o corrected.tsv

  # Train poly(A) model
  rectify train-polya nanopore.bam --genome sacCer3.fa --control-sites cpa_clusters.tsv -o model.json

Citation:
  Roy, K. R., & Chanfreau, G. F. (2019). RECTIFY: Identification and correction of mRNA
  mis-termination caused by oligo(dT)-primed internal priming. Nucleic Acids Research, 47(16), e96.
"""
    )

    parser.add_argument(
        '--version',
        action='version',
        version=f'RECTIFY {__version__}',
    )

    subparsers = parser.add_subparsers(dest='command', help='Available commands')
    _add_correct_parser(subparsers)
    _add_train_polya_parser(subparsers)
    _add_validate_parser(subparsers)

    return parser


def _add_correct_parser(subparsers) -> None:
    """Register the ``correct`` sub-command and all of its options."""
    correct_parser = subparsers.add_parser(
        'correct',
        help="Correct 3' end positions in BAM file",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Required arguments
    correct_parser.add_argument(
        'bam',
        type=Path,
        help='Input BAM file (aligned RNA-seq reads)',
    )
    correct_parser.add_argument(
        '--genome',
        type=Path,
        required=True,
        help='Reference genome FASTA file (indexed with .fai)',
    )
    correct_parser.add_argument(
        '-o', '--output',
        type=Path,
        required=True,
        help="Output TSV file with corrected 3' end positions",
    )

    # Optional arguments
    correct_parser.add_argument(
        '--annotation',
        type=Path,
        help='Gene annotation file (GTF/GFF) for AG mispriming context',
    )

    # Technology flags
    tech_group = correct_parser.add_argument_group('Technology settings')
    tech_group.add_argument(
        '--polya-sequenced',
        action='store_true',
        help='Poly(A) tail IS sequenced (enables poly(A) trimming and indel correction). '
             'Use for: nanopore direct RNA, Helicos, QuantSeq, etc.',
    )
    tech_group.add_argument(
        '--aligner',
        choices=['minimap2', 'bwa', 'star', 'auto'],
        default='auto',
        help='Aligner used (affects indel artifact detection)',
    )

    # Module flags
    module_group = correct_parser.add_argument_group('Module selection')
    module_group.add_argument(
        '--skip-atract-check',
        action='store_true',
        help='Skip A-tract ambiguity detection (for organisms without A-tracts)',
    )
    module_group.add_argument(
        '--skip-ag-check',
        action='store_true',
        help='Skip AG mispriming screening',
    )
    module_group.add_argument(
        '--skip-polya-trim',
        action='store_true',
        help='Skip poly(A) tail trimming (even if --polya-sequenced)',
    )
    module_group.add_argument(
        '--skip-indel-correction',
        action='store_true',
        help='Skip indel artifact correction (even if --polya-sequenced)',
    )

    # Poly(A) model
    polya_group = correct_parser.add_argument_group('Poly(A) tail model')
    polya_group.add_argument(
        '--polya-model',
        type=Path,
        help='Pre-trained poly(A) tail model (JSON). If not provided, uses built-in model.',
    )

    # NET-seq refinement
    netseq_group = correct_parser.add_argument_group('NET-seq refinement')
    netseq_group.add_argument(
        '--netseq-dir',
        type=Path,
        help='Directory containing NET-seq BigWig files (.bw) for refinement',
    )
    netseq_group.add_argument(
        '--netseq-samples',
        nargs='+',
        help='NET-seq sample names to use (e.g., wt_2022_rep1). If not provided, auto-detect.',
    )

    # Thresholds and parameters
    param_group = correct_parser.add_argument_group('Parameters')
    param_group.add_argument(
        '--ag-threshold',
        type=float,
        default=0.65,
        help='AG-richness threshold for mispriming flagging (0.0-1.0)',
    )
    param_group.add_argument(
        '--polya-richness',
        type=float,
        default=0.8,
        help='A-richness threshold for poly(A) tail detection (0.0-1.0)',
    )
    param_group.add_argument(
        '--min-polya-length',
        type=int,
        default=15,
        help='Minimum poly(A) tail length for nanopore oligo-dT priming',
    )

    # Output options
    output_group = correct_parser.add_argument_group('Output options')
    output_group.add_argument(
        '--report',
        type=Path,
        help='Output QC report file (HTML or PDF)',
    )
    output_group.add_argument(
        '--verbose',
        action='store_true',
        help='Verbose logging',
    )

    # Performance options
    perf_group = correct_parser.add_argument_group('Performance options')
    # ArgumentDefaultsHelpFormatter appends " (default: ...)" on its own, so
    # the help texts below must not spell the default out a second time.
    perf_group.add_argument(
        '-j', '--threads',
        type=int,
        default=0,
        help='Number of threads for parallel processing. '
             '0 = auto-detect from SLURM_CPUS_PER_TASK or system',
    )
    perf_group.add_argument(
        '--streaming',
        action='store_true',
        help='Use streaming output mode to minimize memory usage. '
             'Recommended for BAM files > 10GB. Writes directly to output file.',
    )
    perf_group.add_argument(
        '--chunk-size',
        type=int,
        default=10000,
        help='Number of reads per output chunk in streaming mode',
    )


def _add_train_polya_parser(subparsers) -> None:
    """Register the ``train-polya`` sub-command and all of its options."""
    train_parser = subparsers.add_parser(
        'train-polya',
        help='Train poly(A) tail model from control data',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    train_parser.add_argument(
        'bam',
        type=Path,
        help='Input BAM file with soft-clipped poly(A) tails',
    )
    train_parser.add_argument(
        '--genome',
        type=Path,
        required=True,
        help='Reference genome FASTA file',
    )
    train_parser.add_argument(
        '--control-sites',
        type=Path,
        required=True,
        help='TSV file with control CPA sites (0A downstream A-count)',
    )
    train_parser.add_argument(
        '-o', '--output',
        type=Path,
        required=True,
        help='Output model file (JSON)',
    )
    train_parser.add_argument(
        '--min-reads',
        type=int,
        default=10,
        help='Minimum reads per control site for training',
    )
    train_parser.add_argument(
        '--verbose',
        action='store_true',
        help='Verbose logging',
    )


def _add_validate_parser(subparsers) -> None:
    """Register the ``validate`` sub-command and all of its options."""
    validate_parser = subparsers.add_parser(
        'validate',
        help='Validate corrections against ground truth',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Required arguments
    validate_parser.add_argument(
        'corrected',
        type=Path,
        help="Corrected 3' ends TSV from RECTIFY",
    )
    validate_parser.add_argument(
        '-o', '--output',
        type=Path,
        required=True,
        help='Output validation results TSV',
    )

    # Ground truth sources. The "at least one required" rule is only stated in
    # the group title; the parser itself does not enforce it.
    truth_group = validate_parser.add_argument_group(
        'Ground truth sources (at least one required)'
    )
    truth_group.add_argument(
        '--netseq-dir',
        type=Path,
        help='NET-seq BigWig directory (optional - requires pyBigWig)',
    )
    truth_group.add_argument(
        '--netseq-samples',
        nargs='+',
        help='Specific NET-seq samples to use',
    )
    truth_group.add_argument(
        '--annotation',
        type=Path,
        help="Gene annotation GTF/GFF with known 3' ends",
    )
    truth_group.add_argument(
        '--ground-truth',
        type=Path,
        help="TSV file with known true 3' end positions",
    )

    # Validation parameters
    param_group = validate_parser.add_argument_group('Validation parameters')
    param_group.add_argument(
        '--tolerance',
        type=int,
        default=1,
        help='Position tolerance in bp for "correct" classification',
    )
    param_group.add_argument(
        '--min-signal',
        type=float,
        default=0.5,
        help='Minimum NET-seq signal for ground truth',
    )
    param_group.add_argument(
        '--search-window',
        type=int,
        default=10,
        help='Window size for finding nearest ground truth',
    )

    # Output options
    validate_parser.add_argument(
        '--verbose',
        action='store_true',
        help='Verbose logging',
    )
349
+
350
+
351
def main(argv: Optional[list] = None):
    """Parse the command line and dispatch to the selected sub-command.

    Args:
        argv: Argument list handed to ``parse_args``; ``None`` lets argparse
            read the process arguments.

    When no recognised sub-command was given, the help text is printed and the
    process exits with status 1.
    """
    parser = create_parser()
    args = parser.parse_args(argv)
    command = args.command

    # Each command module is imported lazily, only in the branch that runs it.
    if command == 'correct':
        from .core import correct_command as handler
    elif command == 'train-polya':
        from .core import train_polya_command as handler
    elif command == 'validate':
        from .core import validate_command as handler
    else:
        # Reached for a missing sub-command (None) or any unknown value.
        parser.print_help()
        sys.exit(1)

    handler.run(args)


if __name__ == '__main__':
    main()
rectify/config.py ADDED
@@ -0,0 +1,208 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Configuration constants for RECTIFY.
4
+
5
+ This module centralizes:
6
+ - Chromosome mappings (standard ↔ NCBI format)
7
+ - Shift correction parameters (from NET-seq analysis)
8
+ - Poly(A) tail model parameters
9
+ - Indel detection parameters
10
+ - NET-seq refinement parameters
11
+
12
+ Author: Kevin R. Roy
13
+ Date: 2026-03-09
14
+ """
15
+
16
+ from typing import Dict
17
+
18
+ # =============================================================================
19
+ # Version
20
+ # =============================================================================
21
+
22
__version__ = "2.1.0"

# =============================================================================
# Chromosome Mappings
# =============================================================================

# S. cerevisiae chromosome name -> NCBI-style reference identifier
CHROM_TO_GENOME: Dict[str, str] = {
    'chrI':    'ref|NC_001133|',
    'chrII':   'ref|NC_001134|',
    'chrIII':  'ref|NC_001135|',
    'chrIV':   'ref|NC_001136|',
    'chrV':    'ref|NC_001137|',
    'chrVI':   'ref|NC_001138|',
    'chrVII':  'ref|NC_001139|',
    'chrVIII': 'ref|NC_001140|',
    'chrIX':   'ref|NC_001141|',
    'chrX':    'ref|NC_001142|',
    'chrXI':   'ref|NC_001143|',
    'chrXII':  'ref|NC_001144|',
    'chrXIII': 'ref|NC_001145|',
    'chrXIV':  'ref|NC_001146|',
    'chrXV':   'ref|NC_001147|',
    'chrXVI':  'ref|NC_001148|',
    'chrMito': 'ref|NC_001224|',
}

# Inverse lookup: NCBI-style identifier -> chromosome name
GENOME_TO_CHROM: Dict[str, str] = {
    accession: chrom for chrom, accession in CHROM_TO_GENOME.items()
}

# Chromosome lengths in bp (sacCer3/R64)
CHROM_SIZES: Dict[str, int] = {
    'chrI':    230218,
    'chrII':   813184,
    'chrIII':  316620,
    'chrIV':   1531933,
    'chrV':    576874,
    'chrVI':   270161,
    'chrVII':  1090940,
    'chrVIII': 562643,
    'chrIX':   439888,
    'chrX':    745751,
    'chrXI':   666816,
    'chrXII':  1078177,
    'chrXIII': 924431,
    'chrXIV':  784333,
    'chrXV':   1091291,
    'chrXVI':  948066,
    'chrMito': 85779,
}
72
+
73
+ # =============================================================================
74
+ # Shift Correction Parameters (from NET-seq Analysis)
75
+ # =============================================================================
76
+
77
# Apparent 3' end positions are displaced RIGHTWARD (downstream) when long
# poly(A) tails (>15 A's for nanopore) align onto genomic A's/T's.
# The values below are how far to move LEFTWARD (upstream) to undo that,
# indexed by the downstream A-count (position in the tuple == A-count).
SHIFT_CORRECTIONS_BY_ACOUNT: Dict[int, float] = dict(enumerate((
    0.0,   # 0 downstream A's - no shift
    0.2,   # 1 - minimal shift
    0.3,   # 2
    0.4,   # 3
    1.0,   # 4 - moderate shift
    1.3,   # 5
    1.7,   # 6
    2.6,   # 7 - strong shift
    2.8,   # 8
    2.9,   # 9
    3.8,   # 10 - maximum observed shift (saturated)
)))

# A-counts above 10 reuse the saturated 10A value
DEFAULT_MAX_SHIFT: float = 3.8
96
+
97
+ # =============================================================================
98
+ # Poly(A) Tail Model Parameters
99
+ # =============================================================================
100
+
101
# Shortest tail length for nanopore oligo-dT priming
MIN_POLYA_LENGTH: int = 15

# Fraction of A's required to call a stretch a poly(A) tail (80% A content)
POLYA_RICHNESS_THRESHOLD: float = 0.8

# Width of the window over which A-richness is computed
POLYA_WINDOW_SIZE: int = 10

# RTA adapter patterns to detect
ADAPTER_POLY_T_MIN: int = 6  # shortest poly(T) run that counts
ADAPTER_TC_MOTIFS = ['TC', 'TCTC', 'TCT']

# Score cut-offs used when classifying a tail
TAIL_SCORE_HIGH: float = 0.7  # definite poly(A) tail
TAIL_SCORE_LOW: float = 0.4   # uncertain

# =============================================================================
# Indel Detection Parameters
# =============================================================================

# Deletions further than this from the 3' end are not treated as artifacts (bp)
INDEL_SEARCH_WINDOW: int = 20

# Largest deletion still considered an artifact (bp)
INDEL_MAX_SIZE: int = 3

# Flanking A-richness needed to classify a deletion as artifact (70% A content)
INDEL_FLANK_A_THRESHOLD: float = 0.7

# Shortest flank on which A-richness is evaluated (bp)
INDEL_MIN_FLANK_LENGTH: int = 5

# =============================================================================
# Ambiguity Range Parameters
# =============================================================================

# Upper bound on the shift considered when computing ambiguity ranges (bp)
MAX_AMBIGUITY_SHIFT: int = 5

# Width of the downstream window used for the A-count (bp)
DOWNSTREAM_WINDOW_SIZE: int = 10

# =============================================================================
# NET-seq Refinement Parameters
# =============================================================================

# Agreement needed to combine WT and dst1 NET-seq data (90% agreement at ±1bp)
NETSEQ_AGREEMENT_THRESHOLD: float = 0.90

# Half-width of the window around the CPA searched for peaks (±5bp)
NETSEQ_PEAK_WINDOW: int = 5

# Peak-calling signal floor, relative to the maximum (50% of max signal)
NETSEQ_PEAK_THRESHOLD: float = 0.5

# Peak signal levels for high / medium confidence
NETSEQ_SIGNAL_HIGH: float = 1.0
NETSEQ_SIGNAL_MEDIUM: float = 0.5

# Peaks no further apart than this are considered "close" (bp)
NETSEQ_PEAK_CLOSE_DISTANCE: int = 2

# =============================================================================
# AG Mispriming Parameters (from original RECTIFY)
# =============================================================================

# Downstream window over which AG-richness is computed (bp)
AG_RICHNESS_WINDOW: int = 50

# A+G fraction at which a site is flagged as likely mispriming (65% A+G content)
AG_RICHNESS_THRESHOLD: float = 0.65

# Smallest window allowed when close to a chromosome end (bp)
AG_RICHNESS_MIN_WINDOW: int = 20
176
+
177
+ # =============================================================================
178
+ # Helper Functions
179
+ # =============================================================================
180
+
181
def get_shift_from_acount(a_count: int) -> float:
    """
    Look up the shift correction that belongs to a downstream A-count.

    Args:
        a_count: Number of A's in downstream window

    Returns:
        Shift correction in bp (how much to shift leftward/upstream).
        Counts above 10 yield DEFAULT_MAX_SHIFT; counts of 10 or fewer are
        read from SHIFT_CORRECTIONS_BY_ACOUNT, falling back to 0.0 when the
        table has no entry for that count.
    """
    within_table = a_count <= 10
    return (
        SHIFT_CORRECTIONS_BY_ACOUNT.get(a_count, 0.0)
        if within_table
        else DEFAULT_MAX_SHIFT
    )
195
+
196
+
197
def validate_config():
    """Validate configuration parameters.

    The checks raise explicitly instead of relying on ``assert`` statements,
    because asserts are removed when Python runs with ``-O`` and the
    validation would then silently do nothing. ``AssertionError`` is kept as
    the exception type so failures look the same as before.

    Raises:
        AssertionError: If the chromosome tables are inconsistent, a richness
            threshold lies outside [0.0, 1.0], or a length/window parameter
            is not positive. The message names the first failing check.
    """
    checks = (
        # The reverse mapping is shorter than the forward one if two
        # chromosomes share the same identifier.
        (len(CHROM_TO_GENOME) == len(GENOME_TO_CHROM),
         "Chromosome mapping mismatch"),
        # Compare the chromosome names themselves, not just the counts, so a
        # missing or misspelled chromosome in CHROM_SIZES is caught.
        (CHROM_TO_GENOME.keys() == CHROM_SIZES.keys(),
         "Chromosome size mapping incomplete"),
        (0.0 <= POLYA_RICHNESS_THRESHOLD <= 1.0,
         "Invalid poly(A) richness threshold"),
        (0.0 <= AG_RICHNESS_THRESHOLD <= 1.0,
         "Invalid AG richness threshold"),
        (MIN_POLYA_LENGTH > 0,
         "Invalid minimum poly(A) length"),
        (INDEL_SEARCH_WINDOW > 0,
         "Invalid indel search window"),
    )
    for passed, message in checks:
        if not passed:
            raise AssertionError(message)


# Run validation on import
validate_config()
rectify/core/__init__.py ADDED
@@ -0,0 +1,32 @@
1
+ """
2
+ RECTIFY core modules.
3
+
4
+ This package contains the main correction algorithms:
5
+ - bam_processor: BAM reading and 3' end extraction
6
+ - atract_detector: A-tract ambiguity detection (universal)
7
+ - ag_mispriming: AG-richness screening (oligo-dT methods)
8
+ - polya_trimmer: Poly(A) tail modeling and trimming
9
+ - indel_corrector: Indel artifact removal
10
+ - netseq_refiner: NET-seq peak matching and refinement
11
+ - spikein_filter: Spike-in RNA detection and filtering
12
+ - output_writer: Unified output format
13
+
14
+ And CLI command implementations:
15
+ - correct_command: Main correction workflow
16
+ - train_polya_command: Poly(A) model training
17
+ - validate_command: Validation against NET-seq
18
+ """
19
+
20
# Submodule names exported by ``from rectify.core import *``.
__all__ = [
    # correction algorithms
    "bam_processor", "atract_detector", "ag_mispriming", "polya_trimmer",
    "indel_corrector", "netseq_refiner", "spikein_filter", "output_writer",
    # CLI command implementations
    "correct_command", "train_polya_command", "validate_command",
]