rectify-rna 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rectify/__init__.py +36 -0
- rectify/__main__.py +9 -0
- rectify/cli.py +376 -0
- rectify/config.py +208 -0
- rectify/core/__init__.py +32 -0
- rectify/core/ag_mispriming.py +280 -0
- rectify/core/atract_detector.py +440 -0
- rectify/core/bam_processor.py +907 -0
- rectify/core/correct_command.py +240 -0
- rectify/core/indel_corrector.py +588 -0
- rectify/core/netseq_refiner.py +487 -0
- rectify/core/polya_model.py +377 -0
- rectify/core/polya_trimmer.py +396 -0
- rectify/core/spikein_filter.py +786 -0
- rectify/core/train_polya_command.py +632 -0
- rectify/core/validate_command.py +961 -0
- rectify/slurm.py +111 -0
- rectify/utils/__init__.py +14 -0
- rectify/utils/alignment.py +470 -0
- rectify/utils/genome.py +391 -0
- rectify/utils/stats.py +281 -0
- rectify_rna-2.1.0.dist-info/METADATA +259 -0
- rectify_rna-2.1.0.dist-info/RECORD +27 -0
- rectify_rna-2.1.0.dist-info/WHEEL +5 -0
- rectify_rna-2.1.0.dist-info/entry_points.txt +2 -0
- rectify_rna-2.1.0.dist-info/licenses/LICENSE +21 -0
- rectify_rna-2.1.0.dist-info/top_level.txt +1 -0
rectify/__init__.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RECTIFY: Unified RNA 3' End Correction Framework
|
|
3
|
+
|
|
4
|
+
A modular framework for correcting 3' end mapping artifacts in poly(A)-tailed RNA sequencing data.
|
|
5
|
+
|
|
6
|
+
Modules:
|
|
7
|
+
- A-tract ambiguity detection (universal)
|
|
8
|
+
- AG mispriming screening (oligo-dT methods)
|
|
9
|
+
- Poly(A) tail trimming and indel correction (direct RNA-seq)
|
|
10
|
+
- NET-seq refinement (optional)
|
|
11
|
+
|
|
12
|
+
Features (v2.1.0):
|
|
13
|
+
- Region-based parallel BAM processing with coverage gap splitting
|
|
14
|
+
- SLURM-aware CPU detection to prevent oversubscription
|
|
15
|
+
- Streaming output mode for large BAM files
|
|
16
|
+
|
|
17
|
+
Author: Kevin R. Roy
|
|
18
|
+
License: MIT
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
# Package metadata.
__version__ = "2.1.0"
__author__ = "Kevin R. Roy"
__email__ = "kevinroy@stanford.edu"

# Bind the subpackages and the SLURM helpers as attributes of the top-level
# package so they can be reached as ``rectify.<name>``.
from . import core, utils, slurm
from .slurm import get_available_cpus, set_thread_limits, is_slurm_job

# Names exported by ``from rectify import *``.
__all__ = [
    # subpackages / modules
    "core",
    "utils",
    "slurm",
    # SLURM helpers re-exported from ``rectify.slurm``
    "get_available_cpus",
    "set_thread_limits",
    "is_slurm_job",
    # metadata
    "__version__",
]
|
rectify/__main__.py
ADDED
rectify/cli.py
ADDED
|
@@ -0,0 +1,376 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
RECTIFY command-line interface.
|
|
4
|
+
|
|
5
|
+
Provides commands for:
|
|
6
|
+
- correct: Correct 3' end positions in BAM files
|
|
7
|
+
- train-polya: Train poly(A) tail model from control data
|
|
8
|
+
- validate: Validate corrections against NET-seq or other ground truth
|
|
9
|
+
|
|
10
|
+
Author: Kevin R. Roy
|
|
11
|
+
Date: 2026-03-09
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import argparse
|
|
15
|
+
import sys
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Optional
|
|
18
|
+
|
|
19
|
+
from . import __version__
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def create_parser() -> argparse.ArgumentParser:
    """Create the top-level ``rectify`` argument parser.

    The parser exposes a ``--version`` flag and three sub-commands
    (``correct``, ``train-polya`` and ``validate``). The chosen sub-command
    name is stored on the parsed namespace as ``command``; it is ``None``
    when no sub-command is given.

    Returns:
        The fully configured parser.
    """
    parser = argparse.ArgumentParser(
        prog='rectify',
        description="RECTIFY: Unified RNA 3' End Correction Framework",
        # Raw formatter so the hand-laid-out examples below are not re-wrapped.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # QuantSeq (oligo-dT short-read)
  rectify correct quantseq.bam --genome sacCer3.fa --annotation genes.gtf --polya-sequenced -o corrected.tsv

  # Nanopore direct RNA-seq with NET-seq refinement
  rectify correct nanopore.bam --genome sacCer3.fa --annotation genes.gtf --polya-sequenced \\
      --aligner minimap2 --netseq-dir bigwigs/ -o corrected.tsv

  # Train poly(A) model
  rectify train-polya nanopore.bam --genome sacCer3.fa --control-sites cpa_clusters.tsv -o model.json

Citation:
  Roy, K. R., & Chanfreau, G. F. (2019). RECTIFY: Identification and correction of mRNA
  mis-termination caused by oligo(dT)-primed internal priming. Nucleic Acids Research, 47(16), e96.
""",
    )

    parser.add_argument(
        '--version',
        action='version',
        version=f'RECTIFY {__version__}',
    )

    subparsers = parser.add_subparsers(dest='command', help='Available commands')
    _add_correct_parser(subparsers)
    _add_train_polya_parser(subparsers)
    _add_validate_parser(subparsers)

    return parser


def _add_correct_parser(subparsers) -> None:
    """Register the ``correct`` sub-command (3' end correction of a BAM file)."""
    correct_parser = subparsers.add_parser(
        'correct',
        help="Correct 3' end positions in BAM file",
        # This formatter appends "(default: ...)" to every option's help, so
        # help strings must not spell the default out themselves.
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Required arguments
    correct_parser.add_argument(
        'bam',
        type=Path,
        help='Input BAM file (aligned RNA-seq reads)',
    )
    correct_parser.add_argument(
        '--genome',
        type=Path,
        required=True,
        help='Reference genome FASTA file (indexed with .fai)',
    )
    correct_parser.add_argument(
        '-o', '--output',
        type=Path,
        required=True,
        help="Output TSV file with corrected 3' end positions",
    )

    # Optional arguments
    correct_parser.add_argument(
        '--annotation',
        type=Path,
        help='Gene annotation file (GTF/GFF) for AG mispriming context',
    )

    # Technology flags
    tech_group = correct_parser.add_argument_group('Technology settings')
    tech_group.add_argument(
        '--polya-sequenced',
        action='store_true',
        help='Poly(A) tail IS sequenced (enables poly(A) trimming and indel correction). '
             'Use for: nanopore direct RNA, Helicos, QuantSeq, etc.',
    )
    tech_group.add_argument(
        '--aligner',
        choices=['minimap2', 'bwa', 'star', 'auto'],
        default='auto',
        help='Aligner used (affects indel artifact detection)',
    )

    # Module flags
    module_group = correct_parser.add_argument_group('Module selection')
    module_group.add_argument(
        '--skip-atract-check',
        action='store_true',
        help='Skip A-tract ambiguity detection (for organisms without A-tracts)',
    )
    module_group.add_argument(
        '--skip-ag-check',
        action='store_true',
        help='Skip AG mispriming screening',
    )
    module_group.add_argument(
        '--skip-polya-trim',
        action='store_true',
        help='Skip poly(A) tail trimming (even if --polya-sequenced)',
    )
    module_group.add_argument(
        '--skip-indel-correction',
        action='store_true',
        help='Skip indel artifact correction (even if --polya-sequenced)',
    )

    # Poly(A) model
    polya_group = correct_parser.add_argument_group('Poly(A) tail model')
    polya_group.add_argument(
        '--polya-model',
        type=Path,
        help='Pre-trained poly(A) tail model (JSON). If not provided, uses built-in model.',
    )

    # NET-seq refinement
    netseq_group = correct_parser.add_argument_group('NET-seq refinement')
    netseq_group.add_argument(
        '--netseq-dir',
        type=Path,
        help='Directory containing NET-seq BigWig files (.bw) for refinement',
    )
    netseq_group.add_argument(
        '--netseq-samples',
        nargs='+',
        help='NET-seq sample names to use (e.g., wt_2022_rep1). If not provided, auto-detect.',
    )

    # Thresholds and parameters
    threshold_group = correct_parser.add_argument_group('Parameters')
    threshold_group.add_argument(
        '--ag-threshold',
        type=float,
        default=0.65,
        help='AG-richness threshold for mispriming flagging (0.0-1.0)',
    )
    threshold_group.add_argument(
        '--polya-richness',
        type=float,
        default=0.8,
        help='A-richness threshold for poly(A) tail detection (0.0-1.0)',
    )
    threshold_group.add_argument(
        '--min-polya-length',
        type=int,
        default=15,
        help='Minimum poly(A) tail length for nanopore oligo-dT priming',
    )

    # Output options
    output_group = correct_parser.add_argument_group('Output options')
    output_group.add_argument(
        '--report',
        type=Path,
        help='Output QC report file (HTML or PDF)',
    )
    output_group.add_argument(
        '--verbose',
        action='store_true',
        help='Verbose logging',
    )

    # Performance options
    perf_group = correct_parser.add_argument_group('Performance options')
    perf_group.add_argument(
        '-j', '--threads',
        type=int,
        default=0,
        # No literal "(default: 0)" here: the formatter already appends it.
        help='Number of threads for parallel processing. '
             '0 = auto-detect from SLURM_CPUS_PER_TASK or system',
    )
    perf_group.add_argument(
        '--streaming',
        action='store_true',
        help='Use streaming output mode to minimize memory usage. '
             'Recommended for BAM files > 10GB. Writes directly to output file.',
    )
    perf_group.add_argument(
        '--chunk-size',
        type=int,
        default=10000,
        # No literal "(default: 10000)" here: the formatter already appends it.
        help='Number of reads per output chunk in streaming mode',
    )


def _add_train_polya_parser(subparsers) -> None:
    """Register the ``train-polya`` sub-command (poly(A) tail model training)."""
    train_parser = subparsers.add_parser(
        'train-polya',
        help='Train poly(A) tail model from control data',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    train_parser.add_argument(
        'bam',
        type=Path,
        help='Input BAM file with soft-clipped poly(A) tails',
    )
    train_parser.add_argument(
        '--genome',
        type=Path,
        required=True,
        help='Reference genome FASTA file',
    )
    train_parser.add_argument(
        '--control-sites',
        type=Path,
        required=True,
        help='TSV file with control CPA sites (0A downstream A-count)',
    )
    train_parser.add_argument(
        '-o', '--output',
        type=Path,
        required=True,
        help='Output model file (JSON)',
    )
    train_parser.add_argument(
        '--min-reads',
        type=int,
        default=10,
        help='Minimum reads per control site for training',
    )
    train_parser.add_argument(
        '--verbose',
        action='store_true',
        help='Verbose logging',
    )


def _add_validate_parser(subparsers) -> None:
    """Register the ``validate`` sub-command (comparison against ground truth)."""
    validate_parser = subparsers.add_parser(
        'validate',
        help='Validate corrections against ground truth',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Required arguments
    validate_parser.add_argument(
        'corrected',
        type=Path,
        help="Corrected 3' ends TSV from RECTIFY",
    )
    validate_parser.add_argument(
        '-o', '--output',
        type=Path,
        required=True,
        help='Output validation results TSV',
    )

    # Ground truth sources. The "at least one required" rule is only stated in
    # the group title; the parser itself does not enforce it.
    truth_group = validate_parser.add_argument_group(
        'Ground truth sources (at least one required)'
    )
    truth_group.add_argument(
        '--netseq-dir',
        type=Path,
        help='NET-seq BigWig directory (optional - requires pyBigWig)',
    )
    truth_group.add_argument(
        '--netseq-samples',
        nargs='+',
        help='Specific NET-seq samples to use',
    )
    truth_group.add_argument(
        '--annotation',
        type=Path,
        help="Gene annotation GTF/GFF with known 3' ends",
    )
    truth_group.add_argument(
        '--ground-truth',
        type=Path,
        help="TSV file with known true 3' end positions",
    )

    # Validation parameters
    validation_group = validate_parser.add_argument_group('Validation parameters')
    validation_group.add_argument(
        '--tolerance',
        type=int,
        default=1,
        help='Position tolerance in bp for "correct" classification',
    )
    validation_group.add_argument(
        '--min-signal',
        type=float,
        default=0.5,
        help='Minimum NET-seq signal for ground truth',
    )
    validation_group.add_argument(
        '--search-window',
        type=int,
        default=10,
        help='Window size for finding nearest ground truth',
    )

    # Output options
    validate_parser.add_argument(
        '--verbose',
        action='store_true',
        help='Verbose logging',
    )
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def main(argv: Optional[list] = None):
    """Run the RECTIFY command-line interface.

    Args:
        argv: Argument list handed to ``parse_args``; ``None`` makes argparse
            read ``sys.argv[1:]``.

    The selected sub-command's module is imported on demand and its
    ``run(args)`` function is called with the parsed namespace. When no
    (or an unrecognised) sub-command is selected, the help text is printed
    and the process exits with status 1.
    """
    cli = create_parser()
    options = cli.parse_args(argv)
    selected = options.command

    # Import commands only when needed.
    if selected == 'correct':
        from .core import correct_command as handler
    elif selected == 'train-polya':
        from .core import train_polya_command as handler
    elif selected == 'validate':
        from .core import validate_command as handler
    else:
        # Covers both "no sub-command given" (command is None) and any
        # value not handled above.
        cli.print_help()
        sys.exit(1)

    handler.run(options)


if __name__ == '__main__':
    main()
|
rectify/config.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Configuration constants for RECTIFY.
|
|
4
|
+
|
|
5
|
+
This module centralizes:
|
|
6
|
+
- Chromosome mappings (standard ↔ NCBI format)
|
|
7
|
+
- Shift correction parameters (from NET-seq analysis)
|
|
8
|
+
- Poly(A) tail model parameters
|
|
9
|
+
- Indel detection parameters
|
|
10
|
+
- NET-seq refinement parameters
|
|
11
|
+
|
|
12
|
+
Author: Kevin R. Roy
|
|
13
|
+
Date: 2026-03-09
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from typing import Dict, Optional
|
|
17
|
+
|
|
18
|
+
# =============================================================================
# Version
# =============================================================================

__version__ = "2.1.0"

# =============================================================================
# Chromosome Mappings
# =============================================================================

# Standard chromosome to NCBI genome format mapping (S. cerevisiae)
CHROM_TO_GENOME: Dict[str, str] = {
    'chrI': 'ref|NC_001133|',
    'chrII': 'ref|NC_001134|',
    'chrIII': 'ref|NC_001135|',
    'chrIV': 'ref|NC_001136|',
    'chrV': 'ref|NC_001137|',
    'chrVI': 'ref|NC_001138|',
    'chrVII': 'ref|NC_001139|',
    'chrVIII': 'ref|NC_001140|',
    'chrIX': 'ref|NC_001141|',
    'chrX': 'ref|NC_001142|',
    'chrXI': 'ref|NC_001143|',
    'chrXII': 'ref|NC_001144|',
    'chrXIII': 'ref|NC_001145|',
    'chrXIV': 'ref|NC_001146|',
    'chrXV': 'ref|NC_001147|',
    'chrXVI': 'ref|NC_001148|',
    'chrMito': 'ref|NC_001224|',
}

# Reverse mapping
GENOME_TO_CHROM: Dict[str, str] = {v: k for k, v in CHROM_TO_GENOME.items()}

# Chromosome sizes (sacCer3/R64)
CHROM_SIZES: Dict[str, int] = {
    'chrI': 230218,
    'chrII': 813184,
    'chrIII': 316620,
    'chrIV': 1531933,
    'chrV': 576874,
    'chrVI': 270161,
    'chrVII': 1090940,
    'chrVIII': 562643,
    'chrIX': 439888,
    'chrX': 745751,
    'chrXI': 666816,
    'chrXII': 1078177,
    'chrXIII': 924431,
    'chrXIV': 784333,
    'chrXV': 1091291,
    'chrXVI': 948066,
    'chrMito': 85779,
}

# =============================================================================
# Shift Correction Parameters (from NET-seq Analysis)
# =============================================================================

# These represent how much the apparent position is shifted RIGHTWARD (downstream)
# due to long poly(A) tails (>15 A's for nanopore) aligning to genomic A's/T's
# To correct: shift LEFTWARD (upstream) by this amount
SHIFT_CORRECTIONS_BY_ACOUNT: Dict[int, float] = {
    0: 0.0,   # No downstream A's - no shift
    1: 0.2,   # Minimal shift
    2: 0.3,
    3: 0.4,
    4: 1.0,   # Moderate shift
    5: 1.3,
    6: 1.7,
    7: 2.6,   # Strong shift
    8: 2.8,
    9: 2.9,
    10: 3.8,  # Maximum observed shift (saturated)
}

# For A-counts > 10, use the 10A value (saturated).
# Read from the table's largest A-count entry (currently 10 -> 3.8) instead of
# repeating the literal, so the two values cannot drift apart.
DEFAULT_MAX_SHIFT: float = SHIFT_CORRECTIONS_BY_ACOUNT[max(SHIFT_CORRECTIONS_BY_ACOUNT)]

# =============================================================================
# Poly(A) Tail Model Parameters
# =============================================================================

# Minimum tail length for nanopore oligo-dT priming
MIN_POLYA_LENGTH: int = 15

# A-richness threshold for identifying poly(A) tails
POLYA_RICHNESS_THRESHOLD: float = 0.8  # 80% A content

# Window size for A-richness calculation
POLYA_WINDOW_SIZE: int = 10

# RTA adapter patterns to detect
ADAPTER_POLY_T_MIN: int = 6  # Minimum poly(T) length
ADAPTER_TC_MOTIFS = ['TC', 'TCTC', 'TCT']

# Scoring thresholds for tail classification
TAIL_SCORE_HIGH: float = 0.7  # Definite poly(A) tail
TAIL_SCORE_LOW: float = 0.4   # Uncertain

# =============================================================================
# Indel Detection Parameters
# =============================================================================

# Maximum distance from 3' end to consider deletions as artifacts
INDEL_SEARCH_WINDOW: int = 20  # bp

# Maximum deletion size to consider as artifact
INDEL_MAX_SIZE: int = 3  # bp

# Minimum A-richness in flanking regions to classify as artifact
INDEL_FLANK_A_THRESHOLD: float = 0.7  # 70% A content

# Minimum flank length to check for A-richness
INDEL_MIN_FLANK_LENGTH: int = 5  # bp

# =============================================================================
# Ambiguity Range Parameters
# =============================================================================

# Maximum shift to consider when calculating ambiguity ranges
MAX_AMBIGUITY_SHIFT: int = 5  # bp

# Window size for downstream A-count calculation
DOWNSTREAM_WINDOW_SIZE: int = 10  # bp

# =============================================================================
# NET-seq Refinement Parameters
# =============================================================================

# Agreement threshold for combining WT and dst1 NET-seq data
NETSEQ_AGREEMENT_THRESHOLD: float = 0.90  # 90% agreement at ±1bp

# Window around CPA for peak detection
NETSEQ_PEAK_WINDOW: int = 5  # ±5bp

# Minimum signal threshold for peak calling (relative to max)
NETSEQ_PEAK_THRESHOLD: float = 0.5  # 50% of max signal

# Minimum peak signal for high confidence
NETSEQ_SIGNAL_HIGH: float = 1.0
NETSEQ_SIGNAL_MEDIUM: float = 0.5

# Maximum distance between peaks to consider them "close"
NETSEQ_PEAK_CLOSE_DISTANCE: int = 2  # bp

# =============================================================================
# AG Mispriming Parameters (from original RECTIFY)
# =============================================================================

# Window size for AG-richness calculation
AG_RICHNESS_WINDOW: int = 50  # bp downstream

# AG content threshold for flagging likely mispriming
AG_RICHNESS_THRESHOLD: float = 0.65  # 65% A+G content

# Minimum window size if near chromosome end
AG_RICHNESS_MIN_WINDOW: int = 20  # bp
|
|
176
|
+
|
|
177
|
+
# =============================================================================
|
|
178
|
+
# Helper Functions
|
|
179
|
+
# =============================================================================
|
|
180
|
+
|
|
181
|
+
def get_shift_from_acount(
    a_count: int,
    corrections: Optional[Dict[int, float]] = None,
    max_shift: Optional[float] = None,
) -> float:
    """
    Get shift correction for given A-count.

    Args:
        a_count: Number of A's in downstream window
        corrections: Optional A-count -> shift table. Defaults to
            SHIFT_CORRECTIONS_BY_ACOUNT.
        max_shift: Optional shift returned for A-counts above the largest
            A-count in the table. Defaults to DEFAULT_MAX_SHIFT for the
            built-in table, or to the entry at the largest A-count of a
            custom table.

    Returns:
        Shift correction in bp (how much to shift leftward/upstream).
        A-counts that are at or below the table's largest key but have no
        entry (for example negative values) yield 0.0.
    """
    if corrections is None:
        table = SHIFT_CORRECTIONS_BY_ACOUNT
        saturated = DEFAULT_MAX_SHIFT
    else:
        table = corrections
        # An empty custom table has no saturated entry to fall back on.
        saturated = corrections[max(corrections)] if corrections else 0.0
    if max_shift is not None:
        saturated = max_shift

    # The saturation point is the table's largest A-count rather than a
    # separate hard-coded 10, so extending the table needs no second edit.
    if table and a_count > max(table):
        return saturated
    return table.get(a_count, 0.0)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def validate_config():
    """Validate configuration parameters.

    Checks that the chromosome mappings agree with each other and that the
    threshold/length constants are in range. This module calls the function
    once at import time.

    Raises:
        AssertionError: If a check fails; the message names the failed check.
    """
    # Each entry pairs a check result with the message raised when it fails.
    checks = (
        # Equal lengths mean no two chromosomes share one genome identifier
        # (a duplicate would collapse in the reversed mapping).
        (len(CHROM_TO_GENOME) == len(GENOME_TO_CHROM),
         "Chromosome mapping mismatch"),
        # Compare the chromosome names themselves, not just the counts, so a
        # misspelled or swapped chromosome in either table is detected.
        (CHROM_TO_GENOME.keys() == CHROM_SIZES.keys(),
         "Chromosome size mapping incomplete"),
        (0.0 <= POLYA_RICHNESS_THRESHOLD <= 1.0,
         "Invalid poly(A) richness threshold"),
        (0.0 <= AG_RICHNESS_THRESHOLD <= 1.0,
         "Invalid AG richness threshold"),
        (MIN_POLYA_LENGTH > 0,
         "Invalid minimum poly(A) length"),
        (INDEL_SEARCH_WINDOW > 0,
         "Invalid indel search window"),
    )

    # Raise explicitly instead of using ``assert`` statements: those are
    # removed when Python runs with -O, which would silently disable this
    # validation. AssertionError is kept so the exception type is unchanged.
    for passed, message in checks:
        if not passed:
            raise AssertionError(message)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
# Run validation on import
|
|
208
|
+
validate_config()
|
rectify/core/__init__.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RECTIFY core modules.
|
|
3
|
+
|
|
4
|
+
This package contains the main correction algorithms:
|
|
5
|
+
- bam_processor: BAM reading and 3' end extraction
|
|
6
|
+
- atract_detector: A-tract ambiguity detection (universal)
|
|
7
|
+
- ag_mispriming: AG-richness screening (oligo-dT methods)
|
|
8
|
+
- polya_trimmer: Poly(A) tail modeling and trimming
|
|
9
|
+
- indel_corrector: Indel artifact removal
|
|
10
|
+
- netseq_refiner: NET-seq peak matching and refinement
|
|
11
|
+
- spikein_filter: Spike-in RNA detection and filtering
|
|
12
|
+
- output_writer: Unified output format (no rectify/core/output_writer.py appears in the 2.1.0 wheel file list)
|
|
13
|
+
|
|
14
|
+
And CLI command implementations:
|
|
15
|
+
- correct_command: Main correction workflow
|
|
16
|
+
- train_polya_command: Poly(A) model training
|
|
17
|
+
- validate_command: Validation against NET-seq
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
# Submodules imported by ``from rectify.core import *``.
# Every name listed here must be an importable submodule of this package.
# "output_writer" was removed because the 2.1.0 wheel ships no
# rectify/core/output_writer.py, which made the star-import fail.
__all__ = [
    # Correction algorithms
    "bam_processor",
    "atract_detector",
    "ag_mispriming",
    "polya_trimmer",
    "indel_corrector",
    "netseq_refiner",
    "spikein_filter",
    # CLI command implementations
    "correct_command",
    "train_polya_command",
    "validate_command",
]
|