exomeflow 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Robin Tomar, AIIMS New Delhi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,287 @@
1
+ Metadata-Version: 2.4
2
+ Name: exomeflow
3
+ Version: 1.0.0
4
+ Summary: Production-quality Whole Exome Sequencing analysis pipeline
5
+ Author-email: Robin Tomar <robin@aiims.ac.in>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/robintomar/exomeflow
8
+ Project-URL: Repository, https://github.com/robintomar/exomeflow
9
+ Project-URL: Bug Tracker, https://github.com/robintomar/exomeflow/issues
10
+ Keywords: bioinformatics,WES,NGS,genomics,exome,variant-calling
11
+ Classifier: Development Status :: 5 - Production/Stable
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Operating System :: POSIX :: Linux
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: typer>=0.12.0
24
+ Requires-Dist: rich>=13.0.0
25
+ Requires-Dist: pandas>=2.0.0
26
+ Requires-Dist: pysam>=0.22.0
27
+ Dynamic: license-file
28
+
29
+ # ExomeFlow
30
+
31
+ **Production-quality Whole Exome Sequencing (WES) analysis pipeline**
32
+
33
+ > Author: Robin Tomar, AIIMS New Delhi
34
+ > License: MIT
35
+
36
+ ---
37
+
38
+ ## Overview
39
+
40
+ ExomeFlow is a Python package that wraps a complete WES analysis workflow into a single, reproducible CLI command. It handles cohort-level processing (multiple samples), checkpointing for resumable runs, structured logging, and parallel execution.
41
+
42
+ ```
43
+ FASTQ
44
+ └─ fastp (QC + trimming)
45
+ └─ BWA MEM (alignment)
46
+ └─ GATK SortSam (coordinate sort)
47
+ └─ samtools flagstat (alignment QC)
48
+ └─ GATK MarkDuplicates
49
+ └─ GATK BuildBamIndex
50
+ └─ GATK BQSR (BaseRecalibrator + ApplyBQSR)
51
+ └─ GATK HaplotypeCaller (variant calling)
52
+ └─ GATK VariantFiltration (hard filters)
53
+ └─ ANNOVAR (functional annotation)
54
+ ```
55
+
56
+ ---
57
+
58
+ ## Requirements
59
+
60
+ ### System dependencies (must be on `PATH`)
61
+
62
+ | Tool | Version tested |
63
+ |------|---------------|
64
+ | `bwa` | ≥ 0.7.17 |
65
+ | `samtools` | ≥ 1.17 |
66
+ | `gatk` | 4.6.x |
67
+ | `fastp` | ≥ 0.23 |
68
+ | Perl + ANNOVAR | `table_annovar.pl` |
69
+
70
+ ### Python
71
+
72
+ - Python ≥ 3.9
73
+ - See `requirements.txt` for Python dependencies
74
+
75
+ ---
76
+
77
+ ## Installation
78
+
79
+ ### From PyPI
80
+
81
+ ```bash
82
+ pip install exomeflow
83
+ ```
84
+
85
+ ### From source
86
+
87
+ ```bash
88
+ git clone https://github.com/robintomar/exomeflow.git
89
+ cd exomeflow
90
+ pip install -e .
91
+ ```
92
+
93
+ ---
94
+
95
+ ## Reference files required
96
+
97
+ | File | Description |
98
+ |------|-------------|
99
+ | `hg38.fa` | BWA-indexed reference genome |
100
+ | `dbsnp.vcf.gz` | dbSNP (bgzipped + tabix-indexed) |
101
+ | `Mills_and_1000G_gold_standard.indels.hg38.vcf.gz` | Mills indels |
102
+ | `Homo_sapiens_assembly38.known_indels.vcf.gz` | Known indels |
103
+ | Exome capture BED | e.g. `Illumina_Exome_TargetedRegions_v1.2.hg38.bed` |
104
+ | ANNOVAR humandb | `hg38` annotation databases |
105
+
106
+ ---
107
+
108
+ ## Input FASTQ naming convention
109
+
110
+ ExomeFlow automatically detects samples from paired-end FASTQ files:
111
+
112
+ ```
113
+ fastq/
114
+ ├── sample1_1.fastq.gz
115
+ ├── sample1_2.fastq.gz
116
+ ├── sample2_1.fastq.gz
117
+ └── sample2_2.fastq.gz
118
+ ```
119
+
120
+ Pattern: `<sample_id>_1.fastq.gz` / `<sample_id>_2.fastq.gz`
121
+
122
+ ---
123
+
124
+ ## Usage
125
+
126
+ ### Minimal example
127
+
128
+ ```bash
129
+ exomeflow run \
130
+ --input-dir fastq/ \
131
+ --output results/ \
132
+ --reference /refs/hg38.fa \
133
+ --dbsnp /refs/dbsnp.vcf.gz \
134
+ --mills /refs/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz \
135
+ --known-indels /refs/Homo_sapiens_assembly38.known_indels.vcf.gz \
136
+ --annovar-bin /tools/annovar \
137
+ --annovar-db /tools/annovar/humandb
138
+ ```
139
+
140
+ ### Full example with all options
141
+
142
+ ```bash
143
+ exomeflow run \
144
+ --input-dir fastq/ \
145
+ --output results/ \
146
+ --reference /refs/hg38.fa \
147
+ --dbsnp /refs/dbsnp.vcf.gz \
148
+ --mills /refs/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz \
149
+ --known-indels /refs/Homo_sapiens_assembly38.known_indels.vcf.gz \
150
+ --intervals /refs/Illumina_Exome_TargetedRegions_v1.2.hg38.bed \
151
+ --interval-padding 100 \
152
+ --annovar-bin /tools/annovar \
153
+ --annovar-db /tools/annovar/humandb \
154
+ --threads 32 \
155
+ --fastp-threads 8 \
156
+ --annovar-threads 24 \
157
+ --max-workers 2 \
158
+ --java-opts "-Xmx80g"
159
+ ```
160
+
161
+ ### Check version
162
+
163
+ ```bash
164
+ exomeflow --version
165
+ ```
166
+
167
+ ### Help
168
+
169
+ ```bash
170
+ exomeflow run --help
171
+ ```
172
+
173
+ ---
174
+
175
+ ## Output files
176
+
177
+ After a successful run the `results/` directory contains:
178
+
179
+ ```
180
+ results/
181
+ ├── QC/ # reserved for QC reports (fastp HTML/JSON reports are written to filtered_fastp/)
182
+ ├── filtered_fastp/
183
+ │ ├── <sample>_1_filtered.fastq.gz
184
+ │ ├── <sample>_2_filtered.fastq.gz
185
+ │ ├── <sample>_fastp.html
186
+ │ └── <sample>_fastp.json
187
+ ├── Mapsam/
188
+ │ ├── <sample>_recalibrated.bam ← use in IGV for variant validation
189
+ │ └── <sample>_recalibrated.bam.bai
190
+ ├── VCF/
191
+ │ ├── <sample>.vcf ← raw HaplotypeCaller output
192
+ │ ├── <sample>_PASS.vcf ← PASS-only hard-filtered variants
193
+ │ ├── <sample>.annovar.hg38_multianno.vcf ← annotated VCF
194
+ │ └── <sample>.annovar.hg38_multianno.txt ← annotated tab-delimited table
195
+ ├── logs/
196
+ │ ├── analysis_<timestamp>.log ← full pipeline log
197
+ │ ├── errors_<timestamp>.log ← errors only
198
+ │ └── <sample>_<timestamp>.log ← per-sample log
199
+ └── .checkpoints/ ← resume state (do not delete during a run)
200
+ ```
201
+
202
+ ---
203
+
204
+ ## Checkpointing & resuming
205
+
206
+ ExomeFlow writes a checkpoint file for every completed step. If the pipeline
207
+ is interrupted (power failure, wall-time limit, etc.) simply re-run the
208
+ **exact same command** — completed steps are skipped automatically.
209
+
210
+ ---
211
+
212
+ ## GATK hard-filter thresholds
213
+
214
+ ### SNPs
215
+
216
+ | Filter name | Expression |
217
+ |-------------|-----------|
218
+ | `SNP_LowQD` | `QD < 2.0` |
219
+ | `SNP_StrandBias` | `FS > 60.0` |
220
+ | `SNP_StrandOddsRatio` | `SOR > 3.0` |
221
+ | `SNP_LowMQ` | `MQ < 40.0` |
222
+ | `SNP_MQRankSum` | `MQRankSum < -12.5` |
223
+ | `SNP_ReadPosRankSum` | `ReadPosRankSum < -8.0` |
224
+ | `LowDepth` | `DP < 10` |
225
+ | `LowGQ` *(genotype)* | `GQ < 20` |
226
+
227
+ ### INDELs
228
+
229
+ | Filter name | Expression |
230
+ |-------------|-----------|
231
+ | `INDEL_LowQD` | `QD < 2.0` |
232
+ | `INDEL_StrandBias` | `FS > 200.0` |
233
+ | `INDEL_StrandOddsRatio` | `SOR > 10.0` |
234
+ | `INDEL_ReadPosRankSum` | `ReadPosRankSum < -20.0` |
235
+ | `LowDepth` | `DP < 10` |
236
+ | `LowGQ` *(genotype)* | `GQ < 20` |
237
+
238
+ ---
239
+
240
+ ## ANNOVAR annotation databases (default)
241
+
242
+ ```
243
+ refGene, dbnsfp47a, clinvar_20240416, gnomad41_exome,
244
+ gnomad41_genome, avsnp150, cosmic84_coding, exac03
245
+ ```
246
+
247
+ ---
248
+
249
+ ## Publishing to PyPI
250
+
251
+ ```bash
252
+ pip install build twine
253
+
254
+ # Build source + wheel distributions
255
+ python -m build
256
+
257
+ # Upload to PyPI (requires ~/.pypirc or TWINE_USERNAME / TWINE_PASSWORD env vars)
258
+ twine upload dist/*
259
+ ```
260
+
261
+ To publish to TestPyPI first:
262
+
263
+ ```bash
264
+ twine upload --repository testpypi dist/*
265
+ pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ exomeflow
266
+ ```
267
+
268
+ ---
269
+
270
+ ## Development
271
+
272
+ ```bash
273
+ # Install in editable mode, plus the lint tools used below (the package declares no "dev" extra)
274
+ pip install -e . && pip install flake8 mypy
275
+
276
+ # Lint
277
+ flake8 exomeflow/
278
+ mypy exomeflow/
279
+ ```
280
+
281
+ ---
282
+
283
+ ## Citation
284
+
285
+ If you use ExomeFlow in your research, please cite:
286
+
287
+ > Robin Tomar. *ExomeFlow: a production-quality whole exome sequencing pipeline*. AIIMS New Delhi, 2025.
@@ -0,0 +1,259 @@
1
+ # ExomeFlow
2
+
3
+ **Production-quality Whole Exome Sequencing (WES) analysis pipeline**
4
+
5
+ > Author: Robin Tomar, AIIMS New Delhi
6
+ > License: MIT
7
+
8
+ ---
9
+
10
+ ## Overview
11
+
12
+ ExomeFlow is a Python package that wraps a complete WES analysis workflow into a single, reproducible CLI command. It handles cohort-level processing (multiple samples), checkpointing for resumable runs, structured logging, and parallel execution.
13
+
14
+ ```
15
+ FASTQ
16
+ └─ fastp (QC + trimming)
17
+ └─ BWA MEM (alignment)
18
+ └─ GATK SortSam (coordinate sort)
19
+ └─ samtools flagstat (alignment QC)
20
+ └─ GATK MarkDuplicates
21
+ └─ GATK BuildBamIndex
22
+ └─ GATK BQSR (BaseRecalibrator + ApplyBQSR)
23
+ └─ GATK HaplotypeCaller (variant calling)
24
+ └─ GATK VariantFiltration (hard filters)
25
+ └─ ANNOVAR (functional annotation)
26
+ ```
27
+
28
+ ---
29
+
30
+ ## Requirements
31
+
32
+ ### System dependencies (must be on `PATH`)
33
+
34
+ | Tool | Version tested |
35
+ |------|---------------|
36
+ | `bwa` | ≥ 0.7.17 |
37
+ | `samtools` | ≥ 1.17 |
38
+ | `gatk` | 4.6.x |
39
+ | `fastp` | ≥ 0.23 |
40
+ | Perl + ANNOVAR | `table_annovar.pl` |
41
+
42
+ ### Python
43
+
44
+ - Python ≥ 3.9
45
+ - See `requirements.txt` for Python dependencies
46
+
47
+ ---
48
+
49
+ ## Installation
50
+
51
+ ### From PyPI
52
+
53
+ ```bash
54
+ pip install exomeflow
55
+ ```
56
+
57
+ ### From source
58
+
59
+ ```bash
60
+ git clone https://github.com/robintomar/exomeflow.git
61
+ cd exomeflow
62
+ pip install -e .
63
+ ```
64
+
65
+ ---
66
+
67
+ ## Reference files required
68
+
69
+ | File | Description |
70
+ |------|-------------|
71
+ | `hg38.fa` | BWA-indexed reference genome |
72
+ | `dbsnp.vcf.gz` | dbSNP (bgzipped + tabix-indexed) |
73
+ | `Mills_and_1000G_gold_standard.indels.hg38.vcf.gz` | Mills indels |
74
+ | `Homo_sapiens_assembly38.known_indels.vcf.gz` | Known indels |
75
+ | Exome capture BED | e.g. `Illumina_Exome_TargetedRegions_v1.2.hg38.bed` |
76
+ | ANNOVAR humandb | `hg38` annotation databases |
77
+
78
+ ---
79
+
80
+ ## Input FASTQ naming convention
81
+
82
+ ExomeFlow automatically detects samples from paired-end FASTQ files:
83
+
84
+ ```
85
+ fastq/
86
+ ├── sample1_1.fastq.gz
87
+ ├── sample1_2.fastq.gz
88
+ ├── sample2_1.fastq.gz
89
+ └── sample2_2.fastq.gz
90
+ ```
91
+
92
+ Pattern: `<sample_id>_1.fastq.gz` / `<sample_id>_2.fastq.gz`
93
+
94
+ ---
95
+
96
+ ## Usage
97
+
98
+ ### Minimal example
99
+
100
+ ```bash
101
+ exomeflow run \
102
+ --input-dir fastq/ \
103
+ --output results/ \
104
+ --reference /refs/hg38.fa \
105
+ --dbsnp /refs/dbsnp.vcf.gz \
106
+ --mills /refs/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz \
107
+ --known-indels /refs/Homo_sapiens_assembly38.known_indels.vcf.gz \
108
+ --annovar-bin /tools/annovar \
109
+ --annovar-db /tools/annovar/humandb
110
+ ```
111
+
112
+ ### Full example with all options
113
+
114
+ ```bash
115
+ exomeflow run \
116
+ --input-dir fastq/ \
117
+ --output results/ \
118
+ --reference /refs/hg38.fa \
119
+ --dbsnp /refs/dbsnp.vcf.gz \
120
+ --mills /refs/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz \
121
+ --known-indels /refs/Homo_sapiens_assembly38.known_indels.vcf.gz \
122
+ --intervals /refs/Illumina_Exome_TargetedRegions_v1.2.hg38.bed \
123
+ --interval-padding 100 \
124
+ --annovar-bin /tools/annovar \
125
+ --annovar-db /tools/annovar/humandb \
126
+ --threads 32 \
127
+ --fastp-threads 8 \
128
+ --annovar-threads 24 \
129
+ --max-workers 2 \
130
+ --java-opts "-Xmx80g"
131
+ ```
132
+
133
+ ### Check version
134
+
135
+ ```bash
136
+ exomeflow --version
137
+ ```
138
+
139
+ ### Help
140
+
141
+ ```bash
142
+ exomeflow run --help
143
+ ```
144
+
145
+ ---
146
+
147
+ ## Output files
148
+
149
+ After a successful run the `results/` directory contains:
150
+
151
+ ```
152
+ results/
153
+ ├── QC/ # reserved for QC reports (fastp HTML/JSON reports are written to filtered_fastp/)
154
+ ├── filtered_fastp/
155
+ │ ├── <sample>_1_filtered.fastq.gz
156
+ │ ├── <sample>_2_filtered.fastq.gz
157
+ │ ├── <sample>_fastp.html
158
+ │ └── <sample>_fastp.json
159
+ ├── Mapsam/
160
+ │ ├── <sample>_recalibrated.bam ← use in IGV for variant validation
161
+ │ └── <sample>_recalibrated.bam.bai
162
+ ├── VCF/
163
+ │ ├── <sample>.vcf ← raw HaplotypeCaller output
164
+ │ ├── <sample>_PASS.vcf ← PASS-only hard-filtered variants
165
+ │ ├── <sample>.annovar.hg38_multianno.vcf ← annotated VCF
166
+ │ └── <sample>.annovar.hg38_multianno.txt ← annotated tab-delimited table
167
+ ├── logs/
168
+ │ ├── analysis_<timestamp>.log ← full pipeline log
169
+ │ ├── errors_<timestamp>.log ← errors only
170
+ │ └── <sample>_<timestamp>.log ← per-sample log
171
+ └── .checkpoints/ ← resume state (do not delete during a run)
172
+ ```
173
+
174
+ ---
175
+
176
+ ## Checkpointing & resuming
177
+
178
+ ExomeFlow writes a checkpoint file for every completed step. If the pipeline
179
+ is interrupted (power failure, wall-time limit, etc.) simply re-run the
180
+ **exact same command** — completed steps are skipped automatically.
181
+
182
+ ---
183
+
184
+ ## GATK hard-filter thresholds
185
+
186
+ ### SNPs
187
+
188
+ | Filter name | Expression |
189
+ |-------------|-----------|
190
+ | `SNP_LowQD` | `QD < 2.0` |
191
+ | `SNP_StrandBias` | `FS > 60.0` |
192
+ | `SNP_StrandOddsRatio` | `SOR > 3.0` |
193
+ | `SNP_LowMQ` | `MQ < 40.0` |
194
+ | `SNP_MQRankSum` | `MQRankSum < -12.5` |
195
+ | `SNP_ReadPosRankSum` | `ReadPosRankSum < -8.0` |
196
+ | `LowDepth` | `DP < 10` |
197
+ | `LowGQ` *(genotype)* | `GQ < 20` |
198
+
199
+ ### INDELs
200
+
201
+ | Filter name | Expression |
202
+ |-------------|-----------|
203
+ | `INDEL_LowQD` | `QD < 2.0` |
204
+ | `INDEL_StrandBias` | `FS > 200.0` |
205
+ | `INDEL_StrandOddsRatio` | `SOR > 10.0` |
206
+ | `INDEL_ReadPosRankSum` | `ReadPosRankSum < -20.0` |
207
+ | `LowDepth` | `DP < 10` |
208
+ | `LowGQ` *(genotype)* | `GQ < 20` |
209
+
210
+ ---
211
+
212
+ ## ANNOVAR annotation databases (default)
213
+
214
+ ```
215
+ refGene, dbnsfp47a, clinvar_20240416, gnomad41_exome,
216
+ gnomad41_genome, avsnp150, cosmic84_coding, exac03
217
+ ```
218
+
219
+ ---
220
+
221
+ ## Publishing to PyPI
222
+
223
+ ```bash
224
+ pip install build twine
225
+
226
+ # Build source + wheel distributions
227
+ python -m build
228
+
229
+ # Upload to PyPI (requires ~/.pypirc or TWINE_USERNAME / TWINE_PASSWORD env vars)
230
+ twine upload dist/*
231
+ ```
232
+
233
+ To publish to TestPyPI first:
234
+
235
+ ```bash
236
+ twine upload --repository testpypi dist/*
237
+ pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ exomeflow
238
+ ```
239
+
240
+ ---
241
+
242
+ ## Development
243
+
244
+ ```bash
245
+ # Install in editable mode, plus the lint tools used below (the package declares no "dev" extra)
246
+ pip install -e . && pip install flake8 mypy
247
+
248
+ # Lint
249
+ flake8 exomeflow/
250
+ mypy exomeflow/
251
+ ```
252
+
253
+ ---
254
+
255
+ ## Citation
256
+
257
+ If you use ExomeFlow in your research, please cite:
258
+
259
+ > Robin Tomar. *ExomeFlow: a production-quality whole exome sequencing pipeline*. AIIMS New Delhi, 2025.
@@ -0,0 +1,9 @@
1
+ """
2
+ ExomeFlow — Whole Exome Sequencing analysis pipeline.
3
+
4
+ Author: Robin Tomar, AIIMS New Delhi
5
+ """
6
+
7
+ __version__ = "1.0.0"
8
+ __author__ = "Robin Tomar"
9
+ __email__ = "robin@aiims.ac.in"
@@ -0,0 +1,103 @@
1
+ """
2
+ Step 2 — Read alignment with BWA MEM.
3
+
4
+ Mirrors the Bash ``run_bwa_mem`` function exactly:
5
+ - BWA MEM flags: -Y -K 100000000
6
+ - Read-group tag set to sample name (ID, PU, SM, LB = sample; PL = illumina)
7
+ - Output piped through samtools view -Shb to produce a raw BAM
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import logging
13
+ import subprocess
14
+ from pathlib import Path
15
+ from typing import TYPE_CHECKING
16
+
17
+ from exomeflow.utils import Checkpoint, PipelineStepError
18
+
19
+ if TYPE_CHECKING:
20
+ from exomeflow.config import Config
21
+
22
# Logger named "exomeflow"; this module only emits records through it.
logger = logging.getLogger("exomeflow")

# Step name passed to the checkpoint for this stage.
STEP = "bwa"


def _kill_and_reap(*procs: "subprocess.Popen | None") -> None:
    """Kill any still-running child in *procs* and wait for each to exit."""
    started = [proc for proc in procs if proc is not None]
    for proc in started:
        if proc.poll() is None:
            proc.kill()
    for proc in started:
        proc.wait()


def run_bwa_mem(sample: str, cfg: "Config", checkpoint: Checkpoint) -> None:
    """
    Align filtered reads for *sample* with BWA MEM and convert to BAM.

    Runs ``bwa mem ... | samtools view -Shb -o <output> -`` as two child
    processes connected by a pipe (no shell is involved).  When the
    checkpoint already records this step for *sample*, nothing is run.

    Input
    -----
    <fastp_dir>/<sample>_1_filtered.fastq.gz
    <fastp_dir>/<sample>_2_filtered.fastq.gz

    Output
    ------
    <map_dir>/<sample>.bam

    Raises
    ------
    PipelineStepError
        If either ``bwa`` or ``samtools`` exits with a non-zero status.
    OSError
        Propagated from ``subprocess.Popen`` when a tool cannot be launched
        (for example, the binary is not on ``PATH``).  Any child that was
        already started is killed and reaped before the error propagates.
    """
    if checkpoint.done(sample, STEP):
        logger.info("[%s] BWA MEM already completed, skipping.", sample)
        return

    r1 = cfg.fastp_dir / f"{sample}_1_filtered.fastq.gz"
    r2 = cfg.fastp_dir / f"{sample}_2_filtered.fastq.gz"
    output = cfg.map_dir / f"{sample}.bam"

    # The doubled backslashes are deliberate: bwa receives a literal "\t"
    # and expands it to a TAB itself when writing the @RG header line.
    read_group = (
        f"@RG\\tID:{sample}\\tPU:{sample}\\tSM:{sample}"
        f"\\tLB:{sample}\\tPL:illumina"
    )

    logger.info("[%s] Running BWA MEM ...", sample)

    # BWA MEM → samtools view pipe (mirrors the Bash pipe)
    bwa_cmd = [
        "bwa", "mem",
        "-Y",
        "-K", "100000000",
        "-t", str(cfg.threads),
        "-R", read_group,
        str(cfg.reference),
        str(r1),
        str(r2),
    ]

    samtools_cmd = [
        "samtools", "view",
        "-Shb",
        "-o", str(output),
        "-",
    ]

    env = cfg.env()

    bwa_proc = subprocess.Popen(
        bwa_cmd,
        stdout=subprocess.PIPE,
        stderr=None,  # inherit the parent's stderr
        env=env,
    )
    samtools_proc = None
    try:
        samtools_proc = subprocess.Popen(
            samtools_cmd,
            stdin=bwa_proc.stdout,
            env=env,
        )

        # Drop the parent's copy of the pipe so bwa receives SIGPIPE if
        # samtools exits early.
        if bwa_proc.stdout:
            bwa_proc.stdout.close()

        samtools_rc = samtools_proc.wait()
        bwa_rc = bwa_proc.wait()
    except BaseException:
        # samtools failed to launch, or the wait was interrupted: do not
        # leave an aligner running with nobody reading its output.
        if bwa_proc.stdout:
            bwa_proc.stdout.close()
        _kill_and_reap(samtools_proc, bwa_proc)
        raise

    if bwa_rc != 0 or samtools_rc != 0:
        raise PipelineStepError(
            f"[{sample}] BWA MEM / samtools pipe failed "
            f"(bwa={bwa_rc}, samtools={samtools_rc})"
        )

    checkpoint.mark(sample, STEP)
    # Numeric level 25 sits between INFO (20) and WARNING (30); presumably a
    # project-defined level — confirm against the logging setup.
    logger.log(25, "[%s] BWA MEM and conversion to BAM completed.", sample)