samsampleX 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,316 @@
1
+ Metadata-Version: 2.4
2
+ Name: samsampleX
3
+ Version: 0.1.0
4
+ Summary: Depth-aware dynamic BAM file downsampling
5
+ Author-email: Sedat Demiriz <sedat.demiriz@mail.mcgill.ca>, Daniel Taliun <daniel.taliun@mcgill.ca>
6
+ License-Expression: MIT
7
+ Project-URL: Repository, https://github.com/sdemiriz/samsampleX
8
+ Keywords: bam,alignment,downsampling,sampling,samtools,sambamba,gatk,genome,bioinformatics,genomics
9
+ Requires-Python: >=3.9
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: xxhash>=3.5.0
12
+ Requires-Dist: pysam>=0.23.3
13
+ Requires-Dist: numpy>=2.3.3
14
+ Requires-Dist: matplotlib>=3.10.8
15
+ Requires-Dist: scipy>=1.17.0
16
+ Provides-Extra: benchmark
17
+ Requires-Dist: snakemake>=9.16.3; extra == "benchmark"
18
+ Requires-Dist: snakemake-executor-plugin-slurm>=2.3.1; extra == "benchmark"
19
+ Requires-Dist: pandas>=2.3.3; extra == "benchmark"
20
+ Provides-Extra: test
21
+ Requires-Dist: pytest>=9.0.2; extra == "test"
22
+
23
+ # samsampleX
24
+ A Python-based tool for dynamic, depth-aware BAM file downsampling. Unlike existing tools, which only downsample uniformly based on a single global fraction value, samsampleX samples reads from a source BAM file to match the depth-of-coverage distribution of one or more template BAM file(s), using a BED template created from them.
25
+
26
+ ## Features:
27
+ - Reproducible, integer seed-based deterministic downsampling
28
+ - Uniform sampling mode: retain a fixed fraction of reads, feature parity with existing tools.
29
+ - Map depth from multiple BAM files to a single BED template via common aggregation statistics (`min`, `mean`, `median`, `max`, `random`).
30
+ - Calculation of quality metrics:
31
+ - Mean absolute error (MAE): mean per-base absolute difference in depth between two BAMs over the region.
32
+ - First-order Wasserstein distance (W1): L1 distance between empirical CDFs of per-base depths.
33
+ - Plotting for visual sampling comparisons, with an option to emit a TSV file of the same data instead.
34
+
35
+ ## Installation
36
+ ### Requirements
37
+
38
+ - pysam
39
+ - xxHash
40
+ - numpy
41
+ - matplotlib
42
+ - scipy
43
+ - Snakemake (benchmarking only)
44
+ - pytest (testing only)
45
+
46
+ ### Build samsampleX
47
+ ```bash
48
+ git clone https://github.com/sdemiriz/samsampleX.git
49
+ cd samsampleX
50
+ pip install .
51
+ ```
52
+
53
+ ## Usage
54
+ ### Mapping
55
+ Extract depth of coverage from one or more template BAM file(s) to a single BED template. When multiple BAMs are provided, per-position depths are combined using the selected `--mode`.
56
+ ```bash
57
+ # Single BAM
58
+ samsampleX map \
59
+ --template-bam template.bam \
60
+ --region chr1:1000-2000 \
61
+ --out-bed template.bed
62
+
63
+ # Multiple BAMs (combined per-position using mean)
64
+ samsampleX map \
65
+ --template-bam a.bam b.bam c.bam \
66
+ --region chr1:1000-2000 \
67
+ --mode mean \
68
+ --out-bed template.bed
69
+ ```
70
+ | Option | Description | Default |
71
+ |--------|-------------|---------|
72
+ | `--template-bam FILE [FILE ...]` | Input BAM file(s) (required) | - |
73
+ | `--region REGION` | Target region, samtools-style (required) | - |
74
+ | `--out-bed FILE` | Output BED file (required) | - |
75
+ | `--collapse INT` | Merge consecutive positions with depth diff <= INT | `0` |
76
+ | `--mode MODE` | Combine mode when multiple BAMs: `min`, `mean`, `median`, `max`, `random` | `mean` |
77
+ | `--seed INT` | Random seed for `--mode random` | `42` |
78
+
79
+ ### Sampling
80
+ Downsample a BAM file based on the provided BED template; if multiple BED templates are provided, they are combined using the selected `--mode`. Alternatively, use `--uniform` for position-independent uniform sampling similar to existing tools.
81
+
82
+ **Depth-based sampling (template required):**
83
+ ```bash
84
+ samsampleX sample \
85
+ --source-bam high_depth.bam \
86
+ --template-bed template.bed \
87
+ --region chr1:1000-2000 \
88
+ --out-bam sampled.bam
89
+ ```
90
+
91
+ **Uniform sampling (no template):**
92
+ ```bash
93
+ samsampleX sample \
94
+ --source-bam high_depth.bam \
95
+ --uniform 0.5 \
96
+ --region chr1:1000-2000 \
97
+ --out-bam sampled.bam
98
+ ```
99
+ Retains approximately 50% of reads uniformly across the region.
100
+
101
+ | Option | Description | Default |
102
+ |--------|-------------|---------|
103
+ | `--source-bam FILE` | Input BAM to sample reads from (required) | - |
104
+ | `--template-bed FILE` | Template BED file; required unless `--uniform` is used | - |
105
+ | `--uniform FRACTION` | Uniform sampling: retain fraction of reads. Bypasses template-based downsampling. | - |
106
+ | `--region REGION` | Target region, samtools-style (required) | - |
107
+ | `--out-bam FILE` | Output BAM file to write reads to (required) | - |
108
+ | `--mode MODE` | Combine mode for multiple templates: `min`, `mean`, `median`, `max`, `random` | `random` |
109
+ | `--stat STAT` | Statistic for summarising ratio over read span: `min`, `mean`, `median`, `max`, `random` | `mean` |
110
+ | `--seed INT` | Random seed for reproducibility | `42` |
111
+
112
+ ### Plotting
113
+ Compare depth of coverage between source, template, and output BAM files. Output either as PNG plot or TSV data.
114
+
115
+ Green is source, orange is template and blue is output depth.
116
+
117
+ TSV contains one column for `position`, and three for respective depths of source, template and output.
118
+ ```bash
119
+ # Generate PNG plot
120
+ samsampleX plot \
121
+ --source-bam high_depth.bam \
122
+ --template-bam template.bam \
123
+ --out-bam sampled.bam \
124
+ --region chr1:1000-2000 \
125
+ --out-png coverage_plot.png
126
+ ```
127
+
128
+ | Option | Description | Default |
129
+ |--------|-------------|---------|
130
+ | `--source-bam FILE` | Source BAM file (required) | - |
131
+ | `--template-bam FILE` | Template BAM file (mutually exclusive with --template-bed) | - |
132
+ | `--template-bed FILE` | Template BED file (mutually exclusive with --template-bam) | - |
133
+ | `--out-bam FILE` | Output BAM file from sampling (required) | - |
134
+ | `--region REGION` | Target region, samtools-style (required) | - |
135
+ | `--out-png FILE` | Output PNG plot (mutually exclusive with --out-tsv) | - |
136
+ | `--out-tsv FILE` | Output TSV data (mutually exclusive with --out-png) | - |
137
+
138
+ ### Mapback
139
+ **If you do not use HLA\*LA and its specific read processing method, feel free to ignore this section.**
140
+
141
+ Remap HLA\*LA PRG-mapped reads back to canonical chr6 coordinates. This is a preprocessing step for BAM files produced by HLA\*LA, which maps reads to a pangenome reference graph (PRG) with synthetic contig names (`PRG_1`, `PRG_2`, ...). The mapback subcommand translates these back to chr6 positions using the HLA\*LA `sequences.txt` file and known HLA gene / alt contig boundaries.
142
+
143
+ The output BAM can then be used as input to `sample` for depth-aware downsampling on chr6.
144
+
145
+ ```bash
146
+ # Step 1: remap PRG reads to chr6
147
+ samsampleX mapback \
148
+ --source-bam hlala_output.bam \
149
+ --region chr6:28000000-34000000 \
150
+ --genome-build GRCh38 \
151
+ --out-bam remapped.bam
152
+
153
+ # Step 2: sample from the remapped BAM
154
+ samsampleX sample \
155
+ --source-bam remapped.bam \
156
+ --template-bed template.bed \
157
+ --region chr6:28000000-34000000 \
158
+ --out-bam sampled.bam
159
+ ```
160
+
161
+ | Option | Description | Default |
162
+ |--------|-------------|---------|
163
+ | `--source-bam FILE` | HLA\*LA-remapped BAM file (required) | - |
164
+ | `--region REGION` | Target region on chr6, samtools-style (required) | - |
165
+ | `--out-bam FILE` | Output BAM file (required) | - |
166
+ | `--genome-build BUILD` | Reference genome build: `GRCh38` or `GRCh37` (required) | - |
167
+ | `--prg-seq FILE` | Path to HLA\*LA `sequences.txt` | `HLA-LA/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt` |
168
+
169
+ ### Stats
170
+ Compare depth distributions between two inputs over a given region. Each input can be a BAM file or a BED file (auto-detected by extension). Reports mean absolute error (MAE) and Wasserstein-1 distance.
171
+ ```bash
172
+ # BAM vs BAM
173
+ samsampleX stats \
174
+ --a template.bam \
175
+ --b sampled.bam \
176
+ --region chr1:1000-2000
177
+
178
+ # BED vs BAM (e.g. combined cohort template against sampled output)
179
+ samsampleX stats \
180
+ --a template.bed \
181
+ --b sampled.bam \
182
+ --region chr1:1000-2000
183
+ ```
184
+
185
+ | Option | Description | Default |
186
+ |--------|-------------|---------|
187
+ | `--a FILE` | First input — BAM or BED file (reference) (required) | - |
188
+ | `--b FILE` | Second input — BAM or BED file (comparison) (required) | - |
189
+ | `--region REGION` | Target region, samtools-style (required) | - |
190
+
191
+ ## Example
192
+
193
+ ![Example plot results](examples/example-plot.png)
194
+
195
+ The following commands showcase an example workflow of a short, arbitrary region on chromosome 21. Three 1000 Genomes Project 30X WGS samples are downloaded and mapped to a template, then used to downsample a GIAB 300X WGS sample in the same region. The results are finally displayed on a plot.
196
+
197
+
198
+ ```bash
199
+ cd examples/
200
+
201
+ # Download reference genome (GRCh38)
202
+ wget https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa
203
+
204
+ # Download the first three 1K Genomes 30X WGS samples from
205
+ # https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/1000G_2504_high_coverage.sequence.index
206
+ wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR3239480/NA12718.final.cram -O NA12718.cram && samtools index NA12718.cram
207
+ wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR3239481/NA12748.final.cram -O NA12748.cram && samtools index NA12748.cram
208
+ wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR3239482/NA12775.final.cram -O NA12775.cram && samtools index NA12775.cram
209
+
210
+ # Convert to BAM, restrict to target region and index
211
+ samtools view NA12718.cram chr21:10000000-10010000 -b -o NA12718.bam -T GRCh38_full_analysis_set_plus_decoy_hla.fa && samtools index NA12718.bam
212
+ samtools view NA12748.cram chr21:10000000-10010000 -b -o NA12748.bam -T GRCh38_full_analysis_set_plus_decoy_hla.fa && samtools index NA12748.bam
213
+ samtools view NA12775.cram chr21:10000000-10010000 -b -o NA12775.bam -T GRCh38_full_analysis_set_plus_decoy_hla.fa && samtools index NA12775.bam
214
+
215
+ # Run samsampleX workflow
216
+ samsampleX map \
217
+ --template-bam NA12718.bam NA12748.bam NA12775.bam \
218
+ --region chr21:10000000-10010000 \
219
+ --mode mean \
220
+ --collapse 0 \
221
+ --out-bed template.bed
222
+ # template.bed should match example-template.bed
223
+
224
+ # Source BAM+index is provided in the examples directory, created by subsetting to target region from
225
+ # https://ftp.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/AshkenazimTrio/HG002_NA24385_son/NIST_HiSeq_HG002_Homogeneity-10953946/NHGRI_Illumina300X_AJtrio_novoalign_bams/HG002.GRCh38.300x.bam
226
+ samsampleX sample \
227
+ --source-bam HG002.GRCh38.300x.chr21:10000000-10010000.bam \
228
+ --template-bed template.bed \
229
+ --region chr21:10000000-10010000 \
230
+ --seed 42 \
231
+ --out-bam sampled.bam
232
+
233
+ samtools index sampled.bam
234
+
235
+ samsampleX plot \
236
+ --source-bam HG002.GRCh38.300x.chr21:10000000-10010000.bam \
237
+ --template-bed template.bed \
238
+ --out-bam sampled.bam \
239
+ --region chr21:10000000-10010000 \
240
+ --out-png plot.png
241
+ # plot.png should match example-plot.png
242
+ ```
243
+
244
+ ## Testing
245
+
246
+ A `pytest` test suite is available. Run with the `-v` flag for a detailed report.
247
+ ```bash
248
+ pytest -v
249
+ ```
250
+
251
+ ## Algorithm rundown
252
+
253
+ ### Mapping
254
+ 1. Parse target region from first BAM header
255
+ 2. Compute per-position depth of coverage for each BAM over the region
256
+ 3. If multiple BAMs: combine depths per-position using `--mode` (min, mean, median, max, random)
257
+ 4. Optionally collapse consecutive similar depths (`--collapse`)
258
+ 5. Write to BED4 format (`chrom`, `start`, `end`, `depth` columns)
259
+
260
+ ### Sampling
261
+ 1. **Uniform mode** (`--uniform FRACTION`): Skip template downsampling. For each read, hash the read name with xxHash32 to get $f_{read} \in [0, 1)$; keep if $f_{read} < FRACTION$. Deterministic and position-independent.
262
+ 2. **Depth-based mode**: Load template depths from BED file(s); if multiple templates are provided, combine them per-position using the selected `--mode`
263
+ 3. Compute source depths from BAM
264
+ 4. Calculate per-position sampling coefficient: $ratio(i) = \min(1,\; depth_{template}(i) \;/\; depth_{source}(i))$
265
+ - Positions where the template depth meets or exceeds the source depth get coefficient 1.0 (keep all reads)
266
+ - Positions with zero source depth get coefficient 0.0
267
+ 5. Build a cumulative sum of the coefficient array for O(1) range queries
268
+ 6. For each read in the source BAM:
269
+ - Hash read name with xxHash32 to produce a deterministic fraction $f_{read} \in [0, 1)$
270
+ - Summarise the coefficient over the read's covered positions using `--stat` (min, mean, median, max, random; the default is mean, computed via the cumulative sum). `random` picks one overlap ratio from a deterministic index (read span + seed).
271
+ - Keep the read if $f_{read} < ratio_{read}$
272
+
273
+ ## Metrics
274
+ | Metric | Significance |
275
+ | ------ | ------------ |
276
+ | Mean Absolute Error | Average absolute per-base depth difference between the two BAMs |
277
+ | Wasserstein-1 Distance | L1 distance between empirical CDFs of depth (scales with region length) |
278
+
279
+
280
+ ## Benchmarking
281
+ Benchmarking is done by a `snakemake` workflow in the `benchmarks` directory, and thus `snakemake` should be installed beforehand (for HPC systems, also install `snakemake-executor-plugin-slurm` or other plugin compatible with your system type).
282
+
283
+ An `Apptainer` container definition `bench.def` that contains installs for `GATK`, `samtools`, `sambamba` and `samsampleX` is included. Build this container using `apptainer build bench.sif bench.def` before running the workflow.
284
+
285
+ Configure the benchmarking parameters in `config.yaml` in the same directory: copy and rename an existing chunk with all parameters and populate the values. All input files are expected to be found in the same directory as `config.yaml`, BAM files should be indexed using `samtools index`.
286
+
287
+ ```yaml
288
+ # config.yaml
289
+ benchmarks: # all chunks should be children of this header
290
+ wgs-chr21: # arbitrary name for benchmarking instance, parameters will be children
291
+ chr: "chr21" # specify contig
292
+ start: 1 # region start coordinate
293
+ end: 46709982 # region end coordinate
294
+ seed: 42 # random seed (base)
295
+ n_replicates: 1 # replicate count, will affect seed
296
+ # (e.g. seed=42, n=3 will use seeds 43, 44, 45)
297
+ collapse: 0 # define smoothing during mapping step
298
+ templates: # specify files to use as templates in sampling
299
+ - "template.bam" # all files must be in the benchmarks directory
300
+
301
+ mode: "mean" # how to determine per-position template depths from multiple template files
302
+ source: "source.bam" # specify file to downsample
303
+
304
+ coefficient: 0.1 # coefficient provided to GATK, samtools, sambamba
305
+
306
+ cpu: 2 # specify hardware resource (used by all steps)
307
+ mem_mb: 16384
308
+ time: "10:00"
309
+ ```
310
+
311
+ When executing the workflow, navigate to the `benchmarks` directory and make sure to use the following arguments:
312
+ ```
313
+ snakemake -p --use-apptainer --apptainer-args '--bind $(pwd)'
314
+ ```
315
+
316
+ A directory for all intermediate files will be created for each chunk defined in `config.yaml` and the final benchmark results will be made available in the `benchmarks` directory as `benchmark-{chunk_name}.tsv`.
@@ -0,0 +1,294 @@
1
+ # samsampleX
2
+ A Python-based tool for dynamic, depth-aware BAM file downsampling. Unlike existing tools, which only downsample uniformly based on a single global fraction value, samsampleX samples reads from a source BAM file to match the depth-of-coverage distribution of one or more template BAM file(s), using a BED template created from them.
3
+
4
+ ## Features:
5
+ - Reproducible, integer seed-based deterministic downsampling
6
+ - Uniform sampling mode: retain a fixed fraction of reads, feature parity with existing tools.
7
+ - Map depth from multiple BAM files to a single BED template via common aggregation statistics (`min`, `mean`, `median`, `max`, `random`).
8
+ - Calculation of quality metrics:
9
+ - Mean absolute error (MAE): mean per-base absolute difference in depth between two BAMs over the region.
10
+ - First-order Wasserstein distance (W1): L1 distance between empirical CDFs of per-base depths.
11
+ - Plotting for visual sampling comparisons, with an option to emit a TSV file of the same data instead.
12
+
13
+ ## Installation
14
+ ### Requirements
15
+
16
+ - pysam
17
+ - xxHash
18
+ - numpy
19
+ - matplotlib
20
+ - scipy
21
+ - Snakemake (benchmarking only)
22
+ - pytest (testing only)
23
+
24
+ ### Build samsampleX
25
+ ```bash
26
+ git clone https://github.com/sdemiriz/samsampleX.git
27
+ cd samsampleX
28
+ pip install .
29
+ ```
30
+
31
+ ## Usage
32
+ ### Mapping
33
+ Extract depth of coverage from one or more template BAM file(s) to a single BED template. When multiple BAMs are provided, per-position depths are combined using the selected `--mode`.
34
+ ```bash
35
+ # Single BAM
36
+ samsampleX map \
37
+ --template-bam template.bam \
38
+ --region chr1:1000-2000 \
39
+ --out-bed template.bed
40
+
41
+ # Multiple BAMs (combined per-position using mean)
42
+ samsampleX map \
43
+ --template-bam a.bam b.bam c.bam \
44
+ --region chr1:1000-2000 \
45
+ --mode mean \
46
+ --out-bed template.bed
47
+ ```
48
+ | Option | Description | Default |
49
+ |--------|-------------|---------|
50
+ | `--template-bam FILE [FILE ...]` | Input BAM file(s) (required) | - |
51
+ | `--region REGION` | Target region, samtools-style (required) | - |
52
+ | `--out-bed FILE` | Output BED file (required) | - |
53
+ | `--collapse INT` | Merge consecutive positions with depth diff <= INT | `0` |
54
+ | `--mode MODE` | Combine mode when multiple BAMs: `min`, `mean`, `median`, `max`, `random` | `mean` |
55
+ | `--seed INT` | Random seed for `--mode random` | `42` |
56
+
57
+ ### Sampling
58
+ Downsample a BAM file based on the provided BED template; if multiple BED templates are provided, they are combined using the selected `--mode`. Alternatively, use `--uniform` for position-independent uniform sampling similar to existing tools.
59
+
60
+ **Depth-based sampling (template required):**
61
+ ```bash
62
+ samsampleX sample \
63
+ --source-bam high_depth.bam \
64
+ --template-bed template.bed \
65
+ --region chr1:1000-2000 \
66
+ --out-bam sampled.bam
67
+ ```
68
+
69
+ **Uniform sampling (no template):**
70
+ ```bash
71
+ samsampleX sample \
72
+ --source-bam high_depth.bam \
73
+ --uniform 0.5 \
74
+ --region chr1:1000-2000 \
75
+ --out-bam sampled.bam
76
+ ```
77
+ Retains approximately 50% of reads uniformly across the region.
78
+
79
+ | Option | Description | Default |
80
+ |--------|-------------|---------|
81
+ | `--source-bam FILE` | Input BAM to sample reads from (required) | - |
82
+ | `--template-bed FILE` | Template BED file; required unless `--uniform` is used | - |
83
+ | `--uniform FRACTION` | Uniform sampling: retain fraction of reads. Bypasses template-based downsampling. | - |
84
+ | `--region REGION` | Target region, samtools-style (required) | - |
85
+ | `--out-bam FILE` | Output BAM file to write reads to (required) | - |
86
+ | `--mode MODE` | Combine mode for multiple templates: `min`, `mean`, `median`, `max`, `random` | `random` |
87
+ | `--stat STAT` | Statistic for summarising ratio over read span: `min`, `mean`, `median`, `max`, `random` | `mean` |
88
+ | `--seed INT` | Random seed for reproducibility | `42` |
89
+
90
+ ### Plotting
91
+ Compare depth of coverage between source, template, and output BAM files. Output either as PNG plot or TSV data.
92
+
93
+ Green is source, orange is template and blue is output depth.
94
+
95
+ TSV contains one column for `position`, and three for respective depths of source, template and output.
96
+ ```bash
97
+ # Generate PNG plot
98
+ samsampleX plot \
99
+ --source-bam high_depth.bam \
100
+ --template-bam template.bam \
101
+ --out-bam sampled.bam \
102
+ --region chr1:1000-2000 \
103
+ --out-png coverage_plot.png
104
+ ```
105
+
106
+ | Option | Description | Default |
107
+ |--------|-------------|---------|
108
+ | `--source-bam FILE` | Source BAM file (required) | - |
109
+ | `--template-bam FILE` | Template BAM file (mutually exclusive with --template-bed) | - |
110
+ | `--template-bed FILE` | Template BED file (mutually exclusive with --template-bam) | - |
111
+ | `--out-bam FILE` | Output BAM file from sampling (required) | - |
112
+ | `--region REGION` | Target region, samtools-style (required) | - |
113
+ | `--out-png FILE` | Output PNG plot (mutually exclusive with --out-tsv) | - |
114
+ | `--out-tsv FILE` | Output TSV data (mutually exclusive with --out-png) | - |
115
+
116
+ ### Mapback
117
+ **If you do not use HLA\*LA and its specific read processing method, feel free to ignore this section.**
118
+
119
+ Remap HLA\*LA PRG-mapped reads back to canonical chr6 coordinates. This is a preprocessing step for BAM files produced by HLA\*LA, which maps reads to a pangenome reference graph (PRG) with synthetic contig names (`PRG_1`, `PRG_2`, ...). The mapback subcommand translates these back to chr6 positions using the HLA\*LA `sequences.txt` file and known HLA gene / alt contig boundaries.
120
+
121
+ The output BAM can then be used as input to `sample` for depth-aware downsampling on chr6.
122
+
123
+ ```bash
124
+ # Step 1: remap PRG reads to chr6
125
+ samsampleX mapback \
126
+ --source-bam hlala_output.bam \
127
+ --region chr6:28000000-34000000 \
128
+ --genome-build GRCh38 \
129
+ --out-bam remapped.bam
130
+
131
+ # Step 2: sample from the remapped BAM
132
+ samsampleX sample \
133
+ --source-bam remapped.bam \
134
+ --template-bed template.bed \
135
+ --region chr6:28000000-34000000 \
136
+ --out-bam sampled.bam
137
+ ```
138
+
139
+ | Option | Description | Default |
140
+ |--------|-------------|---------|
141
+ | `--source-bam FILE` | HLA\*LA-remapped BAM file (required) | - |
142
+ | `--region REGION` | Target region on chr6, samtools-style (required) | - |
143
+ | `--out-bam FILE` | Output BAM file (required) | - |
144
+ | `--genome-build BUILD` | Reference genome build: `GRCh38` or `GRCh37` (required) | - |
145
+ | `--prg-seq FILE` | Path to HLA\*LA `sequences.txt` | `HLA-LA/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt` |
146
+
147
+ ### Stats
148
+ Compare depth distributions between two inputs over a given region. Each input can be a BAM file or a BED file (auto-detected by extension). Reports mean absolute error (MAE) and Wasserstein-1 distance.
149
+ ```bash
150
+ # BAM vs BAM
151
+ samsampleX stats \
152
+ --a template.bam \
153
+ --b sampled.bam \
154
+ --region chr1:1000-2000
155
+
156
+ # BED vs BAM (e.g. combined cohort template against sampled output)
157
+ samsampleX stats \
158
+ --a template.bed \
159
+ --b sampled.bam \
160
+ --region chr1:1000-2000
161
+ ```
162
+
163
+ | Option | Description | Default |
164
+ |--------|-------------|---------|
165
+ | `--a FILE` | First input — BAM or BED file (reference) (required) | - |
166
+ | `--b FILE` | Second input — BAM or BED file (comparison) (required) | - |
167
+ | `--region REGION` | Target region, samtools-style (required) | - |
168
+
169
+ ## Example
170
+
171
+ ![Example plot results](examples/example-plot.png)
172
+
173
+ The following commands showcase an example workflow of a short, arbitrary region on chromosome 21. Three 1000 Genomes Project 30X WGS samples are downloaded and mapped to a template, then used to downsample a GIAB 300X WGS sample in the same region. The results are finally displayed on a plot.
174
+
175
+
176
+ ```bash
177
+ cd examples/
178
+
179
+ # Download reference genome (GRCh38)
180
+ wget https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa
181
+
182
+ # Download the first three 1K Genomes 30X WGS samples from
183
+ # https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/1000G_2504_high_coverage.sequence.index
184
+ wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR3239480/NA12718.final.cram -O NA12718.cram && samtools index NA12718.cram
185
+ wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR3239481/NA12748.final.cram -O NA12748.cram && samtools index NA12748.cram
186
+ wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR3239482/NA12775.final.cram -O NA12775.cram && samtools index NA12775.cram
187
+
188
+ # Convert to BAM, restrict to target region and index
189
+ samtools view NA12718.cram chr21:10000000-10010000 -b -o NA12718.bam -T GRCh38_full_analysis_set_plus_decoy_hla.fa && samtools index NA12718.bam
190
+ samtools view NA12748.cram chr21:10000000-10010000 -b -o NA12748.bam -T GRCh38_full_analysis_set_plus_decoy_hla.fa && samtools index NA12748.bam
191
+ samtools view NA12775.cram chr21:10000000-10010000 -b -o NA12775.bam -T GRCh38_full_analysis_set_plus_decoy_hla.fa && samtools index NA12775.bam
192
+
193
+ # Run samsampleX workflow
194
+ samsampleX map \
195
+ --template-bam NA12718.bam NA12748.bam NA12775.bam \
196
+ --region chr21:10000000-10010000 \
197
+ --mode mean \
198
+ --collapse 0 \
199
+ --out-bed template.bed
200
+ # template.bed should match example-template.bed
201
+
202
+ # Source BAM+index is provided in the examples directory, created by subsetting to target region from
203
+ # https://ftp.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/AshkenazimTrio/HG002_NA24385_son/NIST_HiSeq_HG002_Homogeneity-10953946/NHGRI_Illumina300X_AJtrio_novoalign_bams/HG002.GRCh38.300x.bam
204
+ samsampleX sample \
205
+ --source-bam HG002.GRCh38.300x.chr21:10000000-10010000.bam \
206
+ --template-bed template.bed \
207
+ --region chr21:10000000-10010000 \
208
+ --seed 42 \
209
+ --out-bam sampled.bam
210
+
211
+ samtools index sampled.bam
212
+
213
+ samsampleX plot \
214
+ --source-bam HG002.GRCh38.300x.chr21:10000000-10010000.bam \
215
+ --template-bed template.bed \
216
+ --out-bam sampled.bam \
217
+ --region chr21:10000000-10010000 \
218
+ --out-png plot.png
219
+ # plot.png should match example-plot.png
220
+ ```
221
+
222
+ ## Testing
223
+
224
+ A `pytest` test suite is available. Run with the `-v` flag for a detailed report.
225
+ ```bash
226
+ pytest -v
227
+ ```
228
+
229
+ ## Algorithm rundown
230
+
231
+ ### Mapping
232
+ 1. Parse target region from first BAM header
233
+ 2. Compute per-position depth of coverage for each BAM over the region
234
+ 3. If multiple BAMs: combine depths per-position using `--mode` (min, mean, median, max, random)
235
+ 4. Optionally collapse consecutive similar depths (`--collapse`)
236
+ 5. Write to BED4 format (`chrom`, `start`, `end`, `depth` columns)
237
+
238
+ ### Sampling
239
+ 1. **Uniform mode** (`--uniform FRACTION`): Skip template downsampling. For each read, hash the read name with xxHash32 to get $f_{read} \in [0, 1)$; keep if $f_{read} < FRACTION$. Deterministic and position-independent.
240
+ 2. **Depth-based mode**: Load template depths from BED file(s); if multiple templates are provided, combine them per-position using the selected `--mode`
241
+ 3. Compute source depths from BAM
242
+ 4. Calculate per-position sampling coefficient: $ratio(i) = \min(1,\; depth_{template}(i) \;/\; depth_{source}(i))$
243
+ - Positions where the template depth meets or exceeds the source depth get coefficient 1.0 (keep all reads)
244
+ - Positions with zero source depth get coefficient 0.0
245
+ 5. Build a cumulative sum of the coefficient array for O(1) range queries
246
+ 6. For each read in the source BAM:
247
+ - Hash read name with xxHash32 to produce a deterministic fraction $f_{read} \in [0, 1)$
248
+ - Summarise the coefficient over the read's covered positions using `--stat` (min, mean, median, max, random; the default is mean, computed via the cumulative sum). `random` picks one overlap ratio from a deterministic index (read span + seed).
249
+ - Keep the read if $f_{read} < ratio_{read}$
250
+
251
+ ## Metrics
252
+ | Metric | Significance |
253
+ | ------ | ------------ |
254
+ | Mean Absolute Error | Average absolute per-base depth difference between the two BAMs |
255
+ | Wasserstein-1 Distance | L1 distance between empirical CDFs of depth (scales with region length) |
256
+
257
+
258
+ ## Benchmarking
259
+ Benchmarking is done by a `snakemake` workflow in the `benchmarks` directory, and thus `snakemake` should be installed beforehand (for HPC systems, also install `snakemake-executor-plugin-slurm` or other plugin compatible with your system type).
260
+
261
+ An `Apptainer` container definition `bench.def` that contains installs for `GATK`, `samtools`, `sambamba` and `samsampleX` is included. Build this container using `apptainer build bench.sif bench.def` before running the workflow.
262
+
263
+ Configure the benchmarking parameters in `config.yaml` in the same directory: copy and rename an existing chunk with all parameters and populate the values. All input files are expected to be found in the same directory as `config.yaml`, BAM files should be indexed using `samtools index`.
264
+
265
+ ```yaml
266
+ # config.yaml
267
+ benchmarks: # all chunks should be children of this header
268
+ wgs-chr21: # arbitrary name for benchmarking instance, parameters will be children
269
+ chr: "chr21" # specify contig
270
+ start: 1 # region start coordinate
271
+ end: 46709982 # region end coordinate
272
+ seed: 42 # random seed (base)
273
+ n_replicates: 1 # replicate count, will affect seed
274
+ # (e.g. seed=42, n=3 will use seeds 43, 44, 45)
275
+ collapse: 0 # define smoothing during mapping step
276
+ templates: # specify files to use as templates in sampling
277
+ - "template.bam" # all files must be in the benchmarks directory
278
+
279
+ mode: "mean" # how to determine per-position template depths from multiple template files
280
+ source: "source.bam" # specify file to downsample
281
+
282
+ coefficient: 0.1 # coefficient provided to GATK, samtools, sambamba
283
+
284
+ cpu: 2 # specify hardware resource (used by all steps)
285
+ mem_mb: 16384
286
+ time: "10:00"
287
+ ```
288
+
289
+ When executing the workflow, navigate to the `benchmarks` directory and make sure to use the following arguments:
290
+ ```
291
+ snakemake -p --use-apptainer --apptainer-args '--bind $(pwd)'
292
+ ```
293
+
294
+ A directory for all intermediate files will be created for each chunk defined in `config.yaml` and the final benchmark results will be made available in the `benchmarks` directory as `benchmark-{chunk_name}.tsv`.
@@ -0,0 +1,39 @@
1
+ [build-system]
2
+ requires = ["setuptools>=64", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "samsampleX"
7
+ authors = [
8
+ {name = "Sedat Demiriz", email = "sedat.demiriz@mail.mcgill.ca"},
9
+ {name = "Daniel Taliun", email = "daniel.taliun@mcgill.ca"},
10
+ ]
11
+ version = "0.1.0"
12
+ description = "Depth-aware dynamic BAM file downsampling"
13
+ keywords = ["bam", "alignment", "downsampling", "sampling", "samtools", "sambamba", "gatk", "genome", "bioinformatics", "genomics"]
14
+ requires-python = ">=3.9"
15
+ readme = "README.md"
16
+ license = "MIT"
17
+ dependencies = [
18
+ "xxhash>=3.5.0",
19
+ "pysam>=0.23.3",
20
+ "numpy>=2.3.3",
21
+ "matplotlib>=3.10.8",
22
+ "scipy>=1.17.0",
23
+ ]
24
+
25
+ [project.urls]
26
+ Repository = "https://github.com/sdemiriz/samsampleX"
27
+
28
+ [project.optional-dependencies]
29
+ benchmark = [
30
+ "snakemake>=9.16.3",
31
+ "snakemake-executor-plugin-slurm>=2.3.1",
32
+ "pandas>=2.3.3",
33
+ ]
34
+ test = [
35
+ "pytest>=9.0.2"
36
+ ]
37
+
38
+ [project.scripts]
39
+ samsampleX = "samsamplex.cli:main"