samsampleX 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- samsamplex-0.1.0/PKG-INFO +316 -0
- samsamplex-0.1.0/README.md +294 -0
- samsamplex-0.1.0/pyproject.toml +39 -0
- samsamplex-0.1.0/samsampleX.egg-info/PKG-INFO +316 -0
- samsamplex-0.1.0/samsampleX.egg-info/SOURCES.txt +23 -0
- samsamplex-0.1.0/samsampleX.egg-info/dependency_links.txt +1 -0
- samsamplex-0.1.0/samsampleX.egg-info/entry_points.txt +2 -0
- samsamplex-0.1.0/samsampleX.egg-info/requires.txt +13 -0
- samsamplex-0.1.0/samsampleX.egg-info/top_level.txt +1 -0
- samsamplex-0.1.0/samsamplex/__init__.py +1 -0
- samsamplex-0.1.0/samsamplex/bed.py +146 -0
- samsamplex-0.1.0/samsamplex/cli.py +342 -0
- samsamplex-0.1.0/samsamplex/depth.py +126 -0
- samsamplex-0.1.0/samsamplex/mapback.py +365 -0
- samsamplex-0.1.0/samsamplex/metrics.py +64 -0
- samsamplex-0.1.0/samsamplex/modes.py +4 -0
- samsamplex-0.1.0/samsamplex/plot.py +217 -0
- samsamplex-0.1.0/samsamplex/sample.py +257 -0
- samsamplex-0.1.0/setup.cfg +4 -0
- samsamplex-0.1.0/tests/test_bed.py +209 -0
- samsamplex-0.1.0/tests/test_depth.py +55 -0
- samsamplex-0.1.0/tests/test_mapback.py +232 -0
- samsamplex-0.1.0/tests/test_metrics.py +91 -0
- samsamplex-0.1.0/tests/test_plot.py +135 -0
- samsamplex-0.1.0/tests/test_sample.py +199 -0
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: samsampleX
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Depth-aware dynamic BAM file downsampling
|
|
5
|
+
Author-email: Sedat Demiriz <sedat.demiriz@mail.mcgill.ca>, Daniel Taliun <daniel.taliun@mcgill.ca>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Repository, https://github.com/sdemiriz/samsampleX
|
|
8
|
+
Keywords: bam,alignment,downsampling,sampling,samtools,sambamba,gatk,genome,bioinformatics,genomics
|
|
9
|
+
Requires-Python: >=3.9
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: xxhash>=3.5.0
|
|
12
|
+
Requires-Dist: pysam>=0.23.3
|
|
13
|
+
Requires-Dist: numpy>=2.3.3
|
|
14
|
+
Requires-Dist: matplotlib>=3.10.8
|
|
15
|
+
Requires-Dist: scipy>=1.17.0
|
|
16
|
+
Provides-Extra: benchmark
|
|
17
|
+
Requires-Dist: snakemake>=9.16.3; extra == "benchmark"
|
|
18
|
+
Requires-Dist: snakemake-executor-plugin-slurm>=2.3.1; extra == "benchmark"
|
|
19
|
+
Requires-Dist: pandas>=2.3.3; extra == "benchmark"
|
|
20
|
+
Provides-Extra: test
|
|
21
|
+
Requires-Dist: pytest>=9.0.2; extra == "test"
|
|
22
|
+
|
|
23
|
+
# samsampleX
|
|
24
|
+
A Python-based tool for dynamic BAM file downsampling. Unlike existing tools, which only downsample uniformly based on a single global fraction value, samsampleX samples reads from a source BAM file to match the depth of coverage distribution of one or more template BAM file(s) through a created BED template.
|
|
25
|
+
|
|
26
|
+
## Features:
|
|
27
|
+
- Reproducible, integer seed-based deterministic downsampling
|
|
28
|
+
- Uniform sampling mode: retain a fixed fraction of reads, feature parity with existing tools.
|
|
29
|
+
- Map depth from multiple BAM files to a single BED template via common aggregation statistics (`min`, `mean`, `median`, `max`, `random`).
|
|
30
|
+
- Calculation of quality metrics:
|
|
31
|
+
- Mean absolute error (MAE): mean per-base absolute difference in depth between two BAMs over the region.
|
|
32
|
+
- First-order Wasserstein distance (W1): L1 distance between empirical CDFs of per-base depths.
|
|
33
|
+
- Plotting for visual sampling comparisons, with an option to emit a TSV file of the same data instead.
|
|
34
|
+
|
|
35
|
+
## Installation
|
|
36
|
+
### Requirements
|
|
37
|
+
|
|
38
|
+
- pysam
|
|
39
|
+
- xxHash
|
|
40
|
+
- numpy
|
|
41
|
+
- matplotlib
|
|
42
|
+
- scipy
|
|
43
|
+
- Snakemake (benchmarking only)
|
|
44
|
+
- pytest (testing only)
|
|
45
|
+
|
|
46
|
+
### Build samsampleX
|
|
47
|
+
```bash
|
|
48
|
+
git clone https://github.com/sdemiriz/samsampleX.git
|
|
49
|
+
cd samsampleX
|
|
50
|
+
pip install .
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Usage
|
|
54
|
+
### Mapping
|
|
55
|
+
Extract depth of coverage from one or more template BAM file(s) to a single BED template. When multiple BAMs are provided, per-position depths are combined using the selected `--mode`.
|
|
56
|
+
```bash
|
|
57
|
+
# Single BAM
|
|
58
|
+
samsampleX map \
|
|
59
|
+
--template-bam template.bam \
|
|
60
|
+
--region chr1:1000-2000 \
|
|
61
|
+
--out-bed template.bed
|
|
62
|
+
|
|
63
|
+
# Multiple BAMs (combined per-position using mean)
|
|
64
|
+
samsampleX map \
|
|
65
|
+
--template-bam a.bam b.bam c.bam \
|
|
66
|
+
--region chr1:1000-2000 \
|
|
67
|
+
--mode mean \
|
|
68
|
+
--out-bed template.bed
|
|
69
|
+
```
|
|
70
|
+
| Option | Description | Default |
|
|
71
|
+
|--------|-------------|---------|
|
|
72
|
+
| `--template-bam FILE [FILE ...]` | Input BAM file(s) (required) | - |
|
|
73
|
+
| `--region REGION` | Target region, samtools-style (required) | - |
|
|
74
|
+
| `--out-bed FILE` | Output BED file (required) | - |
|
|
75
|
+
| `--collapse INT` | Merge consecutive positions with depth diff <= INT | `0` |
|
|
76
|
+
| `--mode MODE` | Combine mode when multiple BAMs: `min`, `mean`, `median`, `max`, `random` | `mean` |
|
|
77
|
+
| `--seed INT` | Random seed for `--mode random` | `42` |
|
|
78
|
+
|
|
79
|
+
### Sampling
|
|
80
|
+
Downsample a BAM file based on the provided BED template, combining templates with the selected `--mode` if multiple BEDs are provided. Alternatively, use `--uniform` for position-independent uniform sampling similar to existing tools.
|
|
81
|
+
|
|
82
|
+
**Depth-based sampling (template required):**
|
|
83
|
+
```bash
|
|
84
|
+
samsampleX sample \
|
|
85
|
+
--source-bam high_depth.bam \
|
|
86
|
+
--template-bed template.bed \
|
|
87
|
+
--region chr1:1000-2000 \
|
|
88
|
+
--out-bam sampled.bam
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
**Uniform sampling (no template):**
|
|
92
|
+
```bash
|
|
93
|
+
samsampleX sample \
|
|
94
|
+
--source-bam high_depth.bam \
|
|
95
|
+
--uniform 0.5 \
|
|
96
|
+
--region chr1:1000-2000 \
|
|
97
|
+
--out-bam sampled.bam
|
|
98
|
+
```
|
|
99
|
+
Retains approximately 50% of reads uniformly across the region.
|
|
100
|
+
|
|
101
|
+
| Option | Description | Default |
|
|
102
|
+
|--------|-------------|---------|
|
|
103
|
+
| `--source-bam FILE` | Input BAM to sample reads from (required) | - |
|
|
104
|
+
| `--template-bed FILE` | Template BED file; required unless `--uniform` is used | - |
|
|
105
|
+
| `--uniform FRACTION` | Uniform sampling: retain fraction of reads. Bypasses template-based downsampling. | - |
|
|
106
|
+
| `--region REGION` | Target region, samtools-style (required) | - |
|
|
107
|
+
| `--out-bam FILE` | Output BAM file to write reads to (required) | - |
|
|
108
|
+
| `--mode MODE` | Combine mode for multiple templates: `min`, `mean`, `median`, `max`, `random` | `random` |
|
|
109
|
+
| `--stat STAT` | Statistic for summarising ratio over read span: `min`, `mean`, `median`, `max`, `random` | `mean` |
|
|
110
|
+
| `--seed INT` | Random seed for reproducibility | `42` |
|
|
111
|
+
|
|
112
|
+
### Plotting
|
|
113
|
+
Compare depth of coverage between source, template, and output BAM files. Output either as PNG plot or TSV data.
|
|
114
|
+
|
|
115
|
+
Green is source, orange is template and blue is output depth.
|
|
116
|
+
|
|
117
|
+
TSV contains one column for `position`, and three for respective depths of source, template and output.
|
|
118
|
+
```bash
|
|
119
|
+
# Generate PNG plot
|
|
120
|
+
samsampleX plot \
|
|
121
|
+
--source-bam high_depth.bam \
|
|
122
|
+
--template-bam template.bam \
|
|
123
|
+
--out-bam sampled.bam \
|
|
124
|
+
--region chr1:1000-2000 \
|
|
125
|
+
--out-png coverage_plot.png
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
| Option | Description | Default |
|
|
129
|
+
|--------|-------------|---------|
|
|
130
|
+
| `--source-bam FILE` | Source BAM file (required) | - |
|
|
131
|
+
| `--template-bam FILE` | Template BAM file (mutually exclusive with --template-bed) | - |
|
|
132
|
+
| `--template-bed FILE` | Template BED file (mutually exclusive with --template-bam) | - |
|
|
133
|
+
| `--out-bam FILE` | Output BAM file from sampling (required) | - |
|
|
134
|
+
| `--region REGION` | Target region, samtools-style (required) | - |
|
|
135
|
+
| `--out-png FILE` | Output PNG plot (mutually exclusive with --out-tsv) | - |
|
|
136
|
+
| `--out-tsv FILE` | Output TSV data (mutually exclusive with --out-png) | - |
|
|
137
|
+
|
|
138
|
+
### Mapback
|
|
139
|
+
**If you do not use HLA\*LA and its specific read processing method, feel free to ignore this section.**
|
|
140
|
+
|
|
141
|
+
Remap HLA\*LA PRG-mapped reads back to canonical chr6 coordinates. This is a preprocessing step for BAM files produced by HLA\*LA, which maps reads to a pangenome reference graph (PRG) with synthetic contig names (`PRG_1`, `PRG_2`, ...). The mapback subcommand translates these back to chr6 positions using the HLA\*LA `sequences.txt` file and known HLA gene / alt contig boundaries.
|
|
142
|
+
|
|
143
|
+
The output BAM can then be used as input to `sample` for depth-aware downsampling on chr6.
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
# Step 1: remap PRG reads to chr6
|
|
147
|
+
samsampleX mapback \
|
|
148
|
+
--source-bam hlala_output.bam \
|
|
149
|
+
--region chr6:28000000-34000000 \
|
|
150
|
+
--genome-build GRCh38 \
|
|
151
|
+
--out-bam remapped.bam
|
|
152
|
+
|
|
153
|
+
# Step 2: sample from the remapped BAM
|
|
154
|
+
samsampleX sample \
|
|
155
|
+
--source-bam remapped.bam \
|
|
156
|
+
--template-bed template.bed \
|
|
157
|
+
--region chr6:28000000-34000000 \
|
|
158
|
+
--out-bam sampled.bam
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
| Option | Description | Default |
|
|
162
|
+
|--------|-------------|---------|
|
|
163
|
+
| `--source-bam FILE` | HLA\*LA-remapped BAM file (required) | - |
|
|
164
|
+
| `--region REGION` | Target region on chr6, samtools-style (required) | - |
|
|
165
|
+
| `--out-bam FILE` | Output BAM file (required) | - |
|
|
166
|
+
| `--genome-build BUILD` | Reference genome build: `GRCh38` or `GRCh37` (required) | - |
|
|
167
|
+
| `--prg-seq FILE` | Path to HLA\*LA `sequences.txt` | `HLA-LA/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt` |
|
|
168
|
+
|
|
169
|
+
### Stats
|
|
170
|
+
Compare depth distributions between two inputs over a given region. Each input can be a BAM file or a BED file (auto-detected by extension). Reports mean absolute error (MAE) and Wasserstein-1 distance.
|
|
171
|
+
```bash
|
|
172
|
+
# BAM vs BAM
|
|
173
|
+
samsampleX stats \
|
|
174
|
+
--a template.bam \
|
|
175
|
+
--b sampled.bam \
|
|
176
|
+
--region chr1:1000-2000
|
|
177
|
+
|
|
178
|
+
# BED vs BAM (e.g. combined cohort template against sampled output)
|
|
179
|
+
samsampleX stats \
|
|
180
|
+
--a template.bed \
|
|
181
|
+
--b sampled.bam \
|
|
182
|
+
--region chr1:1000-2000
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
| Option | Description | Default |
|
|
186
|
+
|--------|-------------|---------|
|
|
187
|
+
| `--a FILE` | First input — BAM or BED file (reference) (required) | - |
|
|
188
|
+
| `--b FILE` | Second input — BAM or BED file (comparison) (required) | - |
|
|
189
|
+
| `--region REGION` | Target region, samtools-style (required) | - |
|
|
190
|
+
|
|
191
|
+
## Example
|
|
192
|
+
|
|
193
|
+

|
|
194
|
+
|
|
195
|
+
The following commands showcase an example workflow of a short, arbitrary region on chromosome 21. Three 1000 Genomes Project 30X WGS samples are downloaded and mapped to a template, then used to downsample a GIAB 300X WGS sample in the same region. The results are finally displayed on a plot.
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
```bash
|
|
199
|
+
cd examples/
|
|
200
|
+
|
|
201
|
+
# Download reference genome (GRCh38)
|
|
202
|
+
wget https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa
|
|
203
|
+
|
|
204
|
+
# Download the first three 1K Genomes 30X WGS samples from
|
|
205
|
+
# https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/1000G_2504_high_coverage.sequence.index
|
|
206
|
+
wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR3239480/NA12718.final.cram -O NA12718.cram && samtools index NA12718.cram
|
|
207
|
+
wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR3239481/NA12748.final.cram -O NA12748.cram && samtools index NA12748.cram
|
|
208
|
+
wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR3239482/NA12775.final.cram -O NA12775.cram && samtools index NA12775.cram
|
|
209
|
+
|
|
210
|
+
# Convert to BAM, restrict to target region and index
|
|
211
|
+
samtools view NA12718.cram chr21:10000000-10010000 -b -o NA12718.bam -T GRCh38_full_analysis_set_plus_decoy_hla.fa && samtools index NA12718.bam
|
|
212
|
+
samtools view NA12748.cram chr21:10000000-10010000 -b -o NA12748.bam -T GRCh38_full_analysis_set_plus_decoy_hla.fa && samtools index NA12748.bam
|
|
213
|
+
samtools view NA12775.cram chr21:10000000-10010000 -b -o NA12775.bam -T GRCh38_full_analysis_set_plus_decoy_hla.fa && samtools index NA12775.bam
|
|
214
|
+
|
|
215
|
+
# Run samsampleX workflow
|
|
216
|
+
samsampleX map \
|
|
217
|
+
--template-bam NA12718.bam NA12748.bam NA12775.bam \
|
|
218
|
+
--region chr21:10000000-10010000 \
|
|
219
|
+
--mode mean \
|
|
220
|
+
--collapse 0 \
|
|
221
|
+
--out-bed template.bed
|
|
222
|
+
# template.bed should match example-template.bed
|
|
223
|
+
|
|
224
|
+
# Source BAM+index is provided in the examples directory, created by subsetting to target region from
|
|
225
|
+
# https://ftp.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/AshkenazimTrio/HG002_NA24385_son/NIST_HiSeq_HG002_Homogeneity-10953946/NHGRI_Illumina300X_AJtrio_novoalign_bams/HG002.GRCh38.300x.bam
|
|
226
|
+
samsampleX sample \
|
|
227
|
+
--source-bam HG002.GRCh38.300x.chr21:10000000-10010000.bam \
|
|
228
|
+
--template-bed template.bed \
|
|
229
|
+
--region chr21:10000000-10010000 \
|
|
230
|
+
--seed 42 \
|
|
231
|
+
--out-bam sampled.bam
|
|
232
|
+
|
|
233
|
+
samtools index sampled.bam
|
|
234
|
+
|
|
235
|
+
samsampleX plot \
|
|
236
|
+
--source-bam HG002.GRCh38.300x.chr21:10000000-10010000.bam \
|
|
237
|
+
--template-bed template.bed \
|
|
238
|
+
--out-bam sampled.bam \
|
|
239
|
+
--region chr21:10000000-10010000 \
|
|
240
|
+
--out-png plot.png
|
|
241
|
+
# plot.png should match example-plot.png
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
## Testing
|
|
245
|
+
|
|
246
|
+
A `pytest` test suite is available. Run with the `-v` flag for a detailed report.
|
|
247
|
+
```bash
|
|
248
|
+
pytest -v
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
## Algorithm rundown
|
|
252
|
+
|
|
253
|
+
### Mapping
|
|
254
|
+
1. Parse target region from first BAM header
|
|
255
|
+
2. Compute per-position depth of coverage for each BAM over the region
|
|
256
|
+
3. If multiple BAMs: combine depths per-position using `--mode` (min, mean, median, max, random)
|
|
257
|
+
4. Optionally collapse consecutive similar depths (`--collapse`)
|
|
258
|
+
5. Write to BED4 format (`chrom`, `start`, `end`, `depth` columns)
|
|
259
|
+
|
|
260
|
+
### Sampling
|
|
261
|
+
1. **Uniform mode** (`--uniform FRACTION`): Skip template downsampling. For each read, hash the read name with xxHash32 to get $f_{read} \in [0, 1)$; keep if $f_{read} < FRACTION$. Deterministic and position-independent.
|
|
262
|
+
2. **Depth-based mode**: Load template depths from BED file(s); if multiple templates are provided, combine them per-position using the selected `--mode`
|
|
263
|
+
3. Compute source depths from BAM
|
|
264
|
+
4. Calculate per-position sampling coefficient: $ratio(i) = \min(1,\; depth_{template}(i) \;/\; depth_{source}(i))$
|
|
265
|
+
- Positions where the template depth meets or exceeds the source depth get coefficient 1.0 (keep all reads)
|
|
266
|
+
- Positions with zero source depth get coefficient 0.0
|
|
267
|
+
5. Build a cumulative sum of the coefficient array for O(1) range queries
|
|
268
|
+
6. For each read in the source BAM:
|
|
269
|
+
- Hash read name with xxHash32 to produce a deterministic fraction $f_{read} \in [0, 1)$
|
|
270
|
+
- Summarise the coefficient over the read's covered positions using `--stat` (min, mean, median, max, random; the default is mean, computed via the cumulative sum). `random` picks one overlap ratio from a deterministic index (read span + seed).
|
|
271
|
+
- Keep the read if $f_{read} < ratio_{read}$
|
|
272
|
+
|
|
273
|
+
## Metrics
|
|
274
|
+
| Metric | Significance |
|
|
275
|
+
| ------ | ------------ |
|
|
276
|
+
| Mean Absolute Error | Average absolute per-base depth difference between the two BAMs |
|
|
277
|
+
| Wasserstein-1 Distance | L1 distance between empirical CDFs of depth (scales with region length) |
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
## Benchmarking
|
|
281
|
+
Benchmarking is done by a `snakemake` workflow in the `benchmarks` directory, and thus `snakemake` should be installed beforehand (for HPC systems, also install `snakemake-executor-plugin-slurm` or other plugin compatible with your system type).
|
|
282
|
+
|
|
283
|
+
An `Apptainer` container definition `bench.def` that contains installs for `GATK`, `samtools`, `sambamba` and `samsampleX` is included. Build this container using `apptainer build bench.sif bench.def` before running the workflow.
|
|
284
|
+
|
|
285
|
+
Configure the benchmarking parameters in `config.yaml` in the same directory: copy and rename an existing chunk with all parameters and populate the values. All input files are expected to be found in the same directory as `config.yaml`, BAM files should be indexed using `samtools index`.
|
|
286
|
+
|
|
287
|
+
```{yaml}
|
|
288
|
+
# config.yaml
|
|
289
|
+
benchmarks: # all chunks should be children of this header
|
|
290
|
+
wgs-chr21: # arbitrary name for benchmarking instance, parameters will be children
|
|
291
|
+
chr: "chr21" # specify contig
|
|
292
|
+
start: 1 # region start coordinate
|
|
293
|
+
end: 46709982 # region end coordinate
|
|
294
|
+
seed: 42 # random seed (base)
|
|
295
|
+
n_replicates: 1 # replicate count, will affect seed
|
|
296
|
+
# (e.g. seed=42, n=3 will use seeds 43, 44, 45)
|
|
297
|
+
collapse: 0 # define smoothing during mapping step
|
|
298
|
+
templates: # specify files to use as templates in sampling
|
|
299
|
+
- "template.bam" # all files must be in the benchmarks directory
|
|
300
|
+
|
|
301
|
+
mode: "mean" # how to determine per-position template depths from multiple template files
|
|
302
|
+
source: "source.bam" # specify file to downsample
|
|
303
|
+
|
|
304
|
+
coefficient: 0.1 # coefficient provided to GATK, samtools, sambamba
|
|
305
|
+
|
|
306
|
+
cpu: 2 # specify hardware resource (used by all steps)
|
|
307
|
+
mem_mb: 16384
|
|
308
|
+
time: "10:00"
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
When executing the workflow, navigate to the `benchmarks` directory and make sure to use the following arguments:
|
|
312
|
+
```
|
|
313
|
+
snakemake -p --use-apptainer --apptainer-args '--bind $(pwd)'
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
A directory for all intermediate files will be created for each chunk defined in `config.yaml` and the final benchmark results will be made available in the `benchmarks` directory as `benchmark-{chunk_name}.tsv`.
|
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
# samsampleX
|
|
2
|
+
A Python-based tool for dynamic BAM file downsampling. Unlike existing tools, which only downsample uniformly based on a single global fraction value, samsampleX samples reads from a source BAM file to match the depth of coverage distribution of one or more template BAM file(s) through a created BED template.
|
|
3
|
+
|
|
4
|
+
## Features:
|
|
5
|
+
- Reproducible, integer seed-based deterministic downsampling
|
|
6
|
+
- Uniform sampling mode: retain a fixed fraction of reads, feature parity with existing tools.
|
|
7
|
+
- Map depth from multiple BAM files to a single BED template via common aggregation statistics (`min`, `mean`, `median`, `max`, `random`).
|
|
8
|
+
- Calculation of quality metrics:
|
|
9
|
+
- Mean absolute error (MAE): mean per-base absolute difference in depth between two BAMs over the region.
|
|
10
|
+
- First-order Wasserstein distance (W1): L1 distance between empirical CDFs of per-base depths.
|
|
11
|
+
- Plotting for visual sampling comparisons, with an option to emit a TSV file of the same data instead.
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
### Requirements
|
|
15
|
+
|
|
16
|
+
- pysam
|
|
17
|
+
- xxHash
|
|
18
|
+
- numpy
|
|
19
|
+
- matplotlib
|
|
20
|
+
- scipy
|
|
21
|
+
- Snakemake (benchmarking only)
|
|
22
|
+
- pytest (testing only)
|
|
23
|
+
|
|
24
|
+
### Build samsampleX
|
|
25
|
+
```bash
|
|
26
|
+
git clone https://github.com/sdemiriz/samsampleX.git
|
|
27
|
+
cd samsampleX
|
|
28
|
+
pip install .
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Usage
|
|
32
|
+
### Mapping
|
|
33
|
+
Extract depth of coverage from one or more template BAM file(s) to a single BED template. When multiple BAMs are provided, per-position depths are combined using the selected `--mode`.
|
|
34
|
+
```bash
|
|
35
|
+
# Single BAM
|
|
36
|
+
samsampleX map \
|
|
37
|
+
--template-bam template.bam \
|
|
38
|
+
--region chr1:1000-2000 \
|
|
39
|
+
--out-bed template.bed
|
|
40
|
+
|
|
41
|
+
# Multiple BAMs (combined per-position using mean)
|
|
42
|
+
samsampleX map \
|
|
43
|
+
--template-bam a.bam b.bam c.bam \
|
|
44
|
+
--region chr1:1000-2000 \
|
|
45
|
+
--mode mean \
|
|
46
|
+
--out-bed template.bed
|
|
47
|
+
```
|
|
48
|
+
| Option | Description | Default |
|
|
49
|
+
|--------|-------------|---------|
|
|
50
|
+
| `--template-bam FILE [FILE ...]` | Input BAM file(s) (required) | - |
|
|
51
|
+
| `--region REGION` | Target region, samtools-style (required) | - |
|
|
52
|
+
| `--out-bed FILE` | Output BED file (required) | - |
|
|
53
|
+
| `--collapse INT` | Merge consecutive positions with depth diff <= INT | `0` |
|
|
54
|
+
| `--mode MODE` | Combine mode when multiple BAMs: `min`, `mean`, `median`, `max`, `random` | `mean` |
|
|
55
|
+
| `--seed INT` | Random seed for `--mode random` | `42` |
|
|
56
|
+
|
|
57
|
+
### Sampling
|
|
58
|
+
Downsample a BAM file based on the provided BED template, combining templates with the selected `--mode` if multiple BEDs are provided. Alternatively, use `--uniform` for position-independent uniform sampling similar to existing tools.
|
|
59
|
+
|
|
60
|
+
**Depth-based sampling (template required):**
|
|
61
|
+
```bash
|
|
62
|
+
samsampleX sample \
|
|
63
|
+
--source-bam high_depth.bam \
|
|
64
|
+
--template-bed template.bed \
|
|
65
|
+
--region chr1:1000-2000 \
|
|
66
|
+
--out-bam sampled.bam
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
**Uniform sampling (no template):**
|
|
70
|
+
```bash
|
|
71
|
+
samsampleX sample \
|
|
72
|
+
--source-bam high_depth.bam \
|
|
73
|
+
--uniform 0.5 \
|
|
74
|
+
--region chr1:1000-2000 \
|
|
75
|
+
--out-bam sampled.bam
|
|
76
|
+
```
|
|
77
|
+
Retains approximately 50% of reads uniformly across the region.
|
|
78
|
+
|
|
79
|
+
| Option | Description | Default |
|
|
80
|
+
|--------|-------------|---------|
|
|
81
|
+
| `--source-bam FILE` | Input BAM to sample reads from (required) | - |
|
|
82
|
+
| `--template-bed FILE` | Template BED file; required unless `--uniform` is used | - |
|
|
83
|
+
| `--uniform FRACTION` | Uniform sampling: retain fraction of reads. Bypasses template-based downsampling. | - |
|
|
84
|
+
| `--region REGION` | Target region, samtools-style (required) | - |
|
|
85
|
+
| `--out-bam FILE` | Output BAM file to write reads to (required) | - |
|
|
86
|
+
| `--mode MODE` | Combine mode for multiple templates: `min`, `mean`, `median`, `max`, `random` | `random` |
|
|
87
|
+
| `--stat STAT` | Statistic for summarising ratio over read span: `min`, `mean`, `median`, `max`, `random` | `mean` |
|
|
88
|
+
| `--seed INT` | Random seed for reproducibility | `42` |
|
|
89
|
+
|
|
90
|
+
### Plotting
|
|
91
|
+
Compare depth of coverage between source, template, and output BAM files. Output either as PNG plot or TSV data.
|
|
92
|
+
|
|
93
|
+
Green is source, orange is template and blue is output depth.
|
|
94
|
+
|
|
95
|
+
TSV contains one column for `position`, and three for respective depths of source, template and output.
|
|
96
|
+
```bash
|
|
97
|
+
# Generate PNG plot
|
|
98
|
+
samsampleX plot \
|
|
99
|
+
--source-bam high_depth.bam \
|
|
100
|
+
--template-bam template.bam \
|
|
101
|
+
--out-bam sampled.bam \
|
|
102
|
+
--region chr1:1000-2000 \
|
|
103
|
+
--out-png coverage_plot.png
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
| Option | Description | Default |
|
|
107
|
+
|--------|-------------|---------|
|
|
108
|
+
| `--source-bam FILE` | Source BAM file (required) | - |
|
|
109
|
+
| `--template-bam FILE` | Template BAM file (mutually exclusive with --template-bed) | - |
|
|
110
|
+
| `--template-bed FILE` | Template BED file (mutually exclusive with --template-bam) | - |
|
|
111
|
+
| `--out-bam FILE` | Output BAM file from sampling (required) | - |
|
|
112
|
+
| `--region REGION` | Target region, samtools-style (required) | - |
|
|
113
|
+
| `--out-png FILE` | Output PNG plot (mutually exclusive with --out-tsv) | - |
|
|
114
|
+
| `--out-tsv FILE` | Output TSV data (mutually exclusive with --out-png) | - |
|
|
115
|
+
|
|
116
|
+
### Mapback
|
|
117
|
+
**If you do not use HLA\*LA and its specific read processing method, feel free to ignore this section.**
|
|
118
|
+
|
|
119
|
+
Remap HLA\*LA PRG-mapped reads back to canonical chr6 coordinates. This is a preprocessing step for BAM files produced by HLA\*LA, which maps reads to a pangenome reference graph (PRG) with synthetic contig names (`PRG_1`, `PRG_2`, ...). The mapback subcommand translates these back to chr6 positions using the HLA\*LA `sequences.txt` file and known HLA gene / alt contig boundaries.
|
|
120
|
+
|
|
121
|
+
The output BAM can then be used as input to `sample` for depth-aware downsampling on chr6.
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
# Step 1: remap PRG reads to chr6
|
|
125
|
+
samsampleX mapback \
|
|
126
|
+
--source-bam hlala_output.bam \
|
|
127
|
+
--region chr6:28000000-34000000 \
|
|
128
|
+
--genome-build GRCh38 \
|
|
129
|
+
--out-bam remapped.bam
|
|
130
|
+
|
|
131
|
+
# Step 2: sample from the remapped BAM
|
|
132
|
+
samsampleX sample \
|
|
133
|
+
--source-bam remapped.bam \
|
|
134
|
+
--template-bed template.bed \
|
|
135
|
+
--region chr6:28000000-34000000 \
|
|
136
|
+
--out-bam sampled.bam
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
| Option | Description | Default |
|
|
140
|
+
|--------|-------------|---------|
|
|
141
|
+
| `--source-bam FILE` | HLA\*LA-remapped BAM file (required) | - |
|
|
142
|
+
| `--region REGION` | Target region on chr6, samtools-style (required) | - |
|
|
143
|
+
| `--out-bam FILE` | Output BAM file (required) | - |
|
|
144
|
+
| `--genome-build BUILD` | Reference genome build: `GRCh38` or `GRCh37` (required) | - |
|
|
145
|
+
| `--prg-seq FILE` | Path to HLA\*LA `sequences.txt` | `HLA-LA/graphs/PRG_MHC_GRCh38_withIMGT/sequences.txt` |
|
|
146
|
+
|
|
147
|
+
### Stats
|
|
148
|
+
Compare depth distributions between two inputs over a given region. Each input can be a BAM file or a BED file (auto-detected by extension). Reports mean absolute error (MAE) and Wasserstein-1 distance.
|
|
149
|
+
```bash
|
|
150
|
+
# BAM vs BAM
|
|
151
|
+
samsampleX stats \
|
|
152
|
+
--a template.bam \
|
|
153
|
+
--b sampled.bam \
|
|
154
|
+
--region chr1:1000-2000
|
|
155
|
+
|
|
156
|
+
# BED vs BAM (e.g. combined cohort template against sampled output)
|
|
157
|
+
samsampleX stats \
|
|
158
|
+
--a template.bed \
|
|
159
|
+
--b sampled.bam \
|
|
160
|
+
--region chr1:1000-2000
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
| Option | Description | Default |
|
|
164
|
+
|--------|-------------|---------|
|
|
165
|
+
| `--a FILE` | First input — BAM or BED file (reference) (required) | - |
|
|
166
|
+
| `--b FILE` | Second input — BAM or BED file (comparison) (required) | - |
|
|
167
|
+
| `--region REGION` | Target region, samtools-style (required) | - |
|
|
168
|
+
|
|
169
|
+
## Example
|
|
170
|
+
|
|
171
|
+

|
|
172
|
+
|
|
173
|
+
The following commands showcase an example workflow of a short, arbitrary region on chromosome 21. Three 1000 Genomes Project 30X WGS samples are downloaded and mapped to a template, then used to downsample a GIAB 300X WGS sample in the same region. The results are finally displayed on a plot.
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
cd examples/
|
|
178
|
+
|
|
179
|
+
# Download reference genome (GRCh38)
|
|
180
|
+
wget https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa
|
|
181
|
+
|
|
182
|
+
# Download the first three 1K Genomes 30X WGS samples from
|
|
183
|
+
# https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/1000G_2504_high_coverage.sequence.index
|
|
184
|
+
wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR3239480/NA12718.final.cram -O NA12718.cram && samtools index NA12718.cram
|
|
185
|
+
wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR3239481/NA12748.final.cram -O NA12748.cram && samtools index NA12748.cram
|
|
186
|
+
wget ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR323/ERR3239482/NA12775.final.cram -O NA12775.cram && samtools index NA12775.cram
|
|
187
|
+
|
|
188
|
+
# Convert to BAM, restrict to target region and index
|
|
189
|
+
samtools view NA12718.cram chr21:10000000-10010000 -b -o NA12718.bam -T GRCh38_full_analysis_set_plus_decoy_hla.fa && samtools index NA12718.bam
|
|
190
|
+
samtools view NA12748.cram chr21:10000000-10010000 -b -o NA12748.bam -T GRCh38_full_analysis_set_plus_decoy_hla.fa && samtools index NA12748.bam
|
|
191
|
+
samtools view NA12775.cram chr21:10000000-10010000 -b -o NA12775.bam -T GRCh38_full_analysis_set_plus_decoy_hla.fa && samtools index NA12775.bam
|
|
192
|
+
|
|
193
|
+
# Run samsampleX workflow
|
|
194
|
+
samsampleX map \
|
|
195
|
+
--template-bam NA12718.bam NA12748.bam NA12775.bam \
|
|
196
|
+
--region chr21:10000000-10010000 \
|
|
197
|
+
--mode mean \
|
|
198
|
+
--collapse 0 \
|
|
199
|
+
--out-bed template.bed
|
|
200
|
+
# template.bed should match example-template.bed
|
|
201
|
+
|
|
202
|
+
# Source BAM+index is provided in the examples directory, created by subsetting to target region from
|
|
203
|
+
# https://ftp.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/AshkenazimTrio/HG002_NA24385_son/NIST_HiSeq_HG002_Homogeneity-10953946/NHGRI_Illumina300X_AJtrio_novoalign_bams/HG002.GRCh38.300x.bam
|
|
204
|
+
samsampleX sample \
|
|
205
|
+
--source-bam HG002.GRCh38.300x.chr21:10000000-10010000.bam \
|
|
206
|
+
--template-bed template.bed \
|
|
207
|
+
--region chr21:10000000-10010000 \
|
|
208
|
+
--seed 42 \
|
|
209
|
+
--out-bam sampled.bam
|
|
210
|
+
|
|
211
|
+
samtools index sampled.bam
|
|
212
|
+
|
|
213
|
+
samsampleX plot \
|
|
214
|
+
--source-bam HG002.GRCh38.300x.chr21:10000000-10010000.bam \
|
|
215
|
+
--template-bed template.bed \
|
|
216
|
+
--out-bam sampled.bam \
|
|
217
|
+
--region chr21:10000000-10010000 \
|
|
218
|
+
--out-png plot.png
|
|
219
|
+
# plot.png should match example-plot.png
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
## Testing
|
|
223
|
+
|
|
224
|
+
A `pytest` test suite is available. Run with the `-v` flag for a detailed report.
|
|
225
|
+
```bash
|
|
226
|
+
pytest -v
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
## Algorithm rundown
|
|
230
|
+
|
|
231
|
+
### Mapping
|
|
232
|
+
1. Parse target region from first BAM header
|
|
233
|
+
2. Compute per-position depth of coverage for each BAM over the region
|
|
234
|
+
3. If multiple BAMs: combine depths per-position using `--mode` (min, mean, median, max, random)
|
|
235
|
+
4. Optionally collapse consecutive similar depths (`--collapse`)
|
|
236
|
+
5. Write to BED4 format (`chrom`, `start`, `end`, `depth` columns)
|
|
237
|
+
|
|
238
|
+
### Sampling
|
|
239
|
+
1. **Uniform mode** (`--uniform FRACTION`): Skip template downsampling. For each read, hash the read name with xxHash32 to get $f_{read} \in [0, 1)$; keep if $f_{read} < FRACTION$. Deterministic and position-independent.
|
|
240
|
+
2. **Depth-based mode**: Load template depths from BED file(s); if multiple templates are provided, combine them per-position using the selected `--mode`
|
|
241
|
+
3. Compute source depths from BAM
|
|
242
|
+
4. Calculate per-position sampling coefficient: $ratio(i) = \min(1,\; depth_{template}(i) \;/\; depth_{source}(i))$
|
|
243
|
+
- Positions where the template depth meets or exceeds the source depth get coefficient 1.0 (keep all reads)
|
|
244
|
+
- Positions with zero source depth get coefficient 0.0
|
|
245
|
+
5. Build a cumulative sum of the coefficient array for O(1) range queries
|
|
246
|
+
6. For each read in the source BAM:
|
|
247
|
+
- Hash read name with xxHash32 to produce a deterministic fraction $f_{read} \in [0, 1)$
|
|
248
|
+
- Summarise the coefficient over the read's covered positions using `--stat` (min, mean, median, max, random; the default is mean, computed via the cumulative sum). `random` picks one overlap ratio from a deterministic index (read span + seed).
|
|
249
|
+
- Keep the read if $f_{read} < ratio_{read}$
|
|
250
|
+
|
|
251
|
+
## Metrics
|
|
252
|
+
| Metric | Significance |
|
|
253
|
+
| ------ | ------------ |
|
|
254
|
+
| Mean Absolute Error | Average absolute per-base depth difference between the two BAMs |
|
|
255
|
+
| Wasserstein-1 Distance | L1 distance between empirical CDFs of depth (scales with region length) |
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
## Benchmarking
|
|
259
|
+
Benchmarking is done by a `snakemake` workflow in the `benchmarks` directory, and thus `snakemake` should be installed beforehand (for HPC systems, also install `snakemake-executor-plugin-slurm` or other plugin compatible with your system type).
|
|
260
|
+
|
|
261
|
+
An `Apptainer` container definition `bench.def` that contains installs for `GATK`, `samtools`, `sambamba` and `samsampleX` is included. Build this container using `apptainer build bench.sif bench.def` before running the workflow.
|
|
262
|
+
|
|
263
|
+
Configure the benchmarking parameters in `config.yaml` in the same directory: copy and rename an existing chunk with all parameters and populate the values. All input files are expected to be found in the same directory as `config.yaml`, BAM files should be indexed using `samtools index`.
|
|
264
|
+
|
|
265
|
+
```{yaml}
|
|
266
|
+
# config.yaml
|
|
267
|
+
benchmarks: # all chunks should be children of this header
|
|
268
|
+
wgs-chr21: # arbitrary name for benchmarking instance, parameters will be children
|
|
269
|
+
chr: "chr21" # specify contig
|
|
270
|
+
start: 1 # region start coordinate
|
|
271
|
+
end: 46709982 # region end coordinate
|
|
272
|
+
seed: 42 # random seed (base)
|
|
273
|
+
n_replicates: 1 # replicate count, will affect seed
|
|
274
|
+
# (e.g. seed=42, n=3 will use seeds 43, 44, 45)
|
|
275
|
+
collapse: 0 # define smoothing during mapping step
|
|
276
|
+
templates: # specify files to use as templates in sampling
|
|
277
|
+
- "template.bam" # all files must be in the benchmarks directory
|
|
278
|
+
|
|
279
|
+
mode: "mean" # how to determine per-position template depths from multiple template files
|
|
280
|
+
source: "source.bam" # specify file to downsample
|
|
281
|
+
|
|
282
|
+
coefficient: 0.1 # coefficient provided to GATK, samtools, sambamba
|
|
283
|
+
|
|
284
|
+
cpu: 2 # specify hardware resource (used by all steps)
|
|
285
|
+
mem_mb: 16384
|
|
286
|
+
time: "10:00"
|
|
287
|
+
```
|
|
288
|
+
|
|
289
|
+
When executing the workflow, navigate to the `benchmarks` directory and make sure to use the following arguments:
|
|
290
|
+
```
|
|
291
|
+
snakemake -p --use-apptainer --apptainer-args '--bind $(pwd)'
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
A directory for all intermediate files will be created for each chunk defined in `config.yaml` and the final benchmark results will be made available in the `benchmarks` directory as `benchmark-{chunk_name}.tsv`.
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=64", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "samsampleX"
|
|
7
|
+
authors = [
|
|
8
|
+
{name = "Sedat Demiriz", email = "sedat.demiriz@mail.mcgill.ca"},
|
|
9
|
+
{name = "Daniel Taliun", email = "daniel.taliun@mcgill.ca"},
|
|
10
|
+
]
|
|
11
|
+
version = "0.1.0"
|
|
12
|
+
description = "Depth-aware dynamic BAM file downsampling"
|
|
13
|
+
keywords = ["bam", "alignment", "downsampling", "sampling", "samtools", "sambamba", "gatk", "genome", "bioinformatics", "genomics"]
|
|
14
|
+
requires-python = ">=3.9"
|
|
15
|
+
readme = "README.md"
|
|
16
|
+
license = "MIT"
|
|
17
|
+
dependencies = [
|
|
18
|
+
"xxhash>=3.5.0",
|
|
19
|
+
"pysam>=0.23.3",
|
|
20
|
+
"numpy>=2.3.3",
|
|
21
|
+
"matplotlib>=3.10.8",
|
|
22
|
+
"scipy>=1.17.0",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[project.urls]
|
|
26
|
+
Repository = "https://github.com/sdemiriz/samsampleX"
|
|
27
|
+
|
|
28
|
+
[project.optional-dependencies]
|
|
29
|
+
benchmark = [
|
|
30
|
+
"snakemake>=9.16.3",
|
|
31
|
+
"snakemake-executor-plugin-slurm>=2.3.1",
|
|
32
|
+
"pandas>=2.3.3",
|
|
33
|
+
]
|
|
34
|
+
test = [
|
|
35
|
+
"pytest>=9.0.2"
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
[project.scripts]
|
|
39
|
+
samsampleX = "samsamplex.cli:main"
|