bam2tensor 2.5__tar.gz → 2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bam2tensor-2.5 → bam2tensor-2.6}/.github/workflows/docs.yml +1 -1
- {bam2tensor-2.5 → bam2tensor-2.6}/.github/workflows/release.yml +3 -3
- {bam2tensor-2.5 → bam2tensor-2.6}/PKG-INFO +96 -2
- {bam2tensor-2.5 → bam2tensor-2.6}/README.md +95 -1
- {bam2tensor-2.5 → bam2tensor-2.6}/pyproject.toml +1 -1
- {bam2tensor-2.5 → bam2tensor-2.6}/src/bam2tensor/__init__.py +1 -1
- {bam2tensor-2.5 → bam2tensor-2.6}/src/bam2tensor/functions.py +42 -18
- {bam2tensor-2.5 → bam2tensor-2.6}/tests/test_functions.py +127 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/tests/test_inspect.py +1 -1
- {bam2tensor-2.5 → bam2tensor-2.6}/uv.lock +131 -115
- {bam2tensor-2.5 → bam2tensor-2.6}/.darglint +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/.editorconfig +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/.gitattributes +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/.github/actions/setup-env/action.yml +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/.github/dependabot.yml +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/.github/labels.yml +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/.github/release-drafter.yml +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/.github/workflows/constraints.txt +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/.github/workflows/labeler.yml +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/.github/workflows/tests.yml +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/.gitignore +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/.pre-commit-config.yaml +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/CLAUDE.md +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/CONTRIBUTING.md +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/LICENSE +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/SECURITY.md +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/docs/Makefile +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/docs/conf.py +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/docs/contributing.md +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/docs/index.md +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/docs/license.md +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/docs/logo/604669_dna turning into math, computer _xl-1024-v1-0.png +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/docs/logo/bam2tensor-logo.afdesign +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/docs/logo/bam2tensor-logo.png +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/docs/make.bat +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/docs/nano-banana-overview-shrunk.png +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/docs/reference.md +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/docs/templates/package.rst_t +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/noxfile.py +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/src/bam2tensor/__main__.py +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/src/bam2tensor/embedding.py +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/src/bam2tensor/inspect.py +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/src/bam2tensor/metadata.py +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/src/bam2tensor/py.typed +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/src/bam2tensor/reference.py +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/tests/__init__.py +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/tests/test_duplication.py +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/tests/test_embedding.py +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/tests/test_fasta.fa +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/tests/test_filters.py +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/tests/test_main.py +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/tests/test_metadata.py +0 -0
- {bam2tensor-2.5 → bam2tensor-2.6}/tests/test_reference.py +0 -0
|
@@ -67,16 +67,16 @@ jobs:
|
|
|
67
67
|
|
|
68
68
|
- name: Publish package on PyPI
|
|
69
69
|
if: steps.check-version.outputs.tag || steps.check-tag.outputs.tag
|
|
70
|
-
uses: pypa/gh-action-pypi-publish@v1.
|
|
70
|
+
uses: pypa/gh-action-pypi-publish@v1.14.0
|
|
71
71
|
|
|
72
72
|
- name: Publish package on TestPyPI
|
|
73
73
|
if: (!steps.check-version.outputs.tag && !steps.check-tag.outputs.tag)
|
|
74
|
-
uses: pypa/gh-action-pypi-publish@v1.
|
|
74
|
+
uses: pypa/gh-action-pypi-publish@v1.14.0
|
|
75
75
|
with:
|
|
76
76
|
repository-url: https://test.pypi.org/legacy/
|
|
77
77
|
|
|
78
78
|
- name: Publish the release notes
|
|
79
|
-
uses: release-drafter/release-drafter@v7.
|
|
79
|
+
uses: release-drafter/release-drafter@v7.2.1
|
|
80
80
|
with:
|
|
81
81
|
publish: ${{ steps.check-version.outputs.tag != '' || steps.check-tag.outputs.tag != '' }}
|
|
82
82
|
tag: ${{ steps.check-version.outputs.tag || steps.check-tag.outputs.tag }}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bam2tensor
|
|
3
|
-
Version: 2.5
|
|
3
|
+
Version: 2.6
|
|
4
4
|
Summary: Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation
|
|
5
5
|
Project-URL: Homepage, https://github.com/mcwdsi/bam2tensor
|
|
6
6
|
Project-URL: Repository, https://github.com/mcwdsi/bam2tensor
|
|
@@ -72,6 +72,7 @@ Description-Content-Type: text/markdown
|
|
|
72
72
|
- [Custom Output Directory](#custom-output-directory)
|
|
73
73
|
- [Using a Custom Genome](#using-a-custom-genome)
|
|
74
74
|
- [Command-Line Options](#command-line-options)
|
|
75
|
+
- [Filtering Conversion Errors](#filtering-conversion-errors)
|
|
75
76
|
- [Inspecting Output Files](#inspecting-output-files)
|
|
76
77
|
- [Output Data Structure](#output-data-structure)
|
|
77
78
|
- [Per-Read Fragment Length (TLEN)](#per-read-fragment-length-tlen)
|
|
@@ -99,6 +100,7 @@ Description-Content-Type: text/markdown
|
|
|
99
100
|
- **Caching**: CpG site indexing is cached to accelerate repeated runs on the same genome
|
|
100
101
|
- **Quality Filtering**: Configurable mapping quality thresholds
|
|
101
102
|
- **Per-Read Fragment Length**: Stores BAM TLEN (template length) alongside the methylation tensor for joint fragment-methylation analysis
|
|
103
|
+
- **Conversion-Error Filters**: Optional per-read filters for incomplete bisulfite/EM-seq conversion (ported from `nebiolabs/mark-nonconverted-reads`) and EM-seq fragment-level over-conversion (Loyfer et al. 2026)
|
|
102
104
|
|
|
103
105
|
## Requirements
|
|
104
106
|
|
|
@@ -256,6 +258,25 @@ Options:
|
|
|
256
258
|
determine CpG sites).
|
|
257
259
|
--quality-limit INTEGER Quality filter for aligned reads (default =
|
|
258
260
|
20)
|
|
261
|
+
--filter-non-converted Drop reads with >= --non-converted-threshold
|
|
262
|
+
retained non-CpG cytosines, the signature of
|
|
263
|
+
incomplete bisulfite/EM-seq conversion (port
|
|
264
|
+
of nebiolabs/mark-nonconverted-reads).
|
|
265
|
+
Default: off.
|
|
266
|
+
--non-converted-threshold INTEGER
|
|
267
|
+
Minimum count of retained non-CpG cytosines
|
|
268
|
+
to drop a read (default = 3, matches NEB
|
|
269
|
+
mark-nonconverted-reads).
|
|
270
|
+
--filter-em-overconversion Drop EM-seq reads whose covered CpGs are all
|
|
271
|
+
called unmethylated and cover at least --em-
|
|
272
|
+
overconversion-min-cpgs sites (heuristic for
|
|
273
|
+
the fragment-level over-conversion artifact
|
|
274
|
+
described in Loyfer et al. bioRxiv
|
|
275
|
+
2026.03.24.713040). Default: off.
|
|
276
|
+
--em-overconversion-min-cpgs INTEGER
|
|
277
|
+
Minimum covered CpG count required before
|
|
278
|
+
the EM over-conversion filter will drop a
|
|
279
|
+
read (default = 3).
|
|
259
280
|
--verbose Verbose output.
|
|
260
281
|
--skip-cache De-novo generate CpG sites (slow).
|
|
261
282
|
--debug Debug mode (extensive validity checking +
|
|
@@ -281,6 +302,10 @@ Options:
|
|
|
281
302
|
| `--expected-chromosomes` | Comma-separated list of chromosome names to process. Chromosomes not in this list are skipped. Defaults to human autosomes + sex chromosomes. |
|
|
282
303
|
| `--reference-fasta` | Path to the reference genome FASTA file. Must match the genome used for alignment. |
|
|
283
304
|
| `--quality-limit` | Minimum mapping quality score (MAPQ) for reads to be included. Default is 20. |
|
|
305
|
+
| `--filter-non-converted` | Drop reads carrying at least `--non-converted-threshold` retained non-CpG cytosines (incomplete conversion). See [Filtering Conversion Errors](#filtering-conversion-errors). |
|
|
306
|
+
| `--non-converted-threshold` | Threshold for the non-converted filter. Default is 3. |
|
|
307
|
+
| `--filter-em-overconversion` | Drop EM-seq reads whose covered CpGs are all unmethylated and cover ≥ `--em-overconversion-min-cpgs` sites. See [Filtering Conversion Errors](#filtering-conversion-errors). |
|
|
308
|
+
| `--em-overconversion-min-cpgs` | Minimum covered CpG count before the EM over-conversion filter will drop a read. Default is 3. |
|
|
284
309
|
| `--verbose` | Enable detailed progress output including per-chromosome progress bars. |
|
|
285
310
|
| `--skip-cache` | Force regeneration of CpG site cache. Useful if you've modified the reference or chromosome list. |
|
|
286
311
|
| `--debug` | Enable extensive validation and debug output. Slower but useful for troubleshooting. |
|
|
@@ -289,6 +314,66 @@ Options:
|
|
|
289
314
|
| `--download-reference` | Download and cache a known reference genome. Choices: `hg38`, `hg19`, `mm10`, `T2T-CHM13`. Replaces `--reference-fasta`. |
|
|
290
315
|
| `--list-genomes` | List available reference genomes for `--download-reference` and exit. |
|
|
291
316
|
|
|
317
|
+
## Filtering Conversion Errors
|
|
318
|
+
|
|
319
|
+
Bisulfite and EM-seq library preparation can produce two kinds of per-read conversion errors that bias downstream methylation calls. bam2tensor provides two opt-in filters to drop affected reads at extraction time. Both are **default-off**, apply per read, and are recorded in the output `metadata.json` so downstream consumers know which filters were applied.
|
|
320
|
+
|
|
321
|
+
### `--filter-non-converted` — incomplete conversion
|
|
322
|
+
|
|
323
|
+
Ports the logic of [nebiolabs/mark-nonconverted-reads](https://github.com/nebiolabs/mark-nonconverted-reads). A read is dropped if it carries at least `--non-converted-threshold` (default 3) retained non-CpG cytosines, a signature of incomplete bisulfite or EM-seq conversion.
|
|
324
|
+
|
|
325
|
+
- **Bismark BAMs**: counted directly from the `XM` tag's uppercase `H`/`X`/`U` characters (retained cytosines in CHH/CHG/unknown contexts).
|
|
326
|
+
- **Biscuit / bwameth / gem3 BAMs**: counted by comparing the read to the reference via the `MD` tag (using pysam's `get_aligned_pairs(with_seq=True)`). SNPs — where the read's retained `C` sits over a reference base that isn't `C` — are excluded from the count, matching NEB's reference-validation step. No separate FASTA reload is required.
|
|
327
|
+
|
|
328
|
+
### `--filter-em-overconversion` — EM-seq fragment-level over-conversion
|
|
329
|
+
|
|
330
|
+
A heuristic inspired by [Loyfer et al. (bioRxiv 2026.03.24.713040)](https://www.biorxiv.org/content/10.64898/2026.03.24.713040v1). That paper shows EM-seq reproducibly produces ~1–2.5% of multi-CpG fragments that appear fully unmethylated across every covered CpG — a fragment-level artifact absent from WGBS and Oxford Nanopore. This filter drops any read whose covered CpGs are **all** called unmethylated *and* cover at least `--em-overconversion-min-cpgs` sites (default 3, the regime where the EM-seq artifact is clearly separable from WGBS in Loyfer et al. Fig. 1C).
|
|
331
|
+
|
|
332
|
+
The filter is a blunt instrument: it will also drop genuinely fully-unmethylated biological fragments at unmethylated markers. Enable it only when your downstream application (e.g., cfDNA deconvolution at constitutively methylated loci) can tolerate that trade-off.
|
|
333
|
+
|
|
334
|
+
### Usage
|
|
335
|
+
|
|
336
|
+
```bash
|
|
337
|
+
bam2tensor \
|
|
338
|
+
--input-path sample.bam \
|
|
339
|
+
--reference-fasta GRCh38.fa \
|
|
340
|
+
--genome-name hg38 \
|
|
341
|
+
--filter-non-converted \
|
|
342
|
+
--filter-em-overconversion
|
|
343
|
+
```
|
|
344
|
+
|
|
345
|
+
Filter parameters and enabled state are written to the output `metadata.json`:
|
|
346
|
+
|
|
347
|
+
```json
|
|
348
|
+
{
|
|
349
|
+
"filters": {
|
|
350
|
+
"non_converted_reads": {"enabled": true, "threshold": 3},
|
|
351
|
+
"em_overconversion": {"enabled": true, "min_cpgs": 3}
|
|
352
|
+
}
|
|
353
|
+
}
|
|
354
|
+
```
|
|
355
|
+
|
|
356
|
+
### Reproducibility note
|
|
357
|
+
|
|
358
|
+
The two filters differ in whether they can be replayed downstream without the source BAM:
|
|
359
|
+
|
|
360
|
+
- **`--filter-em-overconversion` is reproducible from the `.npz` alone.** The heuristic is a pure function of each row's CpG state values. A downstream consumer who receives an unfiltered `.npz` can replay the filter at analysis time:
|
|
361
|
+
|
|
362
|
+
```python
|
|
363
|
+
import scipy.sparse
|
|
364
|
+
mat = scipy.sparse.load_npz("sample.methylation.npz").tocsr()
|
|
365
|
+
min_cpgs = 3
|
|
366
|
+
kept_rows = []
|
|
367
|
+
for i in range(mat.shape[0]):
|
|
368
|
+
row = mat.data[mat.indptr[i]:mat.indptr[i + 1]]  # stored entries only; toarray() would turn uncovered CpGs into 0
|
|
369
|
+
covered = row[(row == 0) | (row == 1)] # drop -1 no-data
|
|
370
|
+
is_overconv = len(covered) >= min_cpgs and (covered == 0).all()
|
|
371
|
+
if not is_overconv:
|
|
372
|
+
kept_rows.append(i)
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
- **`--filter-non-converted` is *not* reproducible from the `.npz` alone.** It relies on retained non-CpG cytosines (or Bismark's `H`/`X`/`U`), which are never written to the matrix. If you need this filter, apply it at extraction time (or re-run bam2tensor against the original BAM).
|
|
376
|
+
|
|
292
377
|
## Inspecting Output Files
|
|
293
378
|
|
|
294
379
|
Use `bam2tensor-inspect` to view a summary of any `.methylation.npz` file without writing Python:
|
|
@@ -302,11 +387,15 @@ sample.methylation.npz
|
|
|
302
387
|
CpG sites: 28,217,448
|
|
303
388
|
Data points: 12,847,322 (sparsity: 99.97%)
|
|
304
389
|
Fragment len: median 167, mean 182, range [50, 600]
|
|
390
|
+
Filters: non-converted (>= 3 non-CpG Cs)
|
|
391
|
+
EM over-conversion (all-unmethylated, >= 3 CpGs)
|
|
305
392
|
CpG index CRC32: a1b2c3d4
|
|
306
|
-
bam2tensor: v2.
|
|
393
|
+
bam2tensor: v2.5
|
|
307
394
|
File size: 14.2 MB
|
|
308
395
|
```
|
|
309
396
|
|
|
397
|
+
When no filters were applied, the line reads `Filters: none`. Files produced by bam2tensor versions older than v2.5 omit the line entirely.
|
|
398
|
+
|
|
310
399
|
You can pass multiple files at once:
|
|
311
400
|
|
|
312
401
|
```bash
|
|
@@ -368,6 +457,7 @@ Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP arc
|
|
|
368
457
|
| `expected_chromosomes` | List of chromosomes included in the column mapping |
|
|
369
458
|
| `total_cpg_sites` | Total number of CpG columns in the matrix |
|
|
370
459
|
| `cpg_index_crc32` | CRC32 checksum of the CpG site positions (verifies identical column semantics) |
|
|
460
|
+
| `filters` | Nested dict recording which opt-in conversion-error filters were applied (`non_converted_reads`, `em_overconversion`) and their parameters. See [Filtering Conversion Errors](#filtering-conversion-errors). Added in v2.5. |
|
|
371
461
|
|
|
372
462
|
This metadata is ignored by `scipy.sparse.load_npz`, so existing code continues to work. To read it:
|
|
373
463
|
|
|
@@ -570,6 +660,10 @@ extract_methylation_data_from_bam(
|
|
|
570
660
|
input_bam: str, # Path to BAM file
|
|
571
661
|
genome_methylation_embedding: GenomeMethylationEmbedding, # Embedding object
|
|
572
662
|
quality_limit: int = 20, # Minimum MAPQ
|
|
663
|
+
filter_non_converted: bool = False, # Drop reads with retained non-CpG Cs
|
|
664
|
+
non_converted_threshold: int = 3, # Threshold for the above filter
|
|
665
|
+
filter_em_overconversion: bool = False, # Drop EM-seq fragment-level over-conversion reads
|
|
666
|
+
em_overconversion_min_cpgs: int = 3, # Min CpGs before applying the above filter
|
|
573
667
|
verbose: bool = False, # Enable verbose output
|
|
574
668
|
debug: bool = False # Enable debug output
|
|
575
669
|
) -> ExtractionResult
|
|
@@ -39,6 +39,7 @@
|
|
|
39
39
|
- [Custom Output Directory](#custom-output-directory)
|
|
40
40
|
- [Using a Custom Genome](#using-a-custom-genome)
|
|
41
41
|
- [Command-Line Options](#command-line-options)
|
|
42
|
+
- [Filtering Conversion Errors](#filtering-conversion-errors)
|
|
42
43
|
- [Inspecting Output Files](#inspecting-output-files)
|
|
43
44
|
- [Output Data Structure](#output-data-structure)
|
|
44
45
|
- [Per-Read Fragment Length (TLEN)](#per-read-fragment-length-tlen)
|
|
@@ -66,6 +67,7 @@
|
|
|
66
67
|
- **Caching**: CpG site indexing is cached to accelerate repeated runs on the same genome
|
|
67
68
|
- **Quality Filtering**: Configurable mapping quality thresholds
|
|
68
69
|
- **Per-Read Fragment Length**: Stores BAM TLEN (template length) alongside the methylation tensor for joint fragment-methylation analysis
|
|
70
|
+
- **Conversion-Error Filters**: Optional per-read filters for incomplete bisulfite/EM-seq conversion (ported from `nebiolabs/mark-nonconverted-reads`) and EM-seq fragment-level over-conversion (Loyfer et al. 2026)
|
|
69
71
|
|
|
70
72
|
## Requirements
|
|
71
73
|
|
|
@@ -223,6 +225,25 @@ Options:
|
|
|
223
225
|
determine CpG sites).
|
|
224
226
|
--quality-limit INTEGER Quality filter for aligned reads (default =
|
|
225
227
|
20)
|
|
228
|
+
--filter-non-converted Drop reads with >= --non-converted-threshold
|
|
229
|
+
retained non-CpG cytosines, the signature of
|
|
230
|
+
incomplete bisulfite/EM-seq conversion (port
|
|
231
|
+
of nebiolabs/mark-nonconverted-reads).
|
|
232
|
+
Default: off.
|
|
233
|
+
--non-converted-threshold INTEGER
|
|
234
|
+
Minimum count of retained non-CpG cytosines
|
|
235
|
+
to drop a read (default = 3, matches NEB
|
|
236
|
+
mark-nonconverted-reads).
|
|
237
|
+
--filter-em-overconversion Drop EM-seq reads whose covered CpGs are all
|
|
238
|
+
called unmethylated and cover at least --em-
|
|
239
|
+
overconversion-min-cpgs sites (heuristic for
|
|
240
|
+
the fragment-level over-conversion artifact
|
|
241
|
+
described in Loyfer et al. bioRxiv
|
|
242
|
+
2026.03.24.713040). Default: off.
|
|
243
|
+
--em-overconversion-min-cpgs INTEGER
|
|
244
|
+
Minimum covered CpG count required before
|
|
245
|
+
the EM over-conversion filter will drop a
|
|
246
|
+
read (default = 3).
|
|
226
247
|
--verbose Verbose output.
|
|
227
248
|
--skip-cache De-novo generate CpG sites (slow).
|
|
228
249
|
--debug Debug mode (extensive validity checking +
|
|
@@ -248,6 +269,10 @@ Options:
|
|
|
248
269
|
| `--expected-chromosomes` | Comma-separated list of chromosome names to process. Chromosomes not in this list are skipped. Defaults to human autosomes + sex chromosomes. |
|
|
249
270
|
| `--reference-fasta` | Path to the reference genome FASTA file. Must match the genome used for alignment. |
|
|
250
271
|
| `--quality-limit` | Minimum mapping quality score (MAPQ) for reads to be included. Default is 20. |
|
|
272
|
+
| `--filter-non-converted` | Drop reads carrying at least `--non-converted-threshold` retained non-CpG cytosines (incomplete conversion). See [Filtering Conversion Errors](#filtering-conversion-errors). |
|
|
273
|
+
| `--non-converted-threshold` | Threshold for the non-converted filter. Default is 3. |
|
|
274
|
+
| `--filter-em-overconversion` | Drop EM-seq reads whose covered CpGs are all unmethylated and cover ≥ `--em-overconversion-min-cpgs` sites. See [Filtering Conversion Errors](#filtering-conversion-errors). |
|
|
275
|
+
| `--em-overconversion-min-cpgs` | Minimum covered CpG count before the EM over-conversion filter will drop a read. Default is 3. |
|
|
251
276
|
| `--verbose` | Enable detailed progress output including per-chromosome progress bars. |
|
|
252
277
|
| `--skip-cache` | Force regeneration of CpG site cache. Useful if you've modified the reference or chromosome list. |
|
|
253
278
|
| `--debug` | Enable extensive validation and debug output. Slower but useful for troubleshooting. |
|
|
@@ -256,6 +281,66 @@ Options:
|
|
|
256
281
|
| `--download-reference` | Download and cache a known reference genome. Choices: `hg38`, `hg19`, `mm10`, `T2T-CHM13`. Replaces `--reference-fasta`. |
|
|
257
282
|
| `--list-genomes` | List available reference genomes for `--download-reference` and exit. |
|
|
258
283
|
|
|
284
|
+
## Filtering Conversion Errors
|
|
285
|
+
|
|
286
|
+
Bisulfite and EM-seq library preparation can produce two kinds of per-read conversion errors that bias downstream methylation calls. bam2tensor provides two opt-in filters to drop affected reads at extraction time. Both are **default-off**, apply per read, and are recorded in the output `metadata.json` so downstream consumers know which filters were applied.
|
|
287
|
+
|
|
288
|
+
### `--filter-non-converted` — incomplete conversion
|
|
289
|
+
|
|
290
|
+
Ports the logic of [nebiolabs/mark-nonconverted-reads](https://github.com/nebiolabs/mark-nonconverted-reads). A read is dropped if it carries at least `--non-converted-threshold` (default 3) retained non-CpG cytosines, a signature of incomplete bisulfite or EM-seq conversion.
|
|
291
|
+
|
|
292
|
+
- **Bismark BAMs**: counted directly from the `XM` tag's uppercase `H`/`X`/`U` characters (retained cytosines in CHH/CHG/unknown contexts).
|
|
293
|
+
- **Biscuit / bwameth / gem3 BAMs**: counted by comparing the read to the reference via the `MD` tag (using pysam's `get_aligned_pairs(with_seq=True)`). SNPs — where the read's retained `C` sits over a reference base that isn't `C` — are excluded from the count, matching NEB's reference-validation step. No separate FASTA reload is required.
|
|
294
|
+
|
|
295
|
+
### `--filter-em-overconversion` — EM-seq fragment-level over-conversion
|
|
296
|
+
|
|
297
|
+
A heuristic inspired by [Loyfer et al. (bioRxiv 2026.03.24.713040)](https://www.biorxiv.org/content/10.64898/2026.03.24.713040v1). That paper shows EM-seq reproducibly produces ~1–2.5% of multi-CpG fragments that appear fully unmethylated across every covered CpG — a fragment-level artifact absent from WGBS and Oxford Nanopore. This filter drops any read whose covered CpGs are **all** called unmethylated *and* cover at least `--em-overconversion-min-cpgs` sites (default 3, the regime where the EM-seq artifact is clearly separable from WGBS in Loyfer et al. Fig. 1C).
|
|
298
|
+
|
|
299
|
+
The filter is a blunt instrument: it will also drop genuinely fully-unmethylated biological fragments at unmethylated markers. Enable it only when your downstream application (e.g., cfDNA deconvolution at constitutively methylated loci) can tolerate that trade-off.
|
|
300
|
+
|
|
301
|
+
### Usage
|
|
302
|
+
|
|
303
|
+
```bash
|
|
304
|
+
bam2tensor \
|
|
305
|
+
--input-path sample.bam \
|
|
306
|
+
--reference-fasta GRCh38.fa \
|
|
307
|
+
--genome-name hg38 \
|
|
308
|
+
--filter-non-converted \
|
|
309
|
+
--filter-em-overconversion
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
Filter parameters and enabled state are written to the output `metadata.json`:
|
|
313
|
+
|
|
314
|
+
```json
|
|
315
|
+
{
|
|
316
|
+
"filters": {
|
|
317
|
+
"non_converted_reads": {"enabled": true, "threshold": 3},
|
|
318
|
+
"em_overconversion": {"enabled": true, "min_cpgs": 3}
|
|
319
|
+
}
|
|
320
|
+
}
|
|
321
|
+
```
|
|
322
|
+
|
|
323
|
+
### Reproducibility note
|
|
324
|
+
|
|
325
|
+
The two filters differ in whether they can be replayed downstream without the source BAM:
|
|
326
|
+
|
|
327
|
+
- **`--filter-em-overconversion` is reproducible from the `.npz` alone.** The heuristic is a pure function of each row's CpG state values. A downstream consumer who receives an unfiltered `.npz` can replay the filter at analysis time:
|
|
328
|
+
|
|
329
|
+
```python
|
|
330
|
+
import scipy.sparse
|
|
331
|
+
mat = scipy.sparse.load_npz("sample.methylation.npz").tocsr()
|
|
332
|
+
min_cpgs = 3
|
|
333
|
+
kept_rows = []
|
|
334
|
+
for i in range(mat.shape[0]):
|
|
335
|
+
row = mat.data[mat.indptr[i]:mat.indptr[i + 1]]  # stored entries only; toarray() would turn uncovered CpGs into 0
|
|
336
|
+
covered = row[(row == 0) | (row == 1)] # drop -1 no-data
|
|
337
|
+
is_overconv = len(covered) >= min_cpgs and (covered == 0).all()
|
|
338
|
+
if not is_overconv:
|
|
339
|
+
kept_rows.append(i)
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
- **`--filter-non-converted` is *not* reproducible from the `.npz` alone.** It relies on retained non-CpG cytosines (or Bismark's `H`/`X`/`U`), which are never written to the matrix. If you need this filter, apply it at extraction time (or re-run bam2tensor against the original BAM).
|
|
343
|
+
|
|
259
344
|
## Inspecting Output Files
|
|
260
345
|
|
|
261
346
|
Use `bam2tensor-inspect` to view a summary of any `.methylation.npz` file without writing Python:
|
|
@@ -269,11 +354,15 @@ sample.methylation.npz
|
|
|
269
354
|
CpG sites: 28,217,448
|
|
270
355
|
Data points: 12,847,322 (sparsity: 99.97%)
|
|
271
356
|
Fragment len: median 167, mean 182, range [50, 600]
|
|
357
|
+
Filters: non-converted (>= 3 non-CpG Cs)
|
|
358
|
+
EM over-conversion (all-unmethylated, >= 3 CpGs)
|
|
272
359
|
CpG index CRC32: a1b2c3d4
|
|
273
|
-
bam2tensor: v2.
|
|
360
|
+
bam2tensor: v2.5
|
|
274
361
|
File size: 14.2 MB
|
|
275
362
|
```
|
|
276
363
|
|
|
364
|
+
When no filters were applied, the line reads `Filters: none`. Files produced by bam2tensor versions older than v2.5 omit the line entirely.
|
|
365
|
+
|
|
277
366
|
You can pass multiple files at once:
|
|
278
367
|
|
|
279
368
|
```bash
|
|
@@ -335,6 +424,7 @@ Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP arc
|
|
|
335
424
|
| `expected_chromosomes` | List of chromosomes included in the column mapping |
|
|
336
425
|
| `total_cpg_sites` | Total number of CpG columns in the matrix |
|
|
337
426
|
| `cpg_index_crc32` | CRC32 checksum of the CpG site positions (verifies identical column semantics) |
|
|
427
|
+
| `filters` | Nested dict recording which opt-in conversion-error filters were applied (`non_converted_reads`, `em_overconversion`) and their parameters. See [Filtering Conversion Errors](#filtering-conversion-errors). Added in v2.5. |
|
|
338
428
|
|
|
339
429
|
This metadata is ignored by `scipy.sparse.load_npz`, so existing code continues to work. To read it:
|
|
340
430
|
|
|
@@ -537,6 +627,10 @@ extract_methylation_data_from_bam(
|
|
|
537
627
|
input_bam: str, # Path to BAM file
|
|
538
628
|
genome_methylation_embedding: GenomeMethylationEmbedding, # Embedding object
|
|
539
629
|
quality_limit: int = 20, # Minimum MAPQ
|
|
630
|
+
filter_non_converted: bool = False, # Drop reads with retained non-CpG Cs
|
|
631
|
+
non_converted_threshold: int = 3, # Threshold for the above filter
|
|
632
|
+
filter_em_overconversion: bool = False, # Drop EM-seq fragment-level over-conversion reads
|
|
633
|
+
em_overconversion_min_cpgs: int = 3, # Min CpGs before applying the above filter
|
|
540
634
|
verbose: bool = False, # Enable verbose output
|
|
541
635
|
debug: bool = False # Enable debug output
|
|
542
636
|
) -> ExtractionResult
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "bam2tensor"
|
|
3
|
-
version = "2.5"
|
|
3
|
+
version = "2.6"
|
|
4
4
|
description = "Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation"
|
|
5
5
|
authors = [{ name = "Nick Semenkovich", email = "semenko@alum.mit.edu" }]
|
|
6
6
|
license = "MIT"
|
|
@@ -706,25 +706,36 @@ def extract_methylation_data_from_bam(
|
|
|
706
706
|
|
|
707
707
|
# get_aligned_pairs returns a list of tuples of (read_pos, ref_pos)
|
|
708
708
|
# We filter this to only include the specific CpG sites from above
|
|
709
|
+
aligned_pairs = aligned_segment.get_aligned_pairs(matches_only=True)
|
|
709
710
|
this_segment_cpgs = [
|
|
710
|
-
e
|
|
711
|
-
for e in aligned_segment.get_aligned_pairs(matches_only=True)
|
|
712
|
-
if e[1] + 1 in cpgs_within_read_set
|
|
711
|
+
e for e in aligned_pairs if e[1] + 1 in cpgs_within_read_set
|
|
713
712
|
]
|
|
714
713
|
|
|
715
714
|
# If no CpGs covered (after filtering for matches only), skip
|
|
716
715
|
if not this_segment_cpgs:
|
|
717
716
|
continue
|
|
718
717
|
|
|
719
|
-
#
|
|
720
|
-
#
|
|
721
|
-
#
|
|
722
|
-
#
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
718
|
+
# OT (forward parent): methylation-informative base sits on the
|
|
719
|
+
# top-strand C at ref_pos. BAM SEQ is reference-oriented, so
|
|
720
|
+
# C = methylated, T = unmethylated.
|
|
721
|
+
# OB (reverse parent): the original bottom-strand C lives at
|
|
722
|
+
# ref_pos + 1 (the G of the top-strand CG). After the aligner
|
|
723
|
+
# reverse-complements into reference orientation for BAM
|
|
724
|
+
# storage, that base reads G = methylated, A = unmethylated.
|
|
725
|
+
# At ref_pos itself, BAM always shows C (the unaffected
|
|
726
|
+
# bottom-strand G reverse-complemented), which is why reading
|
|
727
|
+
# ref_pos on OB reads collapses every CpG to "methylated".
|
|
728
|
+
query_sequence = aligned_segment.query_sequence
|
|
729
|
+
if bisulfite_parent_strand_is_reverse:
|
|
730
|
+
methylated_base, unmethylated_base = "G", "A"
|
|
731
|
+
# Indels at the CpG boundary mean ref_pos + 1 isn't always
|
|
732
|
+
# query_pos + 1 — go through a ref -> query map.
|
|
733
|
+
ref_to_query: dict[int, int] = {ref: q for q, ref in aligned_pairs}
|
|
734
|
+
else:
|
|
735
|
+
methylated_base, unmethylated_base = "C", "T"
|
|
736
|
+
ref_to_query = {}
|
|
727
737
|
|
|
738
|
+
for query_pos, ref_pos in this_segment_cpgs:
|
|
728
739
|
read_cpg_cols.append(
|
|
729
740
|
genome_methylation_embedding.genomic_position_to_embedding(
|
|
730
741
|
chrom,
|
|
@@ -732,21 +743,34 @@ def extract_methylation_data_from_bam(
|
|
|
732
743
|
)
|
|
733
744
|
)
|
|
734
745
|
|
|
735
|
-
if
|
|
736
|
-
|
|
746
|
+
if bisulfite_parent_strand_is_reverse:
|
|
747
|
+
target_query_pos = ref_to_query.get(ref_pos + 1)
|
|
748
|
+
if target_query_pos is None:
|
|
749
|
+
read_cpg_data.append(-1)
|
|
750
|
+
if debug:
|
|
751
|
+
print(f"\t{query_pos} {ref_pos} [Indel at OB target]")
|
|
752
|
+
continue
|
|
753
|
+
query_base = query_sequence[target_query_pos] # type: ignore[index]
|
|
754
|
+
else:
|
|
755
|
+
query_base = query_sequence[query_pos] # type: ignore[index]
|
|
756
|
+
|
|
757
|
+
if query_base == methylated_base:
|
|
737
758
|
read_cpg_data.append(1)
|
|
738
759
|
if debug:
|
|
739
|
-
print(
|
|
740
|
-
|
|
760
|
+
print(
|
|
761
|
+
f"\t{query_pos} {ref_pos} {methylated_base}->{query_base} [Methylated]"
|
|
762
|
+
)
|
|
763
|
+
elif query_base == unmethylated_base:
|
|
741
764
|
read_cpg_data.append(0)
|
|
742
|
-
# Unmethylated
|
|
743
765
|
if debug:
|
|
744
|
-
print(
|
|
766
|
+
print(
|
|
767
|
+
f"\t{query_pos} {ref_pos} {methylated_base}->{query_base} [Unmethylated]"
|
|
768
|
+
)
|
|
745
769
|
else:
|
|
746
770
|
read_cpg_data.append(-1)
|
|
747
771
|
if debug:
|
|
748
772
|
print(
|
|
749
|
-
f"\t{query_pos} {ref_pos}
|
|
773
|
+
f"\t{query_pos} {ref_pos} {methylated_base}->{query_base} [Unknown! SNV? Indel?]"
|
|
750
774
|
)
|
|
751
775
|
|
|
752
776
|
if filter_em_overconversion and is_em_overconversion_read(
|
|
@@ -1074,6 +1074,133 @@ def test_biscuit_debug_mode_ct_bases(tmp_path):
|
|
|
1074
1074
|
assert result.matrix.shape[0] == 1
|
|
1075
1075
|
|
|
1076
1076
|
|
|
1077
|
+
def test_biscuit_ob_strand_methylation_extraction(tmp_path):
|
|
1078
|
+
"""Biscuit/bwameth OB-strand (YD=r, is_reverse=True) reads must read the
|
|
1079
|
+
methylation-informative base at ref_pos+1 (G=methylated, A=unmethylated),
|
|
1080
|
+
not ref_pos (which is always C in BAM SEQ regardless of methylation state).
|
|
1081
|
+
|
|
1082
|
+
Regression for the bug where OB reads were extracted with C/T logic at
|
|
1083
|
+
ref_pos and thus scored as universally methylated.
|
|
1084
|
+
"""
|
|
1085
|
+
fasta_path = tmp_path / "ref.fa"
|
|
1086
|
+
# CpGs at 1-based positions 10, 21 (top-strand C at 0-based 9, 20; G at 10, 21).
|
|
1087
|
+
seq = "N" * 9 + "CG" + "N" * 9 + "CG" + "N" * 128
|
|
1088
|
+
with open(fasta_path, "w") as f:
|
|
1089
|
+
f.write(">chr1\n" + seq + "\n")
|
|
1090
|
+
|
|
1091
|
+
emb = embedding.GenomeMethylationEmbedding(
|
|
1092
|
+
"test_biscuit_ob",
|
|
1093
|
+
expected_chromosomes=["chr1"],
|
|
1094
|
+
fasta_source=str(fasta_path),
|
|
1095
|
+
skip_cache=True,
|
|
1096
|
+
)
|
|
1097
|
+
|
|
1098
|
+
bam_path = tmp_path / "test.bam"
|
|
1099
|
+
header = {"HD": {"VN": "1.0"}, "SQ": [{"LN": len(seq), "SN": "chr1"}]}
|
|
1100
|
+
# OB read: BAM SEQ is reference-oriented. The C of each top-strand CG is
|
|
1101
|
+
# always C in BAM (bottom-strand G reverse-complemented). The G of each
|
|
1102
|
+
# top-strand CG is what carries the methylation signal: G=methylated,
|
|
1103
|
+
# A=unmethylated.
|
|
1104
|
+
read_seq = list("N" * len(seq))
|
|
1105
|
+
read_seq[9] = "C" # top-strand C of CpG#1 (always C in BAM for OB)
|
|
1106
|
+
read_seq[10] = "G" # methylated → G at ref_pos+1
|
|
1107
|
+
read_seq[20] = "C" # top-strand C of CpG#2 (always C in BAM for OB)
|
|
1108
|
+
read_seq[21] = "A" # unmethylated → A at ref_pos+1
|
|
1109
|
+
with pysam.AlignmentFile(bam_path, "wb", header=header) as out_bam:
|
|
1110
|
+
a = pysam.AlignedSegment()
|
|
1111
|
+
a.query_name = "ob_read"
|
|
1112
|
+
a.query_sequence = "".join(read_seq)
|
|
1113
|
+
a.flag = 0x10 # reverse-mapped
|
|
1114
|
+
a.reference_id = 0
|
|
1115
|
+
a.reference_start = 0
|
|
1116
|
+
a.mapping_quality = 60
|
|
1117
|
+
a.cigartuples = [(0, len(seq))]
|
|
1118
|
+
a.set_tag("MD", str(len(seq)))
|
|
1119
|
+
a.set_tag("YD", "r") # OB / reverse parent strand
|
|
1120
|
+
out_bam.write(a)
|
|
1121
|
+
pysam.index(str(bam_path))
|
|
1122
|
+
|
|
1123
|
+
result = functions.extract_methylation_data_from_bam(
|
|
1124
|
+
input_bam=str(bam_path),
|
|
1125
|
+
genome_methylation_embedding=emb,
|
|
1126
|
+
)
|
|
1127
|
+
assert result.matrix.shape[0] == 1
|
|
1128
|
+
assert result.matrix.nnz == 2
|
|
1129
|
+
data = sorted(result.matrix.data)
|
|
1130
|
+
assert data == [0, 1], (
|
|
1131
|
+
f"Expected one methylated (1) and one unmethylated (0) call, got {data}. "
|
|
1132
|
+
"If this is all 1s, the OB-strand base lookup regression has returned."
|
|
1133
|
+
)
|
|
1134
|
+
|
|
1135
|
+
|
|
1136
|
+
def test_biscuit_ot_and_ob_share_cpg_columns(tmp_path):
|
|
1137
|
+
"""OT and OB reads at the same CpG must land in the same embedding column
|
|
1138
|
+
(canonical CpG site = top-strand C, ref_pos+1 in 1-based coordinates).
|
|
1139
|
+
"""
|
|
1140
|
+
fasta_path = tmp_path / "ref.fa"
|
|
1141
|
+
seq = "N" * 9 + "CG" + "N" * 9 + "CG" + "N" * 128
|
|
1142
|
+
with open(fasta_path, "w") as f:
|
|
1143
|
+
f.write(">chr1\n" + seq + "\n")
|
|
1144
|
+
|
|
1145
|
+
emb = embedding.GenomeMethylationEmbedding(
|
|
1146
|
+
"test_biscuit_ot_ob_columns",
|
|
1147
|
+
expected_chromosomes=["chr1"],
|
|
1148
|
+
fasta_source=str(fasta_path),
|
|
1149
|
+
skip_cache=True,
|
|
1150
|
+
)
|
|
1151
|
+
|
|
1152
|
+
bam_path = tmp_path / "test.bam"
|
|
1153
|
+
header = {"HD": {"VN": "1.0"}, "SQ": [{"LN": len(seq), "SN": "chr1"}]}
|
|
1154
|
+
# OT read: C at top-strand C positions = methylated at both CpGs.
|
|
1155
|
+
ot_seq = list("N" * len(seq))
|
|
1156
|
+
ot_seq[9] = "C"
|
|
1157
|
+
ot_seq[20] = "C"
|
|
1158
|
+
# OB read (BAM in reference orientation): G at top-strand G positions
|
|
1159
|
+
# = methylated at both CpGs.
|
|
1160
|
+
ob_seq = list("N" * len(seq))
|
|
1161
|
+
ob_seq[9] = "C"
|
|
1162
|
+
ob_seq[10] = "G"
|
|
1163
|
+
ob_seq[20] = "C"
|
|
1164
|
+
ob_seq[21] = "G"
|
|
1165
|
+
with pysam.AlignmentFile(bam_path, "wb", header=header) as out_bam:
|
|
1166
|
+
a = pysam.AlignedSegment()
|
|
1167
|
+
a.query_name = "ot_read"
|
|
1168
|
+
a.query_sequence = "".join(ot_seq)
|
|
1169
|
+
a.flag = 0
|
|
1170
|
+
a.reference_id = 0
|
|
1171
|
+
a.reference_start = 0
|
|
1172
|
+
a.mapping_quality = 60
|
|
1173
|
+
a.cigartuples = [(0, len(seq))]
|
|
1174
|
+
a.set_tag("MD", str(len(seq)))
|
|
1175
|
+
a.set_tag("YD", "f")
|
|
1176
|
+
out_bam.write(a)
|
|
1177
|
+
b = pysam.AlignedSegment()
|
|
1178
|
+
b.query_name = "ob_read"
|
|
1179
|
+
b.query_sequence = "".join(ob_seq)
|
|
1180
|
+
b.flag = 0x10
|
|
1181
|
+
b.reference_id = 0
|
|
1182
|
+
b.reference_start = 0
|
|
1183
|
+
b.mapping_quality = 60
|
|
1184
|
+
b.cigartuples = [(0, len(seq))]
|
|
1185
|
+
b.set_tag("MD", str(len(seq)))
|
|
1186
|
+
b.set_tag("YD", "r")
|
|
1187
|
+
out_bam.write(b)
|
|
1188
|
+
pysam.index(str(bam_path))
|
|
1189
|
+
|
|
1190
|
+
result = functions.extract_methylation_data_from_bam(
|
|
1191
|
+
input_bam=str(bam_path),
|
|
1192
|
+
genome_methylation_embedding=emb,
|
|
1193
|
+
)
|
|
1194
|
+
assert result.matrix.shape[0] == 2
|
|
1195
|
+
# Both reads call both CpGs methylated, so we expect two reads × two CpGs
|
|
1196
|
+
# in the same two columns, all with value 1.
|
|
1197
|
+
coo = result.matrix.tocoo()
|
|
1198
|
+
ot_cols = sorted(int(c) for r, c in zip(coo.row, coo.col) if r == 0)
|
|
1199
|
+
ob_cols = sorted(int(c) for r, c in zip(coo.row, coo.col) if r == 1)
|
|
1200
|
+
assert ot_cols == ob_cols, f"OT and OB columns diverged: OT={ot_cols} OB={ob_cols}"
|
|
1201
|
+
assert list(result.matrix.data) == [1, 1, 1, 1]
|
|
1202
|
+
|
|
1203
|
+
|
|
1077
1204
|
# ======================================================================
|
|
1078
1205
|
# XB tag (gem3/Blueprint) extraction tests
|
|
1079
1206
|
# ======================================================================
|
|
@@ -122,7 +122,7 @@ def test_inspect_end_to_end(tmp_path) -> None:
|
|
|
122
122
|
assert result.exit_code == 0
|
|
123
123
|
assert "test" in result.output # genome_name
|
|
124
124
|
assert "CpG index CRC32:" in result.output
|
|
125
|
-
assert "v2.
|
|
125
|
+
assert "v2.6" in result.output
|
|
126
126
|
|
|
127
127
|
|
|
128
128
|
def test_format_size_bytes() -> None:
|