bam2tensor 2.4__tar.gz → 2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bam2tensor-2.4 → bam2tensor-2.6}/.github/workflows/docs.yml +1 -1
- {bam2tensor-2.4 → bam2tensor-2.6}/.github/workflows/release.yml +3 -3
- {bam2tensor-2.4 → bam2tensor-2.6}/CLAUDE.md +1 -1
- {bam2tensor-2.4 → bam2tensor-2.6}/PKG-INFO +96 -2
- {bam2tensor-2.4 → bam2tensor-2.6}/README.md +95 -1
- {bam2tensor-2.4 → bam2tensor-2.6}/pyproject.toml +1 -1
- {bam2tensor-2.4 → bam2tensor-2.6}/src/bam2tensor/__init__.py +1 -1
- {bam2tensor-2.4 → bam2tensor-2.6}/src/bam2tensor/__main__.py +76 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/src/bam2tensor/functions.py +295 -58
- {bam2tensor-2.4 → bam2tensor-2.6}/src/bam2tensor/inspect.py +19 -0
- bam2tensor-2.6/tests/test_filters.py +568 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/tests/test_functions.py +127 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/tests/test_inspect.py +3 -3
- {bam2tensor-2.4 → bam2tensor-2.6}/uv.lock +131 -115
- {bam2tensor-2.4 → bam2tensor-2.6}/.darglint +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/.editorconfig +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/.gitattributes +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/.github/actions/setup-env/action.yml +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/.github/dependabot.yml +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/.github/labels.yml +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/.github/release-drafter.yml +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/.github/workflows/constraints.txt +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/.github/workflows/labeler.yml +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/.github/workflows/tests.yml +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/.gitignore +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/.pre-commit-config.yaml +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/CONTRIBUTING.md +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/LICENSE +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/SECURITY.md +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/docs/Makefile +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/docs/conf.py +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/docs/contributing.md +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/docs/index.md +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/docs/license.md +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/docs/logo/604669_dna turning into math, computer _xl-1024-v1-0.png +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/docs/logo/bam2tensor-logo.afdesign +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/docs/logo/bam2tensor-logo.png +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/docs/make.bat +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/docs/nano-banana-overview-shrunk.png +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/docs/reference.md +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/docs/templates/package.rst_t +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/noxfile.py +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/src/bam2tensor/embedding.py +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/src/bam2tensor/metadata.py +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/src/bam2tensor/py.typed +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/src/bam2tensor/reference.py +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/tests/__init__.py +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/tests/test_duplication.py +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/tests/test_embedding.py +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/tests/test_fasta.fa +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/tests/test_main.py +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/tests/test_metadata.py +0 -0
- {bam2tensor-2.4 → bam2tensor-2.6}/tests/test_reference.py +0 -0
|
@@ -67,16 +67,16 @@ jobs:
|
|
|
67
67
|
|
|
68
68
|
- name: Publish package on PyPI
|
|
69
69
|
if: steps.check-version.outputs.tag || steps.check-tag.outputs.tag
|
|
70
|
-
uses: pypa/gh-action-pypi-publish@v1.
|
|
70
|
+
uses: pypa/gh-action-pypi-publish@v1.14.0
|
|
71
71
|
|
|
72
72
|
- name: Publish package on TestPyPI
|
|
73
73
|
if: (!steps.check-version.outputs.tag && !steps.check-tag.outputs.tag)
|
|
74
|
-
uses: pypa/gh-action-pypi-publish@v1.
|
|
74
|
+
uses: pypa/gh-action-pypi-publish@v1.14.0
|
|
75
75
|
with:
|
|
76
76
|
repository-url: https://test.pypi.org/legacy/
|
|
77
77
|
|
|
78
78
|
- name: Publish the release notes
|
|
79
|
-
uses: release-drafter/release-drafter@v7.
|
|
79
|
+
uses: release-drafter/release-drafter@v7.2.1
|
|
80
80
|
with:
|
|
81
81
|
publish: ${{ steps.check-version.outputs.tag != '' || steps.check-tag.outputs.tag != '' }}
|
|
82
82
|
tag: ${{ steps.check-version.outputs.tag || steps.check-tag.outputs.tag }}
|
|
@@ -40,7 +40,7 @@ uv run mypy src
|
|
|
40
40
|
|
|
41
41
|
```
|
|
42
42
|
src/bam2tensor/
|
|
43
|
-
__init__.py # Package version (2.
|
|
43
|
+
__init__.py # Package version (2.6)
|
|
44
44
|
__main__.py # Click CLI entry point (bam2tensor command)
|
|
45
45
|
inspect.py # Inspect CLI entry point (bam2tensor-inspect command)
|
|
46
46
|
embedding.py # GenomeMethylationEmbedding class (FASTA parsing, CpG indexing)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bam2tensor
|
|
3
|
-
Version: 2.4
|
|
3
|
+
Version: 2.6
|
|
4
4
|
Summary: Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation
|
|
5
5
|
Project-URL: Homepage, https://github.com/mcwdsi/bam2tensor
|
|
6
6
|
Project-URL: Repository, https://github.com/mcwdsi/bam2tensor
|
|
@@ -72,6 +72,7 @@ Description-Content-Type: text/markdown
|
|
|
72
72
|
- [Custom Output Directory](#custom-output-directory)
|
|
73
73
|
- [Using a Custom Genome](#using-a-custom-genome)
|
|
74
74
|
- [Command-Line Options](#command-line-options)
|
|
75
|
+
- [Filtering Conversion Errors](#filtering-conversion-errors)
|
|
75
76
|
- [Inspecting Output Files](#inspecting-output-files)
|
|
76
77
|
- [Output Data Structure](#output-data-structure)
|
|
77
78
|
- [Per-Read Fragment Length (TLEN)](#per-read-fragment-length-tlen)
|
|
@@ -99,6 +100,7 @@ Description-Content-Type: text/markdown
|
|
|
99
100
|
- **Caching**: CpG site indexing is cached to accelerate repeated runs on the same genome
|
|
100
101
|
- **Quality Filtering**: Configurable mapping quality thresholds
|
|
101
102
|
- **Per-Read Fragment Length**: Stores BAM TLEN (template length) alongside the methylation tensor for joint fragment-methylation analysis
|
|
103
|
+
- **Conversion-Error Filters**: Optional per-read filters for incomplete bisulfite/EM-seq conversion (ported from `nebiolabs/mark-nonconverted-reads`) and EM-seq fragment-level over-conversion (Loyfer et al. 2026)
|
|
102
104
|
|
|
103
105
|
## Requirements
|
|
104
106
|
|
|
@@ -256,6 +258,25 @@ Options:
|
|
|
256
258
|
determine CpG sites).
|
|
257
259
|
--quality-limit INTEGER Quality filter for aligned reads (default =
|
|
258
260
|
20)
|
|
261
|
+
--filter-non-converted Drop reads with >= --non-converted-threshold
|
|
262
|
+
retained non-CpG cytosines, the signature of
|
|
263
|
+
incomplete bisulfite/EM-seq conversion (port
|
|
264
|
+
of nebiolabs/mark-nonconverted-reads).
|
|
265
|
+
Default: off.
|
|
266
|
+
--non-converted-threshold INTEGER
|
|
267
|
+
Minimum count of retained non-CpG cytosines
|
|
268
|
+
to drop a read (default = 3, matches NEB
|
|
269
|
+
mark-nonconverted-reads).
|
|
270
|
+
--filter-em-overconversion Drop EM-seq reads whose covered CpGs are all
|
|
271
|
+
called unmethylated and cover at least --em-
|
|
272
|
+
overconversion-min-cpgs sites (heuristic for
|
|
273
|
+
the fragment-level over-conversion artifact
|
|
274
|
+
described in Loyfer et al. bioRxiv
|
|
275
|
+
2026.03.24.713040). Default: off.
|
|
276
|
+
--em-overconversion-min-cpgs INTEGER
|
|
277
|
+
Minimum covered CpG count required before
|
|
278
|
+
the EM over-conversion filter will drop a
|
|
279
|
+
read (default = 3).
|
|
259
280
|
--verbose Verbose output.
|
|
260
281
|
--skip-cache De-novo generate CpG sites (slow).
|
|
261
282
|
--debug Debug mode (extensive validity checking +
|
|
@@ -281,6 +302,10 @@ Options:
|
|
|
281
302
|
| `--expected-chromosomes` | Comma-separated list of chromosome names to process. Chromosomes not in this list are skipped. Defaults to human autosomes + sex chromosomes. |
|
|
282
303
|
| `--reference-fasta` | Path to the reference genome FASTA file. Must match the genome used for alignment. |
|
|
283
304
|
| `--quality-limit` | Minimum mapping quality score (MAPQ) for reads to be included. Default is 20. |
|
|
305
|
+
| `--filter-non-converted` | Drop reads with at least `--non-converted-threshold` retained non-CpG cytosines (incomplete conversion). See [Filtering Conversion Errors](#filtering-conversion-errors). |
|
|
306
|
+
| `--non-converted-threshold` | Minimum number of retained non-CpG cytosines at which `--filter-non-converted` drops a read (inclusive). Default is 3. |
|
|
307
|
+
| `--filter-em-overconversion` | Drop EM-seq reads whose covered CpGs are all unmethylated and cover ≥ `--em-overconversion-min-cpgs` sites. See [Filtering Conversion Errors](#filtering-conversion-errors). |
|
|
308
|
+
| `--em-overconversion-min-cpgs` | Minimum covered CpG count before the EM over-conversion filter will drop a read. Default is 3. |
|
|
284
309
|
| `--verbose` | Enable detailed progress output including per-chromosome progress bars. |
|
|
285
310
|
| `--skip-cache` | Force regeneration of CpG site cache. Useful if you've modified the reference or chromosome list. |
|
|
286
311
|
| `--debug` | Enable extensive validation and debug output. Slower but useful for troubleshooting. |
|
|
@@ -289,6 +314,66 @@ Options:
|
|
|
289
314
|
| `--download-reference` | Download and cache a known reference genome. Choices: `hg38`, `hg19`, `mm10`, `T2T-CHM13`. Replaces `--reference-fasta`. |
|
|
290
315
|
| `--list-genomes` | List available reference genomes for `--download-reference` and exit. |
|
|
291
316
|
|
|
317
|
+
## Filtering Conversion Errors
|
|
318
|
+
|
|
319
|
+
Bisulfite and EM-seq library preparation can produce two kinds of per-read conversion errors that bias downstream methylation calls. bam2tensor provides two opt-in filters to drop affected reads at extraction time. Both are **default-off**, apply per read, and are recorded in the output `metadata.json` so downstream consumers know which filters were applied.
|
|
320
|
+
|
|
321
|
+
### `--filter-non-converted` — incomplete conversion
|
|
322
|
+
|
|
323
|
+
Ports the logic of [nebiolabs/mark-nonconverted-reads](https://github.com/nebiolabs/mark-nonconverted-reads). A read is dropped if it carries at least `--non-converted-threshold` (default 3) retained non-CpG cytosines, a signature of incomplete bisulfite or EM-seq conversion.
|
|
324
|
+
|
|
325
|
+
- **Bismark BAMs**: counted directly from the `XM` tag's uppercase `H`/`X`/`U` characters (retained cytosines in CHH/CHG/unknown contexts).
|
|
326
|
+
- **Biscuit / bwameth / gem3 BAMs**: counted by comparing the read to the reference via the `MD` tag (using pysam's `get_aligned_pairs(with_seq=True)`). SNPs — where the read's retained `C` sits over a reference base that isn't `C` — are excluded from the count, matching NEB's reference-validation step. No separate FASTA reload is required.
|
|
327
|
+
|
|
328
|
+
### `--filter-em-overconversion` — EM-seq fragment-level over-conversion
|
|
329
|
+
|
|
330
|
+
A heuristic inspired by [Loyfer et al. (bioRxiv 2026.03.24.713040)](https://www.biorxiv.org/content/10.64898/2026.03.24.713040v1). That paper shows EM-seq reproducibly produces ~1–2.5% of multi-CpG fragments that appear fully unmethylated across every covered CpG — a fragment-level artifact absent from WGBS and Oxford Nanopore. This filter drops any read whose covered CpGs are **all** called unmethylated *and* cover at least `--em-overconversion-min-cpgs` sites (default 3, the regime where the EM-seq artifact is clearly separable from WGBS in Loyfer et al. Fig. 1C).
|
|
331
|
+
|
|
332
|
+
The filter is a blunt instrument: it will also drop genuinely fully-unmethylated biological fragments at unmethylated markers. Enable it only when your downstream application (e.g., cfDNA deconvolution at constitutively methylated loci) can tolerate that trade-off.
|
|
333
|
+
|
|
334
|
+
### Usage
|
|
335
|
+
|
|
336
|
+
```bash
|
|
337
|
+
bam2tensor \
|
|
338
|
+
--input-path sample.bam \
|
|
339
|
+
--reference-fasta GRCh38.fa \
|
|
340
|
+
--genome-name hg38 \
|
|
341
|
+
--filter-non-converted \
|
|
342
|
+
--filter-em-overconversion
|
|
343
|
+
```
|
|
344
|
+
|
|
345
|
+
Filter parameters and enabled state are written to the output `metadata.json`:
|
|
346
|
+
|
|
347
|
+
```json
|
|
348
|
+
{
|
|
349
|
+
"filters": {
|
|
350
|
+
"non_converted_reads": {"enabled": true, "threshold": 3},
|
|
351
|
+
"em_overconversion": {"enabled": true, "min_cpgs": 3}
|
|
352
|
+
}
|
|
353
|
+
}
|
|
354
|
+
```
|
|
355
|
+
|
|
356
|
+
### Reproducibility note
|
|
357
|
+
|
|
358
|
+
The two filters differ in whether they can be replayed downstream without the source BAM:
|
|
359
|
+
|
|
360
|
+
- **`--filter-em-overconversion` is reproducible from the `.npz` alone.** The heuristic is a pure function of each row's CpG state values. A downstream consumer who receives an unfiltered `.npz` can replay the filter at analysis time:
|
|
361
|
+
|
|
362
|
+
```python
|
|
363
|
+
import scipy.sparse
|
|
364
|
+
mat = scipy.sparse.load_npz("sample.methylation.npz").tocsr()
|
|
365
|
+
min_cpgs = 3
|
|
366
|
+
kept_rows = []
|
|
367
|
+
for i in range(mat.shape[0]):
|
|
368
|
+
row = mat.data[mat.indptr[i]:mat.indptr[i + 1]]  # stored entries only; toarray() would turn uncovered CpGs into 0 (unmethylated)
|
|
369
|
+
covered = row[(row == 0) | (row == 1)] # drop -1 no-data
|
|
370
|
+
is_overconv = len(covered) >= min_cpgs and (covered == 0).all()
|
|
371
|
+
if not is_overconv:
|
|
372
|
+
kept_rows.append(i)
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
- **`--filter-non-converted` is *not* reproducible from the `.npz` alone.** It relies on retained non-CpG cytosines (or Bismark's `H`/`X`/`U`), which are never written to the matrix. If you need this filter, apply it at extraction time (or re-run bam2tensor against the original BAM).
|
|
376
|
+
|
|
292
377
|
## Inspecting Output Files
|
|
293
378
|
|
|
294
379
|
Use `bam2tensor-inspect` to view a summary of any `.methylation.npz` file without writing Python:
|
|
@@ -302,11 +387,15 @@ sample.methylation.npz
|
|
|
302
387
|
CpG sites: 28,217,448
|
|
303
388
|
Data points: 12,847,322 (sparsity: 99.97%)
|
|
304
389
|
Fragment len: median 167, mean 182, range [50, 600]
|
|
390
|
+
Filters: non-converted (>= 3 non-CpG Cs)
|
|
391
|
+
EM over-conversion (all-unmethylated, >= 3 CpGs)
|
|
305
392
|
CpG index CRC32: a1b2c3d4
|
|
306
|
-
bam2tensor: v2.
|
|
393
|
+
bam2tensor: v2.5
|
|
307
394
|
File size: 14.2 MB
|
|
308
395
|
```
|
|
309
396
|
|
|
397
|
+
When no filters were applied, the line reads `Filters: none`. Files produced by bam2tensor versions older than v2.5 omit the line entirely.
|
|
398
|
+
|
|
310
399
|
You can pass multiple files at once:
|
|
311
400
|
|
|
312
401
|
```bash
|
|
@@ -368,6 +457,7 @@ Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP arc
|
|
|
368
457
|
| `expected_chromosomes` | List of chromosomes included in the column mapping |
|
|
369
458
|
| `total_cpg_sites` | Total number of CpG columns in the matrix |
|
|
370
459
|
| `cpg_index_crc32` | CRC32 checksum of the CpG site positions (verifies identical column semantics) |
|
|
460
|
+
| `filters` | Nested dict recording which opt-in conversion-error filters were applied (`non_converted_reads`, `em_overconversion`) and their parameters. See [Filtering Conversion Errors](#filtering-conversion-errors). Added in v2.5. |
|
|
371
461
|
|
|
372
462
|
This metadata is ignored by `scipy.sparse.load_npz`, so existing code continues to work. To read it:
|
|
373
463
|
|
|
@@ -570,6 +660,10 @@ extract_methylation_data_from_bam(
|
|
|
570
660
|
input_bam: str, # Path to BAM file
|
|
571
661
|
genome_methylation_embedding: GenomeMethylationEmbedding, # Embedding object
|
|
572
662
|
quality_limit: int = 20, # Minimum MAPQ
|
|
663
|
+
filter_non_converted: bool = False, # Drop reads with retained non-CpG Cs
|
|
664
|
+
non_converted_threshold: int = 3, # Threshold for the above filter
|
|
665
|
+
filter_em_overconversion: bool = False, # Drop EM-seq fragment-level over-conversion reads
|
|
666
|
+
em_overconversion_min_cpgs: int = 3, # Min CpGs before applying the above filter
|
|
573
667
|
verbose: bool = False, # Enable verbose output
|
|
574
668
|
debug: bool = False # Enable debug output
|
|
575
669
|
) -> ExtractionResult
|
|
@@ -39,6 +39,7 @@
|
|
|
39
39
|
- [Custom Output Directory](#custom-output-directory)
|
|
40
40
|
- [Using a Custom Genome](#using-a-custom-genome)
|
|
41
41
|
- [Command-Line Options](#command-line-options)
|
|
42
|
+
- [Filtering Conversion Errors](#filtering-conversion-errors)
|
|
42
43
|
- [Inspecting Output Files](#inspecting-output-files)
|
|
43
44
|
- [Output Data Structure](#output-data-structure)
|
|
44
45
|
- [Per-Read Fragment Length (TLEN)](#per-read-fragment-length-tlen)
|
|
@@ -66,6 +67,7 @@
|
|
|
66
67
|
- **Caching**: CpG site indexing is cached to accelerate repeated runs on the same genome
|
|
67
68
|
- **Quality Filtering**: Configurable mapping quality thresholds
|
|
68
69
|
- **Per-Read Fragment Length**: Stores BAM TLEN (template length) alongside the methylation tensor for joint fragment-methylation analysis
|
|
70
|
+
- **Conversion-Error Filters**: Optional per-read filters for incomplete bisulfite/EM-seq conversion (ported from `nebiolabs/mark-nonconverted-reads`) and EM-seq fragment-level over-conversion (Loyfer et al. 2026)
|
|
69
71
|
|
|
70
72
|
## Requirements
|
|
71
73
|
|
|
@@ -223,6 +225,25 @@ Options:
|
|
|
223
225
|
determine CpG sites).
|
|
224
226
|
--quality-limit INTEGER Quality filter for aligned reads (default =
|
|
225
227
|
20)
|
|
228
|
+
--filter-non-converted Drop reads with >= --non-converted-threshold
|
|
229
|
+
retained non-CpG cytosines, the signature of
|
|
230
|
+
incomplete bisulfite/EM-seq conversion (port
|
|
231
|
+
of nebiolabs/mark-nonconverted-reads).
|
|
232
|
+
Default: off.
|
|
233
|
+
--non-converted-threshold INTEGER
|
|
234
|
+
Minimum count of retained non-CpG cytosines
|
|
235
|
+
to drop a read (default = 3, matches NEB
|
|
236
|
+
mark-nonconverted-reads).
|
|
237
|
+
--filter-em-overconversion Drop EM-seq reads whose covered CpGs are all
|
|
238
|
+
called unmethylated and cover at least --em-
|
|
239
|
+
overconversion-min-cpgs sites (heuristic for
|
|
240
|
+
the fragment-level over-conversion artifact
|
|
241
|
+
described in Loyfer et al. bioRxiv
|
|
242
|
+
2026.03.24.713040). Default: off.
|
|
243
|
+
--em-overconversion-min-cpgs INTEGER
|
|
244
|
+
Minimum covered CpG count required before
|
|
245
|
+
the EM over-conversion filter will drop a
|
|
246
|
+
read (default = 3).
|
|
226
247
|
--verbose Verbose output.
|
|
227
248
|
--skip-cache De-novo generate CpG sites (slow).
|
|
228
249
|
--debug Debug mode (extensive validity checking +
|
|
@@ -248,6 +269,10 @@ Options:
|
|
|
248
269
|
| `--expected-chromosomes` | Comma-separated list of chromosome names to process. Chromosomes not in this list are skipped. Defaults to human autosomes + sex chromosomes. |
|
|
249
270
|
| `--reference-fasta` | Path to the reference genome FASTA file. Must match the genome used for alignment. |
|
|
250
271
|
| `--quality-limit` | Minimum mapping quality score (MAPQ) for reads to be included. Default is 20. |
|
|
272
|
+
| `--filter-non-converted` | Drop reads with at least `--non-converted-threshold` retained non-CpG cytosines (incomplete conversion). See [Filtering Conversion Errors](#filtering-conversion-errors). |
|
|
273
|
+
| `--non-converted-threshold` | Minimum number of retained non-CpG cytosines at which `--filter-non-converted` drops a read (inclusive). Default is 3. |
|
|
274
|
+
| `--filter-em-overconversion` | Drop EM-seq reads whose covered CpGs are all unmethylated and cover ≥ `--em-overconversion-min-cpgs` sites. See [Filtering Conversion Errors](#filtering-conversion-errors). |
|
|
275
|
+
| `--em-overconversion-min-cpgs` | Minimum covered CpG count before the EM over-conversion filter will drop a read. Default is 3. |
|
|
251
276
|
| `--verbose` | Enable detailed progress output including per-chromosome progress bars. |
|
|
252
277
|
| `--skip-cache` | Force regeneration of CpG site cache. Useful if you've modified the reference or chromosome list. |
|
|
253
278
|
| `--debug` | Enable extensive validation and debug output. Slower but useful for troubleshooting. |
|
|
@@ -256,6 +281,66 @@ Options:
|
|
|
256
281
|
| `--download-reference` | Download and cache a known reference genome. Choices: `hg38`, `hg19`, `mm10`, `T2T-CHM13`. Replaces `--reference-fasta`. |
|
|
257
282
|
| `--list-genomes` | List available reference genomes for `--download-reference` and exit. |
|
|
258
283
|
|
|
284
|
+
## Filtering Conversion Errors
|
|
285
|
+
|
|
286
|
+
Bisulfite and EM-seq library preparation can produce two kinds of per-read conversion errors that bias downstream methylation calls. bam2tensor provides two opt-in filters to drop affected reads at extraction time. Both are **default-off**, apply per read, and are recorded in the output `metadata.json` so downstream consumers know which filters were applied.
|
|
287
|
+
|
|
288
|
+
### `--filter-non-converted` — incomplete conversion
|
|
289
|
+
|
|
290
|
+
Ports the logic of [nebiolabs/mark-nonconverted-reads](https://github.com/nebiolabs/mark-nonconverted-reads). A read is dropped if it carries at least `--non-converted-threshold` (default 3) retained non-CpG cytosines, a signature of incomplete bisulfite or EM-seq conversion.
|
|
291
|
+
|
|
292
|
+
- **Bismark BAMs**: counted directly from the `XM` tag's uppercase `H`/`X`/`U` characters (retained cytosines in CHH/CHG/unknown contexts).
|
|
293
|
+
- **Biscuit / bwameth / gem3 BAMs**: counted by comparing the read to the reference via the `MD` tag (using pysam's `get_aligned_pairs(with_seq=True)`). SNPs — where the read's retained `C` sits over a reference base that isn't `C` — are excluded from the count, matching NEB's reference-validation step. No separate FASTA reload is required.
|
|
294
|
+
|
|
295
|
+
### `--filter-em-overconversion` — EM-seq fragment-level over-conversion
|
|
296
|
+
|
|
297
|
+
A heuristic inspired by [Loyfer et al. (bioRxiv 2026.03.24.713040)](https://www.biorxiv.org/content/10.64898/2026.03.24.713040v1). That paper shows EM-seq reproducibly produces ~1–2.5% of multi-CpG fragments that appear fully unmethylated across every covered CpG — a fragment-level artifact absent from WGBS and Oxford Nanopore. This filter drops any read whose covered CpGs are **all** called unmethylated *and* cover at least `--em-overconversion-min-cpgs` sites (default 3, the regime where the EM-seq artifact is clearly separable from WGBS in Loyfer et al. Fig. 1C).
|
|
298
|
+
|
|
299
|
+
The filter is a blunt instrument: it will also drop genuinely fully-unmethylated biological fragments at unmethylated markers. Enable it only when your downstream application (e.g., cfDNA deconvolution at constitutively methylated loci) can tolerate that trade-off.
|
|
300
|
+
|
|
301
|
+
### Usage
|
|
302
|
+
|
|
303
|
+
```bash
|
|
304
|
+
bam2tensor \
|
|
305
|
+
--input-path sample.bam \
|
|
306
|
+
--reference-fasta GRCh38.fa \
|
|
307
|
+
--genome-name hg38 \
|
|
308
|
+
--filter-non-converted \
|
|
309
|
+
--filter-em-overconversion
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
Filter parameters and enabled state are written to the output `metadata.json`:
|
|
313
|
+
|
|
314
|
+
```json
|
|
315
|
+
{
|
|
316
|
+
"filters": {
|
|
317
|
+
"non_converted_reads": {"enabled": true, "threshold": 3},
|
|
318
|
+
"em_overconversion": {"enabled": true, "min_cpgs": 3}
|
|
319
|
+
}
|
|
320
|
+
}
|
|
321
|
+
```
|
|
322
|
+
|
|
323
|
+
### Reproducibility note
|
|
324
|
+
|
|
325
|
+
The two filters differ in whether they can be replayed downstream without the source BAM:
|
|
326
|
+
|
|
327
|
+
- **`--filter-em-overconversion` is reproducible from the `.npz` alone.** The heuristic is a pure function of each row's CpG state values. A downstream consumer who receives an unfiltered `.npz` can replay the filter at analysis time:
|
|
328
|
+
|
|
329
|
+
```python
|
|
330
|
+
import scipy.sparse
|
|
331
|
+
mat = scipy.sparse.load_npz("sample.methylation.npz").tocsr()
|
|
332
|
+
min_cpgs = 3
|
|
333
|
+
kept_rows = []
|
|
334
|
+
for i in range(mat.shape[0]):
|
|
335
|
+
row = mat.data[mat.indptr[i]:mat.indptr[i + 1]]  # stored entries only; toarray() would turn uncovered CpGs into 0 (unmethylated)
|
|
336
|
+
covered = row[(row == 0) | (row == 1)] # drop -1 no-data
|
|
337
|
+
is_overconv = len(covered) >= min_cpgs and (covered == 0).all()
|
|
338
|
+
if not is_overconv:
|
|
339
|
+
kept_rows.append(i)
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
- **`--filter-non-converted` is *not* reproducible from the `.npz` alone.** It relies on retained non-CpG cytosines (or Bismark's `H`/`X`/`U`), which are never written to the matrix. If you need this filter, apply it at extraction time (or re-run bam2tensor against the original BAM).
|
|
343
|
+
|
|
259
344
|
## Inspecting Output Files
|
|
260
345
|
|
|
261
346
|
Use `bam2tensor-inspect` to view a summary of any `.methylation.npz` file without writing Python:
|
|
@@ -269,11 +354,15 @@ sample.methylation.npz
|
|
|
269
354
|
CpG sites: 28,217,448
|
|
270
355
|
Data points: 12,847,322 (sparsity: 99.97%)
|
|
271
356
|
Fragment len: median 167, mean 182, range [50, 600]
|
|
357
|
+
Filters: non-converted (>= 3 non-CpG Cs)
|
|
358
|
+
EM over-conversion (all-unmethylated, >= 3 CpGs)
|
|
272
359
|
CpG index CRC32: a1b2c3d4
|
|
273
|
-
bam2tensor: v2.
|
|
360
|
+
bam2tensor: v2.5
|
|
274
361
|
File size: 14.2 MB
|
|
275
362
|
```
|
|
276
363
|
|
|
364
|
+
When no filters were applied, the line reads `Filters: none`. Files produced by bam2tensor versions older than v2.5 omit the line entirely.
|
|
365
|
+
|
|
277
366
|
You can pass multiple files at once:
|
|
278
367
|
|
|
279
368
|
```bash
|
|
@@ -335,6 +424,7 @@ Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP arc
|
|
|
335
424
|
| `expected_chromosomes` | List of chromosomes included in the column mapping |
|
|
336
425
|
| `total_cpg_sites` | Total number of CpG columns in the matrix |
|
|
337
426
|
| `cpg_index_crc32` | CRC32 checksum of the CpG site positions (verifies identical column semantics) |
|
|
427
|
+
| `filters` | Nested dict recording which opt-in conversion-error filters were applied (`non_converted_reads`, `em_overconversion`) and their parameters. See [Filtering Conversion Errors](#filtering-conversion-errors). Added in v2.5. |
|
|
338
428
|
|
|
339
429
|
This metadata is ignored by `scipy.sparse.load_npz`, so existing code continues to work. To read it:
|
|
340
430
|
|
|
@@ -537,6 +627,10 @@ extract_methylation_data_from_bam(
|
|
|
537
627
|
input_bam: str, # Path to BAM file
|
|
538
628
|
genome_methylation_embedding: GenomeMethylationEmbedding, # Embedding object
|
|
539
629
|
quality_limit: int = 20, # Minimum MAPQ
|
|
630
|
+
filter_non_converted: bool = False, # Drop reads with retained non-CpG Cs
|
|
631
|
+
non_converted_threshold: int = 3, # Threshold for the above filter
|
|
632
|
+
filter_em_overconversion: bool = False, # Drop EM-seq fragment-level over-conversion reads
|
|
633
|
+
em_overconversion_min_cpgs: int = 3, # Min CpGs before applying the above filter
|
|
540
634
|
verbose: bool = False, # Enable verbose output
|
|
541
635
|
debug: bool = False # Enable debug output
|
|
542
636
|
) -> ExtractionResult
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "bam2tensor"
|
|
3
|
-
version = "2.4"
|
|
3
|
+
version = "2.6"
|
|
4
4
|
description = "Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation"
|
|
5
5
|
authors = [{ name = "Nick Semenkovich", email = "semenko@alum.mit.edu" }]
|
|
6
6
|
license = "MIT"
|
|
@@ -229,6 +229,43 @@ def validate_input_output(
|
|
|
229
229
|
default=20,
|
|
230
230
|
type=int,
|
|
231
231
|
)
|
|
232
|
+
@click.option(
|
|
233
|
+
"--filter-non-converted",
|
|
234
|
+
help=(
|
|
235
|
+
"Drop reads with >= --non-converted-threshold retained non-CpG "
|
|
236
|
+
"cytosines, the signature of incomplete bisulfite/EM-seq conversion "
|
|
237
|
+
"(port of nebiolabs/mark-nonconverted-reads). Default: off."
|
|
238
|
+
),
|
|
239
|
+
is_flag=True,
|
|
240
|
+
)
|
|
241
|
+
@click.option(
|
|
242
|
+
"--non-converted-threshold",
|
|
243
|
+
help=(
|
|
244
|
+
"Minimum count of retained non-CpG cytosines to drop a read "
|
|
245
|
+
"(default = 3, matches NEB mark-nonconverted-reads)."
|
|
246
|
+
),
|
|
247
|
+
default=3,
|
|
248
|
+
type=int,
|
|
249
|
+
)
|
|
250
|
+
@click.option(
|
|
251
|
+
"--filter-em-overconversion",
|
|
252
|
+
help=(
|
|
253
|
+
"Drop EM-seq reads whose covered CpGs are all called unmethylated "
|
|
254
|
+
"and cover at least --em-overconversion-min-cpgs sites (heuristic "
|
|
255
|
+
"for the fragment-level over-conversion artifact described in "
|
|
256
|
+
"Loyfer et al. bioRxiv 2026.03.24.713040). Default: off."
|
|
257
|
+
),
|
|
258
|
+
is_flag=True,
|
|
259
|
+
)
|
|
260
|
+
@click.option(
|
|
261
|
+
"--em-overconversion-min-cpgs",
|
|
262
|
+
help=(
|
|
263
|
+
"Minimum covered CpG count required before the EM over-conversion "
|
|
264
|
+
"filter will drop a read (default = 3)."
|
|
265
|
+
),
|
|
266
|
+
default=3,
|
|
267
|
+
type=int,
|
|
268
|
+
)
|
|
232
269
|
@click.option("--verbose", help="Verbose output.", is_flag=True)
|
|
233
270
|
@click.option("--skip-cache", help="De-novo generate CpG sites (slow).", is_flag=True)
|
|
234
271
|
@click.option(
|
|
@@ -263,6 +300,10 @@ def main(
|
|
|
263
300
|
expected_chromosomes: str | None,
|
|
264
301
|
reference_fasta: str | None,
|
|
265
302
|
quality_limit: int,
|
|
303
|
+
filter_non_converted: bool,
|
|
304
|
+
non_converted_threshold: int,
|
|
305
|
+
filter_em_overconversion: bool,
|
|
306
|
+
em_overconversion_min_cpgs: int,
|
|
266
307
|
verbose: bool,
|
|
267
308
|
skip_cache: bool,
|
|
268
309
|
debug: bool,
|
|
@@ -300,6 +341,17 @@ def main(
|
|
|
300
341
|
``--download-reference`` is used.
|
|
301
342
|
quality_limit: Minimum mapping quality (MAPQ) threshold. Reads below
|
|
302
343
|
this quality are excluded.
|
|
344
|
+
filter_non_converted: If True, drop reads with at least
|
|
345
|
+
``non_converted_threshold`` retained non-CpG cytosines —
|
|
346
|
+
indicating incomplete bisulfite/EM-seq conversion.
|
|
347
|
+
non_converted_threshold: Threshold used by the non-converted
|
|
348
|
+
read filter.
|
|
349
|
+
filter_em_overconversion: If True, drop reads whose covered CpGs
|
|
350
|
+
are all called unmethylated and cover at least
|
|
351
|
+
``em_overconversion_min_cpgs`` sites — heuristic for EM-seq
|
|
352
|
+
fragment-level over-conversion (Loyfer et al. 2026).
|
|
353
|
+
em_overconversion_min_cpgs: Minimum covered CpG count required
|
|
354
|
+
before the over-conversion filter will drop a read.
|
|
303
355
|
verbose: If True, print detailed progress information.
|
|
304
356
|
skip_cache: If True, regenerate the CpG site index even if a cache
|
|
305
357
|
file exists.
|
|
@@ -382,6 +434,16 @@ def main(
|
|
|
382
434
|
print(f" Reference: {reference_fasta}")
|
|
383
435
|
print(f" Chromosomes: {chrom_display}")
|
|
384
436
|
print(f" Quality limit: MAPQ >= {quality_limit}")
|
|
437
|
+
if filter_non_converted:
|
|
438
|
+
print(
|
|
439
|
+
f" Filters: non-converted reads (>= "
|
|
440
|
+
f"{non_converted_threshold} retained non-CpG Cs)"
|
|
441
|
+
)
|
|
442
|
+
if filter_em_overconversion:
|
|
443
|
+
print(
|
|
444
|
+
f" EM over-conversion (all-unmethylated, >= "
|
|
445
|
+
f"{em_overconversion_min_cpgs} CpGs)"
|
|
446
|
+
)
|
|
385
447
|
if output_dir:
|
|
386
448
|
print(f" Output dir: {output_dir}")
|
|
387
449
|
else:
|
|
@@ -448,6 +510,10 @@ def main(
|
|
|
448
510
|
input_bam=input_bam,
|
|
449
511
|
genome_methylation_embedding=genome_methylation_embedding,
|
|
450
512
|
quality_limit=quality_limit,
|
|
513
|
+
filter_non_converted=filter_non_converted,
|
|
514
|
+
non_converted_threshold=non_converted_threshold,
|
|
515
|
+
filter_em_overconversion=filter_em_overconversion,
|
|
516
|
+
em_overconversion_min_cpgs=em_overconversion_min_cpgs,
|
|
451
517
|
verbose=verbose,
|
|
452
518
|
debug=debug,
|
|
453
519
|
)
|
|
@@ -476,6 +542,16 @@ def main(
|
|
|
476
542
|
"expected_chromosomes": chrom_list,
|
|
477
543
|
"total_cpg_sites": genome_methylation_embedding.total_cpg_sites,
|
|
478
544
|
"cpg_index_crc32": cpg_crc32,
|
|
545
|
+
"filters": {
|
|
546
|
+
"non_converted_reads": {
|
|
547
|
+
"enabled": filter_non_converted,
|
|
548
|
+
"threshold": non_converted_threshold,
|
|
549
|
+
},
|
|
550
|
+
"em_overconversion": {
|
|
551
|
+
"enabled": filter_em_overconversion,
|
|
552
|
+
"min_cpgs": em_overconversion_min_cpgs,
|
|
553
|
+
},
|
|
554
|
+
},
|
|
479
555
|
},
|
|
480
556
|
)
|
|
481
557
|
print(f" Output: {output_file}")
|