bam2tensor 2.4__tar.gz → 2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. {bam2tensor-2.4 → bam2tensor-2.6}/.github/workflows/docs.yml +1 -1
  2. {bam2tensor-2.4 → bam2tensor-2.6}/.github/workflows/release.yml +3 -3
  3. {bam2tensor-2.4 → bam2tensor-2.6}/CLAUDE.md +1 -1
  4. {bam2tensor-2.4 → bam2tensor-2.6}/PKG-INFO +96 -2
  5. {bam2tensor-2.4 → bam2tensor-2.6}/README.md +95 -1
  6. {bam2tensor-2.4 → bam2tensor-2.6}/pyproject.toml +1 -1
  7. {bam2tensor-2.4 → bam2tensor-2.6}/src/bam2tensor/__init__.py +1 -1
  8. {bam2tensor-2.4 → bam2tensor-2.6}/src/bam2tensor/__main__.py +76 -0
  9. {bam2tensor-2.4 → bam2tensor-2.6}/src/bam2tensor/functions.py +295 -58
  10. {bam2tensor-2.4 → bam2tensor-2.6}/src/bam2tensor/inspect.py +19 -0
  11. bam2tensor-2.6/tests/test_filters.py +568 -0
  12. {bam2tensor-2.4 → bam2tensor-2.6}/tests/test_functions.py +127 -0
  13. {bam2tensor-2.4 → bam2tensor-2.6}/tests/test_inspect.py +3 -3
  14. {bam2tensor-2.4 → bam2tensor-2.6}/uv.lock +131 -115
  15. {bam2tensor-2.4 → bam2tensor-2.6}/.darglint +0 -0
  16. {bam2tensor-2.4 → bam2tensor-2.6}/.editorconfig +0 -0
  17. {bam2tensor-2.4 → bam2tensor-2.6}/.gitattributes +0 -0
  18. {bam2tensor-2.4 → bam2tensor-2.6}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  19. {bam2tensor-2.4 → bam2tensor-2.6}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  20. {bam2tensor-2.4 → bam2tensor-2.6}/.github/actions/setup-env/action.yml +0 -0
  21. {bam2tensor-2.4 → bam2tensor-2.6}/.github/dependabot.yml +0 -0
  22. {bam2tensor-2.4 → bam2tensor-2.6}/.github/labels.yml +0 -0
  23. {bam2tensor-2.4 → bam2tensor-2.6}/.github/release-drafter.yml +0 -0
  24. {bam2tensor-2.4 → bam2tensor-2.6}/.github/workflows/constraints.txt +0 -0
  25. {bam2tensor-2.4 → bam2tensor-2.6}/.github/workflows/labeler.yml +0 -0
  26. {bam2tensor-2.4 → bam2tensor-2.6}/.github/workflows/tests.yml +0 -0
  27. {bam2tensor-2.4 → bam2tensor-2.6}/.gitignore +0 -0
  28. {bam2tensor-2.4 → bam2tensor-2.6}/.pre-commit-config.yaml +0 -0
  29. {bam2tensor-2.4 → bam2tensor-2.6}/CONTRIBUTING.md +0 -0
  30. {bam2tensor-2.4 → bam2tensor-2.6}/LICENSE +0 -0
  31. {bam2tensor-2.4 → bam2tensor-2.6}/SECURITY.md +0 -0
  32. {bam2tensor-2.4 → bam2tensor-2.6}/docs/Makefile +0 -0
  33. {bam2tensor-2.4 → bam2tensor-2.6}/docs/conf.py +0 -0
  34. {bam2tensor-2.4 → bam2tensor-2.6}/docs/contributing.md +0 -0
  35. {bam2tensor-2.4 → bam2tensor-2.6}/docs/index.md +0 -0
  36. {bam2tensor-2.4 → bam2tensor-2.6}/docs/license.md +0 -0
  37. {bam2tensor-2.4 → bam2tensor-2.6}/docs/logo/604669_dna turning into math, computer _xl-1024-v1-0.png +0 -0
  38. {bam2tensor-2.4 → bam2tensor-2.6}/docs/logo/bam2tensor-logo.afdesign +0 -0
  39. {bam2tensor-2.4 → bam2tensor-2.6}/docs/logo/bam2tensor-logo.png +0 -0
  40. {bam2tensor-2.4 → bam2tensor-2.6}/docs/make.bat +0 -0
  41. {bam2tensor-2.4 → bam2tensor-2.6}/docs/nano-banana-overview-shrunk.png +0 -0
  42. {bam2tensor-2.4 → bam2tensor-2.6}/docs/reference.md +0 -0
  43. {bam2tensor-2.4 → bam2tensor-2.6}/docs/templates/package.rst_t +0 -0
  44. {bam2tensor-2.4 → bam2tensor-2.6}/noxfile.py +0 -0
  45. {bam2tensor-2.4 → bam2tensor-2.6}/src/bam2tensor/embedding.py +0 -0
  46. {bam2tensor-2.4 → bam2tensor-2.6}/src/bam2tensor/metadata.py +0 -0
  47. {bam2tensor-2.4 → bam2tensor-2.6}/src/bam2tensor/py.typed +0 -0
  48. {bam2tensor-2.4 → bam2tensor-2.6}/src/bam2tensor/reference.py +0 -0
  49. {bam2tensor-2.4 → bam2tensor-2.6}/tests/__init__.py +0 -0
  50. {bam2tensor-2.4 → bam2tensor-2.6}/tests/test_duplication.py +0 -0
  51. {bam2tensor-2.4 → bam2tensor-2.6}/tests/test_embedding.py +0 -0
  52. {bam2tensor-2.4 → bam2tensor-2.6}/tests/test_fasta.fa +0 -0
  53. {bam2tensor-2.4 → bam2tensor-2.6}/tests/test_main.py +0 -0
  54. {bam2tensor-2.4 → bam2tensor-2.6}/tests/test_metadata.py +0 -0
  55. {bam2tensor-2.4 → bam2tensor-2.6}/tests/test_reference.py +0 -0
@@ -47,7 +47,7 @@ jobs:
47
47
  uv run sphinx-build docs docs/_build
48
48
 
49
49
  - name: Upload artifact
50
- uses: actions/upload-pages-artifact@v4
50
+ uses: actions/upload-pages-artifact@v5
51
51
  with:
52
52
  path: "docs/_build"
53
53
 
@@ -67,16 +67,16 @@ jobs:
67
67
 
68
68
  - name: Publish package on PyPI
69
69
  if: steps.check-version.outputs.tag || steps.check-tag.outputs.tag
70
- uses: pypa/gh-action-pypi-publish@v1.13.0
70
+ uses: pypa/gh-action-pypi-publish@v1.14.0
71
71
 
72
72
  - name: Publish package on TestPyPI
73
73
  if: (!steps.check-version.outputs.tag && !steps.check-tag.outputs.tag)
74
- uses: pypa/gh-action-pypi-publish@v1.13.0
74
+ uses: pypa/gh-action-pypi-publish@v1.14.0
75
75
  with:
76
76
  repository-url: https://test.pypi.org/legacy/
77
77
 
78
78
  - name: Publish the release notes
79
- uses: release-drafter/release-drafter@v7.1.1
79
+ uses: release-drafter/release-drafter@v7.2.1
80
80
  with:
81
81
  publish: ${{ steps.check-version.outputs.tag != '' || steps.check-tag.outputs.tag != '' }}
82
82
  tag: ${{ steps.check-version.outputs.tag || steps.check-tag.outputs.tag }}
@@ -40,7 +40,7 @@ uv run mypy src
40
40
 
41
41
  ```
42
42
  src/bam2tensor/
43
- __init__.py # Package version (2.4)
43
+ __init__.py # Package version (2.6)
44
44
  __main__.py # Click CLI entry point (bam2tensor command)
45
45
  inspect.py # Inspect CLI entry point (bam2tensor-inspect command)
46
46
  embedding.py # GenomeMethylationEmbedding class (FASTA parsing, CpG indexing)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bam2tensor
3
- Version: 2.4
3
+ Version: 2.6
4
4
  Summary: Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation
5
5
  Project-URL: Homepage, https://github.com/mcwdsi/bam2tensor
6
6
  Project-URL: Repository, https://github.com/mcwdsi/bam2tensor
@@ -72,6 +72,7 @@ Description-Content-Type: text/markdown
72
72
  - [Custom Output Directory](#custom-output-directory)
73
73
  - [Using a Custom Genome](#using-a-custom-genome)
74
74
  - [Command-Line Options](#command-line-options)
75
+ - [Filtering Conversion Errors](#filtering-conversion-errors)
75
76
  - [Inspecting Output Files](#inspecting-output-files)
76
77
  - [Output Data Structure](#output-data-structure)
77
78
  - [Per-Read Fragment Length (TLEN)](#per-read-fragment-length-tlen)
@@ -99,6 +100,7 @@ Description-Content-Type: text/markdown
99
100
  - **Caching**: CpG site indexing is cached to accelerate repeated runs on the same genome
100
101
  - **Quality Filtering**: Configurable mapping quality thresholds
101
102
  - **Per-Read Fragment Length**: Stores BAM TLEN (template length) alongside the methylation tensor for joint fragment-methylation analysis
103
+ - **Conversion-Error Filters**: Optional per-read filters for incomplete bisulfite/EM-seq conversion (ported from `nebiolabs/mark-nonconverted-reads`) and EM-seq fragment-level over-conversion (Loyfer et al. 2026)
102
104
 
103
105
  ## Requirements
104
106
 
@@ -256,6 +258,25 @@ Options:
256
258
  determine CpG sites).
257
259
  --quality-limit INTEGER Quality filter for aligned reads (default =
258
260
  20)
261
+ --filter-non-converted Drop reads with >= --non-converted-threshold
262
+ retained non-CpG cytosines, the signature of
263
+ incomplete bisulfite/EM-seq conversion (port
264
+ of nebiolabs/mark-nonconverted-reads).
265
+ Default: off.
266
+ --non-converted-threshold INTEGER
267
+ Minimum count of retained non-CpG cytosines
268
+ to drop a read (default = 3, matches NEB
269
+ mark-nonconverted-reads).
270
+ --filter-em-overconversion Drop EM-seq reads whose covered CpGs are all
271
+ called unmethylated and cover at least --em-
272
+ overconversion-min-cpgs sites (heuristic for
273
+ the fragment-level over-conversion artifact
274
+ described in Loyfer et al. bioRxiv
275
+ 2026.03.24.713040). Default: off.
276
+ --em-overconversion-min-cpgs INTEGER
277
+ Minimum covered CpG count required before
278
+ the EM over-conversion filter will drop a
279
+ read (default = 3).
259
280
  --verbose Verbose output.
260
281
  --skip-cache De-novo generate CpG sites (slow).
261
282
  --debug Debug mode (extensive validity checking +
@@ -281,6 +302,10 @@ Options:
281
302
  | `--expected-chromosomes` | Comma-separated list of chromosome names to process. Chromosomes not in this list are skipped. Defaults to human autosomes + sex chromosomes. |
282
303
  | `--reference-fasta` | Path to the reference genome FASTA file. Must match the genome used for alignment. |
283
304
  | `--quality-limit` | Minimum mapping quality score (MAPQ) for reads to be included. Default is 20. |
305
+ | `--filter-non-converted` | Drop reads with at least `--non-converted-threshold` retained non-CpG cytosines (a sign of incomplete conversion). See [Filtering Conversion Errors](#filtering-conversion-errors). |
306
+ | `--non-converted-threshold` | Minimum number of retained non-CpG cytosines at which `--filter-non-converted` drops a read. Default is 3. |
307
+ | `--filter-em-overconversion` | Drop EM-seq reads whose covered CpGs are all unmethylated and cover ≥ `--em-overconversion-min-cpgs` sites. See [Filtering Conversion Errors](#filtering-conversion-errors). |
308
+ | `--em-overconversion-min-cpgs` | Minimum covered CpG count before the EM over-conversion filter will drop a read. Default is 3. |
284
309
  | `--verbose` | Enable detailed progress output including per-chromosome progress bars. |
285
310
  | `--skip-cache` | Force regeneration of CpG site cache. Useful if you've modified the reference or chromosome list. |
286
311
  | `--debug` | Enable extensive validation and debug output. Slower but useful for troubleshooting. |
@@ -289,6 +314,66 @@ Options:
289
314
  | `--download-reference` | Download and cache a known reference genome. Choices: `hg38`, `hg19`, `mm10`, `T2T-CHM13`. Replaces `--reference-fasta`. |
290
315
  | `--list-genomes` | List available reference genomes for `--download-reference` and exit. |
291
316
 
317
+ ## Filtering Conversion Errors
318
+
319
+ Bisulfite and EM-seq library preparation can produce two kinds of per-read conversion errors that bias downstream methylation calls. bam2tensor provides two opt-in filters to drop affected reads at extraction time. Both are **default-off**, apply per read, and are recorded in the output `metadata.json` so downstream consumers know which filters were applied.
320
+
321
+ ### `--filter-non-converted` — incomplete conversion
322
+
323
+ Ports the logic of [nebiolabs/mark-nonconverted-reads](https://github.com/nebiolabs/mark-nonconverted-reads). A read is dropped if it carries at least `--non-converted-threshold` (default 3) retained non-CpG cytosines, a signature of incomplete bisulfite or EM-seq conversion.
324
+
325
+ - **Bismark BAMs**: counted directly from the `XM` tag's uppercase `H`/`X`/`U` characters (retained cytosines in CHH/CHG/unknown contexts).
326
+ - **Biscuit / bwameth / gem3 BAMs**: counted by comparing the read to the reference via the `MD` tag (using pysam's `get_aligned_pairs(with_seq=True)`). SNPs — where the read's retained `C` sits over a reference base that isn't `C` — are excluded from the count, matching NEB's reference-validation step. No separate FASTA reload is required.
327
+
328
+ ### `--filter-em-overconversion` — EM-seq fragment-level over-conversion
329
+
330
+ A heuristic inspired by [Loyfer et al. (bioRxiv 2026.03.24.713040)](https://www.biorxiv.org/content/10.64898/2026.03.24.713040v1). That paper shows EM-seq reproducibly produces ~1–2.5% of multi-CpG fragments that appear fully unmethylated across every covered CpG — a fragment-level artifact absent from WGBS and Oxford Nanopore. This filter drops any read whose covered CpGs are **all** called unmethylated *and* cover at least `--em-overconversion-min-cpgs` sites (default 3, the regime where the EM-seq artifact is clearly separable from WGBS in Loyfer et al. Fig. 1C).
331
+
332
+ The filter is a blunt instrument: it will also drop genuinely fully-unmethylated biological fragments at unmethylated markers. Enable it only when your downstream application (e.g., cfDNA deconvolution at constitutively methylated loci) can tolerate that trade-off.
333
+
334
+ ### Usage
335
+
336
+ ```bash
337
+ bam2tensor \
338
+ --input-path sample.bam \
339
+ --reference-fasta GRCh38.fa \
340
+ --genome-name hg38 \
341
+ --filter-non-converted \
342
+ --filter-em-overconversion
343
+ ```
344
+
345
+ Filter parameters and enabled state are written to the output `metadata.json`:
346
+
347
+ ```json
348
+ {
349
+ "filters": {
350
+ "non_converted_reads": {"enabled": true, "threshold": 3},
351
+ "em_overconversion": {"enabled": true, "min_cpgs": 3}
352
+ }
353
+ }
354
+ ```
355
+
356
+ ### Reproducibility note
357
+
358
+ The two filters differ in whether they can be replayed downstream without the source BAM:
359
+
360
+ - **`--filter-em-overconversion` is reproducible from the `.npz` alone.** The heuristic is a pure function of each row's CpG state values. A downstream consumer who receives an unfiltered `.npz` can replay the filter at analysis time:
361
+
362
+ ```python
363
+ import scipy.sparse
364
+ mat = scipy.sparse.load_npz("sample.methylation.npz").tocsr()
365
+ min_cpgs = 3
366
+ kept_rows = []
367
+ for i in range(mat.shape[0]):
368
+ row = mat.getrow(i).toarray().ravel()
369
+ covered = row[(row == 0) | (row == 1)] # drop -1 no-data
370
+ is_overconv = len(covered) >= min_cpgs and (covered == 0).all()
371
+ if not is_overconv:
372
+ kept_rows.append(i)
373
+ ```
374
+
375
+ - **`--filter-non-converted` is *not* reproducible from the `.npz` alone.** It relies on retained non-CpG cytosines (or Bismark's `H`/`X`/`U`), which are never written to the matrix. If you need this filter, apply it at extraction time (or re-run bam2tensor against the original BAM).
376
+
292
377
  ## Inspecting Output Files
293
378
 
294
379
  Use `bam2tensor-inspect` to view a summary of any `.methylation.npz` file without writing Python:
@@ -302,11 +387,15 @@ sample.methylation.npz
302
387
  CpG sites: 28,217,448
303
388
  Data points: 12,847,322 (sparsity: 99.97%)
304
389
  Fragment len: median 167, mean 182, range [50, 600]
390
+ Filters: non-converted (>= 3 non-CpG Cs)
391
+ EM over-conversion (all-unmethylated, >= 3 CpGs)
305
392
  CpG index CRC32: a1b2c3d4
306
- bam2tensor: v2.4
393
+ bam2tensor: v2.5
307
394
  File size: 14.2 MB
308
395
  ```
309
396
 
397
+ When no filters were applied, the line reads `Filters: none`. Files produced by bam2tensor versions older than v2.5 omit the line entirely.
398
+
310
399
  You can pass multiple files at once:
311
400
 
312
401
  ```bash
@@ -368,6 +457,7 @@ Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP arc
368
457
  | `expected_chromosomes` | List of chromosomes included in the column mapping |
369
458
  | `total_cpg_sites` | Total number of CpG columns in the matrix |
370
459
  | `cpg_index_crc32` | CRC32 checksum of the CpG site positions (verifies identical column semantics) |
460
+ | `filters` | Nested dict recording which opt-in conversion-error filters were applied (`non_converted_reads`, `em_overconversion`) and their parameters. See [Filtering Conversion Errors](#filtering-conversion-errors). Added in v2.5. |
371
461
 
372
462
  This metadata is ignored by `scipy.sparse.load_npz`, so existing code continues to work. To read it:
373
463
 
@@ -570,6 +660,10 @@ extract_methylation_data_from_bam(
570
660
  input_bam: str, # Path to BAM file
571
661
  genome_methylation_embedding: GenomeMethylationEmbedding, # Embedding object
572
662
  quality_limit: int = 20, # Minimum MAPQ
663
+ filter_non_converted: bool = False, # Drop reads with retained non-CpG Cs
664
+ non_converted_threshold: int = 3, # Threshold for the above filter
665
+ filter_em_overconversion: bool = False, # Drop EM-seq fragment-level over-conversion reads
666
+ em_overconversion_min_cpgs: int = 3, # Min CpGs before applying the above filter
573
667
  verbose: bool = False, # Enable verbose output
574
668
  debug: bool = False # Enable debug output
575
669
  ) -> ExtractionResult
@@ -39,6 +39,7 @@
39
39
  - [Custom Output Directory](#custom-output-directory)
40
40
  - [Using a Custom Genome](#using-a-custom-genome)
41
41
  - [Command-Line Options](#command-line-options)
42
+ - [Filtering Conversion Errors](#filtering-conversion-errors)
42
43
  - [Inspecting Output Files](#inspecting-output-files)
43
44
  - [Output Data Structure](#output-data-structure)
44
45
  - [Per-Read Fragment Length (TLEN)](#per-read-fragment-length-tlen)
@@ -66,6 +67,7 @@
66
67
  - **Caching**: CpG site indexing is cached to accelerate repeated runs on the same genome
67
68
  - **Quality Filtering**: Configurable mapping quality thresholds
68
69
  - **Per-Read Fragment Length**: Stores BAM TLEN (template length) alongside the methylation tensor for joint fragment-methylation analysis
70
+ - **Conversion-Error Filters**: Optional per-read filters for incomplete bisulfite/EM-seq conversion (ported from `nebiolabs/mark-nonconverted-reads`) and EM-seq fragment-level over-conversion (Loyfer et al. 2026)
69
71
 
70
72
  ## Requirements
71
73
 
@@ -223,6 +225,25 @@ Options:
223
225
  determine CpG sites).
224
226
  --quality-limit INTEGER Quality filter for aligned reads (default =
225
227
  20)
228
+ --filter-non-converted Drop reads with >= --non-converted-threshold
229
+ retained non-CpG cytosines, the signature of
230
+ incomplete bisulfite/EM-seq conversion (port
231
+ of nebiolabs/mark-nonconverted-reads).
232
+ Default: off.
233
+ --non-converted-threshold INTEGER
234
+ Minimum count of retained non-CpG cytosines
235
+ to drop a read (default = 3, matches NEB
236
+ mark-nonconverted-reads).
237
+ --filter-em-overconversion Drop EM-seq reads whose covered CpGs are all
238
+ called unmethylated and cover at least --em-
239
+ overconversion-min-cpgs sites (heuristic for
240
+ the fragment-level over-conversion artifact
241
+ described in Loyfer et al. bioRxiv
242
+ 2026.03.24.713040). Default: off.
243
+ --em-overconversion-min-cpgs INTEGER
244
+ Minimum covered CpG count required before
245
+ the EM over-conversion filter will drop a
246
+ read (default = 3).
226
247
  --verbose Verbose output.
227
248
  --skip-cache De-novo generate CpG sites (slow).
228
249
  --debug Debug mode (extensive validity checking +
@@ -248,6 +269,10 @@ Options:
248
269
  | `--expected-chromosomes` | Comma-separated list of chromosome names to process. Chromosomes not in this list are skipped. Defaults to human autosomes + sex chromosomes. |
249
270
  | `--reference-fasta` | Path to the reference genome FASTA file. Must match the genome used for alignment. |
250
271
  | `--quality-limit` | Minimum mapping quality score (MAPQ) for reads to be included. Default is 20. |
272
+ | `--filter-non-converted` | Drop reads with at least `--non-converted-threshold` retained non-CpG cytosines (a sign of incomplete conversion). See [Filtering Conversion Errors](#filtering-conversion-errors). |
273
+ | `--non-converted-threshold` | Minimum number of retained non-CpG cytosines at which `--filter-non-converted` drops a read. Default is 3. |
274
+ | `--filter-em-overconversion` | Drop EM-seq reads whose covered CpGs are all unmethylated and cover ≥ `--em-overconversion-min-cpgs` sites. See [Filtering Conversion Errors](#filtering-conversion-errors). |
275
+ | `--em-overconversion-min-cpgs` | Minimum covered CpG count before the EM over-conversion filter will drop a read. Default is 3. |
251
276
  | `--verbose` | Enable detailed progress output including per-chromosome progress bars. |
252
277
  | `--skip-cache` | Force regeneration of CpG site cache. Useful if you've modified the reference or chromosome list. |
253
278
  | `--debug` | Enable extensive validation and debug output. Slower but useful for troubleshooting. |
@@ -256,6 +281,66 @@ Options:
256
281
  | `--download-reference` | Download and cache a known reference genome. Choices: `hg38`, `hg19`, `mm10`, `T2T-CHM13`. Replaces `--reference-fasta`. |
257
282
  | `--list-genomes` | List available reference genomes for `--download-reference` and exit. |
258
283
 
284
+ ## Filtering Conversion Errors
285
+
286
+ Bisulfite and EM-seq library preparation can produce two kinds of per-read conversion errors that bias downstream methylation calls. bam2tensor provides two opt-in filters to drop affected reads at extraction time. Both are **default-off**, apply per read, and are recorded in the output `metadata.json` so downstream consumers know which filters were applied.
287
+
288
+ ### `--filter-non-converted` — incomplete conversion
289
+
290
+ Ports the logic of [nebiolabs/mark-nonconverted-reads](https://github.com/nebiolabs/mark-nonconverted-reads). A read is dropped if it carries at least `--non-converted-threshold` (default 3) retained non-CpG cytosines, a signature of incomplete bisulfite or EM-seq conversion.
291
+
292
+ - **Bismark BAMs**: counted directly from the `XM` tag's uppercase `H`/`X`/`U` characters (retained cytosines in CHH/CHG/unknown contexts).
293
+ - **Biscuit / bwameth / gem3 BAMs**: counted by comparing the read to the reference via the `MD` tag (using pysam's `get_aligned_pairs(with_seq=True)`). SNPs — where the read's retained `C` sits over a reference base that isn't `C` — are excluded from the count, matching NEB's reference-validation step. No separate FASTA reload is required.
294
+
295
+ ### `--filter-em-overconversion` — EM-seq fragment-level over-conversion
296
+
297
+ A heuristic inspired by [Loyfer et al. (bioRxiv 2026.03.24.713040)](https://www.biorxiv.org/content/10.64898/2026.03.24.713040v1). That paper shows EM-seq reproducibly produces ~1–2.5% of multi-CpG fragments that appear fully unmethylated across every covered CpG — a fragment-level artifact absent from WGBS and Oxford Nanopore. This filter drops any read whose covered CpGs are **all** called unmethylated *and* cover at least `--em-overconversion-min-cpgs` sites (default 3, the regime where the EM-seq artifact is clearly separable from WGBS in Loyfer et al. Fig. 1C).
298
+
299
+ The filter is a blunt instrument: it will also drop genuinely fully-unmethylated biological fragments at unmethylated markers. Enable it only when your downstream application (e.g., cfDNA deconvolution at constitutively methylated loci) can tolerate that trade-off.
300
+
301
+ ### Usage
302
+
303
+ ```bash
304
+ bam2tensor \
305
+ --input-path sample.bam \
306
+ --reference-fasta GRCh38.fa \
307
+ --genome-name hg38 \
308
+ --filter-non-converted \
309
+ --filter-em-overconversion
310
+ ```
311
+
312
+ Filter parameters and enabled state are written to the output `metadata.json`:
313
+
314
+ ```json
315
+ {
316
+ "filters": {
317
+ "non_converted_reads": {"enabled": true, "threshold": 3},
318
+ "em_overconversion": {"enabled": true, "min_cpgs": 3}
319
+ }
320
+ }
321
+ ```
322
+
323
+ ### Reproducibility note
324
+
325
+ The two filters differ in whether they can be replayed downstream without the source BAM:
326
+
327
+ - **`--filter-em-overconversion` is reproducible from the `.npz` alone.** The heuristic is a pure function of each row's CpG state values. A downstream consumer who receives an unfiltered `.npz` can replay the filter at analysis time:
328
+
329
+ ```python
330
+ import scipy.sparse
331
+ mat = scipy.sparse.load_npz("sample.methylation.npz").tocsr()
332
+ min_cpgs = 3
333
+ kept_rows = []
334
+ for i in range(mat.shape[0]):
335
+ row = mat.getrow(i).toarray().ravel()
336
+ covered = row[(row == 0) | (row == 1)] # drop -1 no-data
337
+ is_overconv = len(covered) >= min_cpgs and (covered == 0).all()
338
+ if not is_overconv:
339
+ kept_rows.append(i)
340
+ ```
341
+
342
+ - **`--filter-non-converted` is *not* reproducible from the `.npz` alone.** It relies on retained non-CpG cytosines (or Bismark's `H`/`X`/`U`), which are never written to the matrix. If you need this filter, apply it at extraction time (or re-run bam2tensor against the original BAM).
343
+
259
344
  ## Inspecting Output Files
260
345
 
261
346
  Use `bam2tensor-inspect` to view a summary of any `.methylation.npz` file without writing Python:
@@ -269,11 +354,15 @@ sample.methylation.npz
269
354
  CpG sites: 28,217,448
270
355
  Data points: 12,847,322 (sparsity: 99.97%)
271
356
  Fragment len: median 167, mean 182, range [50, 600]
357
+ Filters: non-converted (>= 3 non-CpG Cs)
358
+ EM over-conversion (all-unmethylated, >= 3 CpGs)
272
359
  CpG index CRC32: a1b2c3d4
273
- bam2tensor: v2.4
360
+ bam2tensor: v2.5
274
361
  File size: 14.2 MB
275
362
  ```
276
363
 
364
+ When no filters were applied, the line reads `Filters: none`. Files produced by bam2tensor versions older than v2.5 omit the line entirely.
365
+
277
366
  You can pass multiple files at once:
278
367
 
279
368
  ```bash
@@ -335,6 +424,7 @@ Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP arc
335
424
  | `expected_chromosomes` | List of chromosomes included in the column mapping |
336
425
  | `total_cpg_sites` | Total number of CpG columns in the matrix |
337
426
  | `cpg_index_crc32` | CRC32 checksum of the CpG site positions (verifies identical column semantics) |
427
+ | `filters` | Nested dict recording which opt-in conversion-error filters were applied (`non_converted_reads`, `em_overconversion`) and their parameters. See [Filtering Conversion Errors](#filtering-conversion-errors). Added in v2.5. |
338
428
 
339
429
  This metadata is ignored by `scipy.sparse.load_npz`, so existing code continues to work. To read it:
340
430
 
@@ -537,6 +627,10 @@ extract_methylation_data_from_bam(
537
627
  input_bam: str, # Path to BAM file
538
628
  genome_methylation_embedding: GenomeMethylationEmbedding, # Embedding object
539
629
  quality_limit: int = 20, # Minimum MAPQ
630
+ filter_non_converted: bool = False, # Drop reads with retained non-CpG Cs
631
+ non_converted_threshold: int = 3, # Threshold for the above filter
632
+ filter_em_overconversion: bool = False, # Drop EM-seq fragment-level over-conversion reads
633
+ em_overconversion_min_cpgs: int = 3, # Min CpGs before applying the above filter
540
634
  verbose: bool = False, # Enable verbose output
541
635
  debug: bool = False # Enable debug output
542
636
  ) -> ExtractionResult
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "bam2tensor"
3
- version = "2.4"
3
+ version = "2.6"
4
4
  description = "Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation"
5
5
  authors = [{ name = "Nick Semenkovich", email = "semenko@alum.mit.edu" }]
6
6
  license = "MIT"
@@ -50,4 +50,4 @@ See Also:
50
50
  - https://mcwdsi.github.io/bam2tensor for full documentation
51
51
  """
52
52
 
53
- __version__ = "2.4"
53
+ __version__ = "2.6"
@@ -229,6 +229,43 @@ def validate_input_output(
229
229
  default=20,
230
230
  type=int,
231
231
  )
232
+ @click.option(
233
+ "--filter-non-converted",
234
+ help=(
235
+ "Drop reads with >= --non-converted-threshold retained non-CpG "
236
+ "cytosines, the signature of incomplete bisulfite/EM-seq conversion "
237
+ "(port of nebiolabs/mark-nonconverted-reads). Default: off."
238
+ ),
239
+ is_flag=True,
240
+ )
241
+ @click.option(
242
+ "--non-converted-threshold",
243
+ help=(
244
+ "Minimum count of retained non-CpG cytosines to drop a read "
245
+ "(default = 3, matches NEB mark-nonconverted-reads)."
246
+ ),
247
+ default=3,
248
+ type=int,
249
+ )
250
+ @click.option(
251
+ "--filter-em-overconversion",
252
+ help=(
253
+ "Drop EM-seq reads whose covered CpGs are all called unmethylated "
254
+ "and cover at least --em-overconversion-min-cpgs sites (heuristic "
255
+ "for the fragment-level over-conversion artifact described in "
256
+ "Loyfer et al. bioRxiv 2026.03.24.713040). Default: off."
257
+ ),
258
+ is_flag=True,
259
+ )
260
+ @click.option(
261
+ "--em-overconversion-min-cpgs",
262
+ help=(
263
+ "Minimum covered CpG count required before the EM over-conversion "
264
+ "filter will drop a read (default = 3)."
265
+ ),
266
+ default=3,
267
+ type=int,
268
+ )
232
269
  @click.option("--verbose", help="Verbose output.", is_flag=True)
233
270
  @click.option("--skip-cache", help="De-novo generate CpG sites (slow).", is_flag=True)
234
271
  @click.option(
@@ -263,6 +300,10 @@ def main(
263
300
  expected_chromosomes: str | None,
264
301
  reference_fasta: str | None,
265
302
  quality_limit: int,
303
+ filter_non_converted: bool,
304
+ non_converted_threshold: int,
305
+ filter_em_overconversion: bool,
306
+ em_overconversion_min_cpgs: int,
266
307
  verbose: bool,
267
308
  skip_cache: bool,
268
309
  debug: bool,
@@ -300,6 +341,17 @@ def main(
300
341
  ``--download-reference`` is used.
301
342
  quality_limit: Minimum mapping quality (MAPQ) threshold. Reads below
302
343
  this quality are excluded.
344
+ filter_non_converted: If True, drop reads with at least
345
+ ``non_converted_threshold`` retained non-CpG cytosines —
346
+ indicating incomplete bisulfite/EM-seq conversion.
347
+ non_converted_threshold: Threshold used by the non-converted
348
+ read filter.
349
+ filter_em_overconversion: If True, drop reads whose covered CpGs
350
+ are all called unmethylated and cover at least
351
+ ``em_overconversion_min_cpgs`` sites — heuristic for EM-seq
352
+ fragment-level over-conversion (Loyfer et al. 2026).
353
+ em_overconversion_min_cpgs: Minimum covered CpG count required
354
+ before the over-conversion filter will drop a read.
303
355
  verbose: If True, print detailed progress information.
304
356
  skip_cache: If True, regenerate the CpG site index even if a cache
305
357
  file exists.
@@ -382,6 +434,16 @@ def main(
382
434
  print(f" Reference: {reference_fasta}")
383
435
  print(f" Chromosomes: {chrom_display}")
384
436
  print(f" Quality limit: MAPQ >= {quality_limit}")
437
+ if filter_non_converted:
438
+ print(
439
+ f" Filters: non-converted reads (>= "
440
+ f"{non_converted_threshold} retained non-CpG Cs)"
441
+ )
442
+ if filter_em_overconversion:
443
+ print(
444
+ f" EM over-conversion (all-unmethylated, >= "
445
+ f"{em_overconversion_min_cpgs} CpGs)"
446
+ )
385
447
  if output_dir:
386
448
  print(f" Output dir: {output_dir}")
387
449
  else:
@@ -448,6 +510,10 @@ def main(
448
510
  input_bam=input_bam,
449
511
  genome_methylation_embedding=genome_methylation_embedding,
450
512
  quality_limit=quality_limit,
513
+ filter_non_converted=filter_non_converted,
514
+ non_converted_threshold=non_converted_threshold,
515
+ filter_em_overconversion=filter_em_overconversion,
516
+ em_overconversion_min_cpgs=em_overconversion_min_cpgs,
451
517
  verbose=verbose,
452
518
  debug=debug,
453
519
  )
@@ -476,6 +542,16 @@ def main(
476
542
  "expected_chromosomes": chrom_list,
477
543
  "total_cpg_sites": genome_methylation_embedding.total_cpg_sites,
478
544
  "cpg_index_crc32": cpg_crc32,
545
+ "filters": {
546
+ "non_converted_reads": {
547
+ "enabled": filter_non_converted,
548
+ "threshold": non_converted_threshold,
549
+ },
550
+ "em_overconversion": {
551
+ "enabled": filter_em_overconversion,
552
+ "min_cpgs": em_overconversion_min_cpgs,
553
+ },
554
+ },
479
555
  },
480
556
  )
481
557
  print(f" Output: {output_file}")