bam2tensor 2.5__tar.gz → 2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. {bam2tensor-2.5 → bam2tensor-2.6}/.github/workflows/docs.yml +1 -1
  2. {bam2tensor-2.5 → bam2tensor-2.6}/.github/workflows/release.yml +3 -3
  3. {bam2tensor-2.5 → bam2tensor-2.6}/PKG-INFO +96 -2
  4. {bam2tensor-2.5 → bam2tensor-2.6}/README.md +95 -1
  5. {bam2tensor-2.5 → bam2tensor-2.6}/pyproject.toml +1 -1
  6. {bam2tensor-2.5 → bam2tensor-2.6}/src/bam2tensor/__init__.py +1 -1
  7. {bam2tensor-2.5 → bam2tensor-2.6}/src/bam2tensor/functions.py +42 -18
  8. {bam2tensor-2.5 → bam2tensor-2.6}/tests/test_functions.py +127 -0
  9. {bam2tensor-2.5 → bam2tensor-2.6}/tests/test_inspect.py +1 -1
  10. {bam2tensor-2.5 → bam2tensor-2.6}/uv.lock +131 -115
  11. {bam2tensor-2.5 → bam2tensor-2.6}/.darglint +0 -0
  12. {bam2tensor-2.5 → bam2tensor-2.6}/.editorconfig +0 -0
  13. {bam2tensor-2.5 → bam2tensor-2.6}/.gitattributes +0 -0
  14. {bam2tensor-2.5 → bam2tensor-2.6}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  15. {bam2tensor-2.5 → bam2tensor-2.6}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  16. {bam2tensor-2.5 → bam2tensor-2.6}/.github/actions/setup-env/action.yml +0 -0
  17. {bam2tensor-2.5 → bam2tensor-2.6}/.github/dependabot.yml +0 -0
  18. {bam2tensor-2.5 → bam2tensor-2.6}/.github/labels.yml +0 -0
  19. {bam2tensor-2.5 → bam2tensor-2.6}/.github/release-drafter.yml +0 -0
  20. {bam2tensor-2.5 → bam2tensor-2.6}/.github/workflows/constraints.txt +0 -0
  21. {bam2tensor-2.5 → bam2tensor-2.6}/.github/workflows/labeler.yml +0 -0
  22. {bam2tensor-2.5 → bam2tensor-2.6}/.github/workflows/tests.yml +0 -0
  23. {bam2tensor-2.5 → bam2tensor-2.6}/.gitignore +0 -0
  24. {bam2tensor-2.5 → bam2tensor-2.6}/.pre-commit-config.yaml +0 -0
  25. {bam2tensor-2.5 → bam2tensor-2.6}/CLAUDE.md +0 -0
  26. {bam2tensor-2.5 → bam2tensor-2.6}/CONTRIBUTING.md +0 -0
  27. {bam2tensor-2.5 → bam2tensor-2.6}/LICENSE +0 -0
  28. {bam2tensor-2.5 → bam2tensor-2.6}/SECURITY.md +0 -0
  29. {bam2tensor-2.5 → bam2tensor-2.6}/docs/Makefile +0 -0
  30. {bam2tensor-2.5 → bam2tensor-2.6}/docs/conf.py +0 -0
  31. {bam2tensor-2.5 → bam2tensor-2.6}/docs/contributing.md +0 -0
  32. {bam2tensor-2.5 → bam2tensor-2.6}/docs/index.md +0 -0
  33. {bam2tensor-2.5 → bam2tensor-2.6}/docs/license.md +0 -0
  34. {bam2tensor-2.5 → bam2tensor-2.6}/docs/logo/604669_dna turning into math, computer _xl-1024-v1-0.png +0 -0
  35. {bam2tensor-2.5 → bam2tensor-2.6}/docs/logo/bam2tensor-logo.afdesign +0 -0
  36. {bam2tensor-2.5 → bam2tensor-2.6}/docs/logo/bam2tensor-logo.png +0 -0
  37. {bam2tensor-2.5 → bam2tensor-2.6}/docs/make.bat +0 -0
  38. {bam2tensor-2.5 → bam2tensor-2.6}/docs/nano-banana-overview-shrunk.png +0 -0
  39. {bam2tensor-2.5 → bam2tensor-2.6}/docs/reference.md +0 -0
  40. {bam2tensor-2.5 → bam2tensor-2.6}/docs/templates/package.rst_t +0 -0
  41. {bam2tensor-2.5 → bam2tensor-2.6}/noxfile.py +0 -0
  42. {bam2tensor-2.5 → bam2tensor-2.6}/src/bam2tensor/__main__.py +0 -0
  43. {bam2tensor-2.5 → bam2tensor-2.6}/src/bam2tensor/embedding.py +0 -0
  44. {bam2tensor-2.5 → bam2tensor-2.6}/src/bam2tensor/inspect.py +0 -0
  45. {bam2tensor-2.5 → bam2tensor-2.6}/src/bam2tensor/metadata.py +0 -0
  46. {bam2tensor-2.5 → bam2tensor-2.6}/src/bam2tensor/py.typed +0 -0
  47. {bam2tensor-2.5 → bam2tensor-2.6}/src/bam2tensor/reference.py +0 -0
  48. {bam2tensor-2.5 → bam2tensor-2.6}/tests/__init__.py +0 -0
  49. {bam2tensor-2.5 → bam2tensor-2.6}/tests/test_duplication.py +0 -0
  50. {bam2tensor-2.5 → bam2tensor-2.6}/tests/test_embedding.py +0 -0
  51. {bam2tensor-2.5 → bam2tensor-2.6}/tests/test_fasta.fa +0 -0
  52. {bam2tensor-2.5 → bam2tensor-2.6}/tests/test_filters.py +0 -0
  53. {bam2tensor-2.5 → bam2tensor-2.6}/tests/test_main.py +0 -0
  54. {bam2tensor-2.5 → bam2tensor-2.6}/tests/test_metadata.py +0 -0
  55. {bam2tensor-2.5 → bam2tensor-2.6}/tests/test_reference.py +0 -0
@@ -47,7 +47,7 @@ jobs:
47
47
  uv run sphinx-build docs docs/_build
48
48
 
49
49
  - name: Upload artifact
50
- uses: actions/upload-pages-artifact@v4
50
+ uses: actions/upload-pages-artifact@v5
51
51
  with:
52
52
  path: "docs/_build"
53
53
 
@@ -67,16 +67,16 @@ jobs:
67
67
 
68
68
  - name: Publish package on PyPI
69
69
  if: steps.check-version.outputs.tag || steps.check-tag.outputs.tag
70
- uses: pypa/gh-action-pypi-publish@v1.13.0
70
+ uses: pypa/gh-action-pypi-publish@v1.14.0
71
71
 
72
72
  - name: Publish package on TestPyPI
73
73
  if: (!steps.check-version.outputs.tag && !steps.check-tag.outputs.tag)
74
- uses: pypa/gh-action-pypi-publish@v1.13.0
74
+ uses: pypa/gh-action-pypi-publish@v1.14.0
75
75
  with:
76
76
  repository-url: https://test.pypi.org/legacy/
77
77
 
78
78
  - name: Publish the release notes
79
- uses: release-drafter/release-drafter@v7.1.1
79
+ uses: release-drafter/release-drafter@v7.2.1
80
80
  with:
81
81
  publish: ${{ steps.check-version.outputs.tag != '' || steps.check-tag.outputs.tag != '' }}
82
82
  tag: ${{ steps.check-version.outputs.tag || steps.check-tag.outputs.tag }}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bam2tensor
3
- Version: 2.5
3
+ Version: 2.6
4
4
  Summary: Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation
5
5
  Project-URL: Homepage, https://github.com/mcwdsi/bam2tensor
6
6
  Project-URL: Repository, https://github.com/mcwdsi/bam2tensor
@@ -72,6 +72,7 @@ Description-Content-Type: text/markdown
72
72
  - [Custom Output Directory](#custom-output-directory)
73
73
  - [Using a Custom Genome](#using-a-custom-genome)
74
74
  - [Command-Line Options](#command-line-options)
75
+ - [Filtering Conversion Errors](#filtering-conversion-errors)
75
76
  - [Inspecting Output Files](#inspecting-output-files)
76
77
  - [Output Data Structure](#output-data-structure)
77
78
  - [Per-Read Fragment Length (TLEN)](#per-read-fragment-length-tlen)
@@ -99,6 +100,7 @@ Description-Content-Type: text/markdown
99
100
  - **Caching**: CpG site indexing is cached to accelerate repeated runs on the same genome
100
101
  - **Quality Filtering**: Configurable mapping quality thresholds
101
102
  - **Per-Read Fragment Length**: Stores BAM TLEN (template length) alongside the methylation tensor for joint fragment-methylation analysis
103
+ - **Conversion-Error Filters**: Optional per-read filters for incomplete bisulfite/EM-seq conversion (ported from `nebiolabs/mark-nonconverted-reads`) and EM-seq fragment-level over-conversion (Loyfer et al. 2026)
102
104
 
103
105
  ## Requirements
104
106
 
@@ -256,6 +258,25 @@ Options:
256
258
  determine CpG sites).
257
259
  --quality-limit INTEGER Quality filter for aligned reads (default =
258
260
  20)
261
+ --filter-non-converted Drop reads with >= --non-converted-threshold
262
+ retained non-CpG cytosines, the signature of
263
+ incomplete bisulfite/EM-seq conversion (port
264
+ of nebiolabs/mark-nonconverted-reads).
265
+ Default: off.
266
+ --non-converted-threshold INTEGER
267
+ Minimum count of retained non-CpG cytosines
268
+ to drop a read (default = 3, matches NEB
269
+ mark-nonconverted-reads).
270
+ --filter-em-overconversion Drop EM-seq reads whose covered CpGs are all
271
+ called unmethylated and cover at least --em-
272
+ overconversion-min-cpgs sites (heuristic for
273
+ the fragment-level over-conversion artifact
274
+ described in Loyfer et al. bioRxiv
275
+ 2026.03.24.713040). Default: off.
276
+ --em-overconversion-min-cpgs INTEGER
277
+ Minimum covered CpG count required before
278
+ the EM over-conversion filter will drop a
279
+ read (default = 3).
259
280
  --verbose Verbose output.
260
281
  --skip-cache De-novo generate CpG sites (slow).
261
282
  --debug Debug mode (extensive validity checking +
@@ -281,6 +302,10 @@ Options:
281
302
  | `--expected-chromosomes` | Comma-separated list of chromosome names to process. Chromosomes not in this list are skipped. Defaults to human autosomes + sex chromosomes. |
282
303
  | `--reference-fasta` | Path to the reference genome FASTA file. Must match the genome used for alignment. |
283
304
  | `--quality-limit` | Minimum mapping quality score (MAPQ) for reads to be included. Default is 20. |
305
+ | `--filter-non-converted` | Drop reads with at least `--non-converted-threshold` retained non-CpG cytosines (incomplete conversion). See [Filtering Conversion Errors](#filtering-conversion-errors). |
306
+ | `--non-converted-threshold` | Minimum number of retained non-CpG cytosines at which the non-converted filter drops a read (inclusive). Default is 3. |
307
+ | `--filter-em-overconversion` | Drop EM-seq reads whose covered CpGs are all unmethylated and cover ≥ `--em-overconversion-min-cpgs` sites. See [Filtering Conversion Errors](#filtering-conversion-errors). |
308
+ | `--em-overconversion-min-cpgs` | Minimum covered CpG count before the EM over-conversion filter will drop a read. Default is 3. |
284
309
  | `--verbose` | Enable detailed progress output including per-chromosome progress bars. |
285
310
  | `--skip-cache` | Force regeneration of CpG site cache. Useful if you've modified the reference or chromosome list. |
286
311
  | `--debug` | Enable extensive validation and debug output. Slower but useful for troubleshooting. |
@@ -289,6 +314,66 @@ Options:
289
314
  | `--download-reference` | Download and cache a known reference genome. Choices: `hg38`, `hg19`, `mm10`, `T2T-CHM13`. Replaces `--reference-fasta`. |
290
315
  | `--list-genomes` | List available reference genomes for `--download-reference` and exit. |
291
316
 
317
+ ## Filtering Conversion Errors
318
+
319
+ Bisulfite and EM-seq library preparation can produce two kinds of per-read conversion errors that bias downstream methylation calls. bam2tensor provides two opt-in filters to drop affected reads at extraction time. Both are **default-off**, apply per read, and are recorded in the output `metadata.json` so downstream consumers know which filters were applied.
320
+
321
+ ### `--filter-non-converted` — incomplete conversion
322
+
323
+ Ports the logic of [nebiolabs/mark-nonconverted-reads](https://github.com/nebiolabs/mark-nonconverted-reads). A read is dropped if it carries at least `--non-converted-threshold` (default 3) retained non-CpG cytosines, a signature of incomplete bisulfite or EM-seq conversion.
324
+
325
+ - **Bismark BAMs**: counted directly from the `XM` tag's uppercase `H`/`X`/`U` characters (retained cytosines in CHH/CHG/unknown contexts).
326
+ - **Biscuit / bwameth / gem3 BAMs**: counted by comparing the read to the reference via the `MD` tag (using pysam's `get_aligned_pairs(with_seq=True)`). SNPs — where the read's retained `C` sits over a reference base that isn't `C` — are excluded from the count, matching NEB's reference-validation step. No separate FASTA reload is required.
327
+
328
+ ### `--filter-em-overconversion` — EM-seq fragment-level over-conversion
329
+
330
+ A heuristic inspired by [Loyfer et al. (bioRxiv 2026.03.24.713040)](https://www.biorxiv.org/content/10.64898/2026.03.24.713040v1). That paper shows EM-seq reproducibly produces ~1–2.5% of multi-CpG fragments that appear fully unmethylated across every covered CpG — a fragment-level artifact absent from WGBS and Oxford Nanopore. This filter drops any read whose covered CpGs are **all** called unmethylated *and* cover at least `--em-overconversion-min-cpgs` sites (default 3, the regime where the EM-seq artifact is clearly separable from WGBS in Loyfer et al. Fig. 1C).
331
+
332
+ The filter is a blunt instrument: it will also drop genuinely fully-unmethylated biological fragments at unmethylated markers. Enable it only when your downstream application (e.g., cfDNA deconvolution at constitutively methylated loci) can tolerate that trade-off.
333
+
334
+ ### Usage
335
+
336
+ ```bash
337
+ bam2tensor \
338
+ --input-path sample.bam \
339
+ --reference-fasta GRCh38.fa \
340
+ --genome-name hg38 \
341
+ --filter-non-converted \
342
+ --filter-em-overconversion
343
+ ```
344
+
345
+ Filter parameters and enabled state are written to the output `metadata.json`:
346
+
347
+ ```json
348
+ {
349
+ "filters": {
350
+ "non_converted_reads": {"enabled": true, "threshold": 3},
351
+ "em_overconversion": {"enabled": true, "min_cpgs": 3}
352
+ }
353
+ }
354
+ ```
355
+
356
+ ### Reproducibility note
357
+
358
+ The two filters differ in whether they can be replayed downstream without the source BAM:
359
+
360
+ - **`--filter-em-overconversion` is reproducible from the `.npz` alone.** The heuristic is a pure function of each row's CpG state values. A downstream consumer who receives an unfiltered `.npz` can replay the filter at analysis time:
361
+
362
+ ```python
363
+ import scipy.sparse
364
+ mat = scipy.sparse.load_npz("sample.methylation.npz").tocsr()
365
+ min_cpgs = 3
366
+ kept_rows = []
367
+ for i in range(mat.shape[0]):
368
+ row = mat.getrow(i).toarray().ravel()
369
+ covered = row[(row == 0) | (row == 1)] # drop -1 no-data
370
+ is_overconv = len(covered) >= min_cpgs and (covered == 0).all()
371
+ if not is_overconv:
372
+ kept_rows.append(i)
373
+ ```
374
+
375
+ - **`--filter-non-converted` is *not* reproducible from the `.npz` alone.** It relies on retained non-CpG cytosines (or Bismark's `H`/`X`/`U`), which are never written to the matrix. If you need this filter, apply it at extraction time (or re-run bam2tensor against the original BAM).
376
+
292
377
  ## Inspecting Output Files
293
378
 
294
379
  Use `bam2tensor-inspect` to view a summary of any `.methylation.npz` file without writing Python:
@@ -302,11 +387,15 @@ sample.methylation.npz
302
387
  CpG sites: 28,217,448
303
388
  Data points: 12,847,322 (sparsity: 99.97%)
304
389
  Fragment len: median 167, mean 182, range [50, 600]
390
+ Filters: non-converted (>= 3 non-CpG Cs)
391
+ EM over-conversion (all-unmethylated, >= 3 CpGs)
305
392
  CpG index CRC32: a1b2c3d4
306
- bam2tensor: v2.4
393
+ bam2tensor: v2.5
307
394
  File size: 14.2 MB
308
395
  ```
309
396
 
397
+ When no filters were applied, the line reads `Filters: none`. Files produced by bam2tensor versions older than v2.5 omit the line entirely.
398
+
310
399
  You can pass multiple files at once:
311
400
 
312
401
  ```bash
@@ -368,6 +457,7 @@ Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP arc
368
457
  | `expected_chromosomes` | List of chromosomes included in the column mapping |
369
458
  | `total_cpg_sites` | Total number of CpG columns in the matrix |
370
459
  | `cpg_index_crc32` | CRC32 checksum of the CpG site positions (verifies identical column semantics) |
460
+ | `filters` | Nested dict recording which opt-in conversion-error filters were applied (`non_converted_reads`, `em_overconversion`) and their parameters. See [Filtering Conversion Errors](#filtering-conversion-errors). Added in v2.5. |
371
461
 
372
462
  This metadata is ignored by `scipy.sparse.load_npz`, so existing code continues to work. To read it:
373
463
 
@@ -570,6 +660,10 @@ extract_methylation_data_from_bam(
570
660
  input_bam: str, # Path to BAM file
571
661
  genome_methylation_embedding: GenomeMethylationEmbedding, # Embedding object
572
662
  quality_limit: int = 20, # Minimum MAPQ
663
+ filter_non_converted: bool = False, # Drop reads with retained non-CpG Cs
664
+ non_converted_threshold: int = 3, # Threshold for the above filter
665
+ filter_em_overconversion: bool = False, # Drop EM-seq fragment-level over-conversion reads
666
+ em_overconversion_min_cpgs: int = 3, # Min CpGs before applying the above filter
573
667
  verbose: bool = False, # Enable verbose output
574
668
  debug: bool = False # Enable debug output
575
669
  ) -> ExtractionResult
@@ -39,6 +39,7 @@
39
39
  - [Custom Output Directory](#custom-output-directory)
40
40
  - [Using a Custom Genome](#using-a-custom-genome)
41
41
  - [Command-Line Options](#command-line-options)
42
+ - [Filtering Conversion Errors](#filtering-conversion-errors)
42
43
  - [Inspecting Output Files](#inspecting-output-files)
43
44
  - [Output Data Structure](#output-data-structure)
44
45
  - [Per-Read Fragment Length (TLEN)](#per-read-fragment-length-tlen)
@@ -66,6 +67,7 @@
66
67
  - **Caching**: CpG site indexing is cached to accelerate repeated runs on the same genome
67
68
  - **Quality Filtering**: Configurable mapping quality thresholds
68
69
  - **Per-Read Fragment Length**: Stores BAM TLEN (template length) alongside the methylation tensor for joint fragment-methylation analysis
70
+ - **Conversion-Error Filters**: Optional per-read filters for incomplete bisulfite/EM-seq conversion (ported from `nebiolabs/mark-nonconverted-reads`) and EM-seq fragment-level over-conversion (Loyfer et al. 2026)
69
71
 
70
72
  ## Requirements
71
73
 
@@ -223,6 +225,25 @@ Options:
223
225
  determine CpG sites).
224
226
  --quality-limit INTEGER Quality filter for aligned reads (default =
225
227
  20)
228
+ --filter-non-converted Drop reads with >= --non-converted-threshold
229
+ retained non-CpG cytosines, the signature of
230
+ incomplete bisulfite/EM-seq conversion (port
231
+ of nebiolabs/mark-nonconverted-reads).
232
+ Default: off.
233
+ --non-converted-threshold INTEGER
234
+ Minimum count of retained non-CpG cytosines
235
+ to drop a read (default = 3, matches NEB
236
+ mark-nonconverted-reads).
237
+ --filter-em-overconversion Drop EM-seq reads whose covered CpGs are all
238
+ called unmethylated and cover at least --em-
239
+ overconversion-min-cpgs sites (heuristic for
240
+ the fragment-level over-conversion artifact
241
+ described in Loyfer et al. bioRxiv
242
+ 2026.03.24.713040). Default: off.
243
+ --em-overconversion-min-cpgs INTEGER
244
+ Minimum covered CpG count required before
245
+ the EM over-conversion filter will drop a
246
+ read (default = 3).
226
247
  --verbose Verbose output.
227
248
  --skip-cache De-novo generate CpG sites (slow).
228
249
  --debug Debug mode (extensive validity checking +
@@ -248,6 +269,10 @@ Options:
248
269
  | `--expected-chromosomes` | Comma-separated list of chromosome names to process. Chromosomes not in this list are skipped. Defaults to human autosomes + sex chromosomes. |
249
270
  | `--reference-fasta` | Path to the reference genome FASTA file. Must match the genome used for alignment. |
250
271
  | `--quality-limit` | Minimum mapping quality score (MAPQ) for reads to be included. Default is 20. |
272
+ | `--filter-non-converted` | Drop reads with at least `--non-converted-threshold` retained non-CpG cytosines (incomplete conversion). See [Filtering Conversion Errors](#filtering-conversion-errors). |
273
+ | `--non-converted-threshold` | Minimum number of retained non-CpG cytosines at which the non-converted filter drops a read (inclusive). Default is 3. |
274
+ | `--filter-em-overconversion` | Drop EM-seq reads whose covered CpGs are all unmethylated and cover ≥ `--em-overconversion-min-cpgs` sites. See [Filtering Conversion Errors](#filtering-conversion-errors). |
275
+ | `--em-overconversion-min-cpgs` | Minimum covered CpG count before the EM over-conversion filter will drop a read. Default is 3. |
251
276
  | `--verbose` | Enable detailed progress output including per-chromosome progress bars. |
252
277
  | `--skip-cache` | Force regeneration of CpG site cache. Useful if you've modified the reference or chromosome list. |
253
278
  | `--debug` | Enable extensive validation and debug output. Slower but useful for troubleshooting. |
@@ -256,6 +281,66 @@ Options:
256
281
  | `--download-reference` | Download and cache a known reference genome. Choices: `hg38`, `hg19`, `mm10`, `T2T-CHM13`. Replaces `--reference-fasta`. |
257
282
  | `--list-genomes` | List available reference genomes for `--download-reference` and exit. |
258
283
 
284
+ ## Filtering Conversion Errors
285
+
286
+ Bisulfite and EM-seq library preparation can produce two kinds of per-read conversion errors that bias downstream methylation calls. bam2tensor provides two opt-in filters to drop affected reads at extraction time. Both are **default-off**, apply per read, and are recorded in the output `metadata.json` so downstream consumers know which filters were applied.
287
+
288
+ ### `--filter-non-converted` — incomplete conversion
289
+
290
+ Ports the logic of [nebiolabs/mark-nonconverted-reads](https://github.com/nebiolabs/mark-nonconverted-reads). A read is dropped if it carries at least `--non-converted-threshold` (default 3) retained non-CpG cytosines, a signature of incomplete bisulfite or EM-seq conversion.
291
+
292
+ - **Bismark BAMs**: counted directly from the `XM` tag's uppercase `H`/`X`/`U` characters (retained cytosines in CHH/CHG/unknown contexts).
293
+ - **Biscuit / bwameth / gem3 BAMs**: counted by comparing the read to the reference via the `MD` tag (using pysam's `get_aligned_pairs(with_seq=True)`). SNPs — where the read's retained `C` sits over a reference base that isn't `C` — are excluded from the count, matching NEB's reference-validation step. No separate FASTA reload is required.
294
+
295
+ ### `--filter-em-overconversion` — EM-seq fragment-level over-conversion
296
+
297
+ A heuristic inspired by [Loyfer et al. (bioRxiv 2026.03.24.713040)](https://www.biorxiv.org/content/10.64898/2026.03.24.713040v1). That paper shows EM-seq reproducibly produces ~1–2.5% of multi-CpG fragments that appear fully unmethylated across every covered CpG — a fragment-level artifact absent from WGBS and Oxford Nanopore. This filter drops any read whose covered CpGs are **all** called unmethylated *and* cover at least `--em-overconversion-min-cpgs` sites (default 3, the regime where the EM-seq artifact is clearly separable from WGBS in Loyfer et al. Fig. 1C).
298
+
299
+ The filter is a blunt instrument: it will also drop genuinely fully-unmethylated biological fragments at unmethylated markers. Enable it only when your downstream application (e.g., cfDNA deconvolution at constitutively methylated loci) can tolerate that trade-off.
300
+
301
+ ### Usage
302
+
303
+ ```bash
304
+ bam2tensor \
305
+ --input-path sample.bam \
306
+ --reference-fasta GRCh38.fa \
307
+ --genome-name hg38 \
308
+ --filter-non-converted \
309
+ --filter-em-overconversion
310
+ ```
311
+
312
+ Filter parameters and enabled state are written to the output `metadata.json`:
313
+
314
+ ```json
315
+ {
316
+ "filters": {
317
+ "non_converted_reads": {"enabled": true, "threshold": 3},
318
+ "em_overconversion": {"enabled": true, "min_cpgs": 3}
319
+ }
320
+ }
321
+ ```
322
+
323
+ ### Reproducibility note
324
+
325
+ The two filters differ in whether they can be replayed downstream without the source BAM:
326
+
327
+ - **`--filter-em-overconversion` is reproducible from the `.npz` alone.** The heuristic is a pure function of each row's CpG state values. A downstream consumer who receives an unfiltered `.npz` can replay the filter at analysis time:
328
+
329
+ ```python
330
+ import scipy.sparse
331
+ mat = scipy.sparse.load_npz("sample.methylation.npz").tocsr()
332
+ min_cpgs = 3
333
+ kept_rows = []
334
+ for i in range(mat.shape[0]):
335
+ row = mat.getrow(i).toarray().ravel()
336
+ covered = row[(row == 0) | (row == 1)] # drop -1 no-data
337
+ is_overconv = len(covered) >= min_cpgs and (covered == 0).all()
338
+ if not is_overconv:
339
+ kept_rows.append(i)
340
+ ```
341
+
342
+ - **`--filter-non-converted` is *not* reproducible from the `.npz` alone.** It relies on retained non-CpG cytosines (or Bismark's `H`/`X`/`U`), which are never written to the matrix. If you need this filter, apply it at extraction time (or re-run bam2tensor against the original BAM).
343
+
259
344
  ## Inspecting Output Files
260
345
 
261
346
  Use `bam2tensor-inspect` to view a summary of any `.methylation.npz` file without writing Python:
@@ -269,11 +354,15 @@ sample.methylation.npz
269
354
  CpG sites: 28,217,448
270
355
  Data points: 12,847,322 (sparsity: 99.97%)
271
356
  Fragment len: median 167, mean 182, range [50, 600]
357
+ Filters: non-converted (>= 3 non-CpG Cs)
358
+ EM over-conversion (all-unmethylated, >= 3 CpGs)
272
359
  CpG index CRC32: a1b2c3d4
273
- bam2tensor: v2.4
360
+ bam2tensor: v2.5
274
361
  File size: 14.2 MB
275
362
  ```
276
363
 
364
+ When no filters were applied, the line reads `Filters: none`. Files produced by bam2tensor versions older than v2.5 omit the line entirely.
365
+
277
366
  You can pass multiple files at once:
278
367
 
279
368
  ```bash
@@ -335,6 +424,7 @@ Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP arc
335
424
  | `expected_chromosomes` | List of chromosomes included in the column mapping |
336
425
  | `total_cpg_sites` | Total number of CpG columns in the matrix |
337
426
  | `cpg_index_crc32` | CRC32 checksum of the CpG site positions (verifies identical column semantics) |
427
+ | `filters` | Nested dict recording which opt-in conversion-error filters were applied (`non_converted_reads`, `em_overconversion`) and their parameters. See [Filtering Conversion Errors](#filtering-conversion-errors). Added in v2.5. |
338
428
 
339
429
  This metadata is ignored by `scipy.sparse.load_npz`, so existing code continues to work. To read it:
340
430
 
@@ -537,6 +627,10 @@ extract_methylation_data_from_bam(
537
627
  input_bam: str, # Path to BAM file
538
628
  genome_methylation_embedding: GenomeMethylationEmbedding, # Embedding object
539
629
  quality_limit: int = 20, # Minimum MAPQ
630
+ filter_non_converted: bool = False, # Drop reads with retained non-CpG Cs
631
+ non_converted_threshold: int = 3, # Threshold for the above filter
632
+ filter_em_overconversion: bool = False, # Drop EM-seq fragment-level over-conversion reads
633
+ em_overconversion_min_cpgs: int = 3, # Min CpGs before applying the above filter
540
634
  verbose: bool = False, # Enable verbose output
541
635
  debug: bool = False # Enable debug output
542
636
  ) -> ExtractionResult
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "bam2tensor"
3
- version = "2.5"
3
+ version = "2.6"
4
4
  description = "Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation"
5
5
  authors = [{ name = "Nick Semenkovich", email = "semenko@alum.mit.edu" }]
6
6
  license = "MIT"
@@ -50,4 +50,4 @@ See Also:
50
50
  - https://mcwdsi.github.io/bam2tensor for full documentation
51
51
  """
52
52
 
53
- __version__ = "2.5"
53
+ __version__ = "2.6"
@@ -706,25 +706,36 @@ def extract_methylation_data_from_bam(
706
706
 
707
707
  # get_aligned_pairs returns a list of tuples of (read_pos, ref_pos)
708
708
  # We filter this to only include the specific CpG sites from above
709
+ aligned_pairs = aligned_segment.get_aligned_pairs(matches_only=True)
709
710
  this_segment_cpgs = [
710
- e
711
- for e in aligned_segment.get_aligned_pairs(matches_only=True)
712
- if e[1] + 1 in cpgs_within_read_set
711
+ e for e in aligned_pairs if e[1] + 1 in cpgs_within_read_set
713
712
  ]
714
713
 
715
714
  # If no CpGs covered (after filtering for matches only), skip
716
715
  if not this_segment_cpgs:
717
716
  continue
718
717
 
719
- # Ok we're on the same strand as the methylation (right?)
720
- # Let's compare the possible CpGs in this interval to the reference and note status
721
- # A methylated C will be *unchanged* and read as C (pair G)
722
- # An unmethylated C will be *changed* and read as T (pair A)
723
- for query_pos, ref_pos in this_segment_cpgs:
724
- query_base = aligned_segment.query_sequence[query_pos] # type: ignore
725
- # query_base_raw = aligned_segment.get_forward_sequence()[query_pos] # raw off sequencer
726
- # query_base_no_offset = aligned_segment.query_alignment_sequence[query_pos] # this needs to be offset by the soft clip
718
+ # OT (forward parent): methylation-informative base sits on the
719
+ # top-strand C at ref_pos. BAM SEQ is reference-oriented, so
720
+ # C = methylated, T = unmethylated.
721
+ # OB (reverse parent): the original bottom-strand C lives at
722
+ # ref_pos + 1 (the G of the top-strand CG). After the aligner
723
+ # reverse-complements into reference orientation for BAM
724
+ # storage, that base reads G = methylated, A = unmethylated.
725
+ # At ref_pos itself, BAM always shows C (the unaffected
726
+ # bottom-strand G reverse-complemented), which is why reading
727
+ # ref_pos on OB reads collapses every CpG to "methylated".
728
+ query_sequence = aligned_segment.query_sequence
729
+ if bisulfite_parent_strand_is_reverse:
730
+ methylated_base, unmethylated_base = "G", "A"
731
+ # Indels at the CpG boundary mean ref_pos + 1 isn't always
732
+ # query_pos + 1 — go through a ref -> query map.
733
+ ref_to_query: dict[int, int] = {ref: q for q, ref in aligned_pairs}
734
+ else:
735
+ methylated_base, unmethylated_base = "C", "T"
736
+ ref_to_query = {}
727
737
 
738
+ for query_pos, ref_pos in this_segment_cpgs:
728
739
  read_cpg_cols.append(
729
740
  genome_methylation_embedding.genomic_position_to_embedding(
730
741
  chrom,
@@ -732,21 +743,34 @@ def extract_methylation_data_from_bam(
732
743
  )
733
744
  )
734
745
 
735
- if query_base == "C":
736
- # Methylated
746
+ if bisulfite_parent_strand_is_reverse:
747
+ target_query_pos = ref_to_query.get(ref_pos + 1)
748
+ if target_query_pos is None:
749
+ read_cpg_data.append(-1)
750
+ if debug:
751
+ print(f"\t{query_pos} {ref_pos} [Indel at OB target]")
752
+ continue
753
+ query_base = query_sequence[target_query_pos] # type: ignore[index]
754
+ else:
755
+ query_base = query_sequence[query_pos] # type: ignore[index]
756
+
757
+ if query_base == methylated_base:
737
758
  read_cpg_data.append(1)
738
759
  if debug:
739
- print(f"\t{query_pos} {ref_pos} C->{query_base} [Methylated]")
740
- elif query_base == "T":
760
+ print(
761
+ f"\t{query_pos} {ref_pos} {methylated_base}->{query_base} [Methylated]"
762
+ )
763
+ elif query_base == unmethylated_base:
741
764
  read_cpg_data.append(0)
742
- # Unmethylated
743
765
  if debug:
744
- print(f"\t{query_pos} {ref_pos} C->{query_base} [Unmethylated]")
766
+ print(
767
+ f"\t{query_pos} {ref_pos} {methylated_base}->{query_base} [Unmethylated]"
768
+ )
745
769
  else:
746
770
  read_cpg_data.append(-1)
747
771
  if debug:
748
772
  print(
749
- f"\t{query_pos} {ref_pos} C->{query_base} [Unknown! SNV? Indel?]"
773
+ f"\t{query_pos} {ref_pos} {methylated_base}->{query_base} [Unknown! SNV? Indel?]"
750
774
  )
751
775
 
752
776
  if filter_em_overconversion and is_em_overconversion_read(
@@ -1074,6 +1074,133 @@ def test_biscuit_debug_mode_ct_bases(tmp_path):
1074
1074
  assert result.matrix.shape[0] == 1
1075
1075
 
1076
1076
 
1077
+ def test_biscuit_ob_strand_methylation_extraction(tmp_path):
1078
+ """Biscuit/bwameth OB-strand (YD=r, is_reverse=True) reads must read the
1079
+ methylation-informative base at ref_pos+1 (G=methylated, A=unmethylated),
1080
+ not ref_pos (which is always C in BAM SEQ regardless of methylation state).
1081
+
1082
+ Regression for the bug where OB reads were extracted with C/T logic at
1083
+ ref_pos and thus scored as universally methylated.
1084
+ """
1085
+ fasta_path = tmp_path / "ref.fa"
1086
+ # CpGs at 1-based positions 10, 21 (top-strand C at 0-based 9, 20; G at 10, 21).
1087
+ seq = "N" * 9 + "CG" + "N" * 9 + "CG" + "N" * 128
1088
+ with open(fasta_path, "w") as f:
1089
+ f.write(">chr1\n" + seq + "\n")
1090
+
1091
+ emb = embedding.GenomeMethylationEmbedding(
1092
+ "test_biscuit_ob",
1093
+ expected_chromosomes=["chr1"],
1094
+ fasta_source=str(fasta_path),
1095
+ skip_cache=True,
1096
+ )
1097
+
1098
+ bam_path = tmp_path / "test.bam"
1099
+ header = {"HD": {"VN": "1.0"}, "SQ": [{"LN": len(seq), "SN": "chr1"}]}
1100
+ # OB read: BAM SEQ is reference-oriented. The C of each top-strand CG is
1101
+ # always C in BAM (bottom-strand G reverse-complemented). The G of each
1102
+ # top-strand CG is what carries the methylation signal: G=methylated,
1103
+ # A=unmethylated.
1104
+ read_seq = list("N" * len(seq))
1105
+ read_seq[9] = "C" # top-strand C of CpG#1 (always C in BAM for OB)
1106
+ read_seq[10] = "G" # methylated → G at ref_pos+1
1107
+ read_seq[20] = "C" # top-strand C of CpG#2 (always C in BAM for OB)
1108
+ read_seq[21] = "A" # unmethylated → A at ref_pos+1
1109
+ with pysam.AlignmentFile(bam_path, "wb", header=header) as out_bam:
1110
+ a = pysam.AlignedSegment()
1111
+ a.query_name = "ob_read"
1112
+ a.query_sequence = "".join(read_seq)
1113
+ a.flag = 0x10 # reverse-mapped
1114
+ a.reference_id = 0
1115
+ a.reference_start = 0
1116
+ a.mapping_quality = 60
1117
+ a.cigartuples = [(0, len(seq))]
1118
+ a.set_tag("MD", str(len(seq)))
1119
+ a.set_tag("YD", "r") # OB / reverse parent strand
1120
+ out_bam.write(a)
1121
+ pysam.index(str(bam_path))
1122
+
1123
+ result = functions.extract_methylation_data_from_bam(
1124
+ input_bam=str(bam_path),
1125
+ genome_methylation_embedding=emb,
1126
+ )
1127
+ assert result.matrix.shape[0] == 1
1128
+ assert result.matrix.nnz == 2
1129
+ data = sorted(result.matrix.data)
1130
+ assert data == [0, 1], (
1131
+ f"Expected one methylated (1) and one unmethylated (0) call, got {data}. "
1132
+ "If this is all 1s, the OB-strand base lookup regression has returned."
1133
+ )
1134
+
1135
+
1136
+ def test_biscuit_ot_and_ob_share_cpg_columns(tmp_path):
1137
+ """OT and OB reads at the same CpG must land in the same embedding column
1138
+ (canonical CpG site = top-strand C, ref_pos+1 in 1-based coordinates).
1139
+ """
1140
+ fasta_path = tmp_path / "ref.fa"
1141
+ seq = "N" * 9 + "CG" + "N" * 9 + "CG" + "N" * 128
1142
+ with open(fasta_path, "w") as f:
1143
+ f.write(">chr1\n" + seq + "\n")
1144
+
1145
+ emb = embedding.GenomeMethylationEmbedding(
1146
+ "test_biscuit_ot_ob_columns",
1147
+ expected_chromosomes=["chr1"],
1148
+ fasta_source=str(fasta_path),
1149
+ skip_cache=True,
1150
+ )
1151
+
1152
+ bam_path = tmp_path / "test.bam"
1153
+ header = {"HD": {"VN": "1.0"}, "SQ": [{"LN": len(seq), "SN": "chr1"}]}
1154
+ # OT read: C at top-strand C positions = methylated at both CpGs.
1155
+ ot_seq = list("N" * len(seq))
1156
+ ot_seq[9] = "C"
1157
+ ot_seq[20] = "C"
1158
+ # OB read (BAM in reference orientation): G at top-strand G positions
1159
+ # = methylated at both CpGs.
1160
+ ob_seq = list("N" * len(seq))
1161
+ ob_seq[9] = "C"
1162
+ ob_seq[10] = "G"
1163
+ ob_seq[20] = "C"
1164
+ ob_seq[21] = "G"
1165
+ with pysam.AlignmentFile(bam_path, "wb", header=header) as out_bam:
1166
+ a = pysam.AlignedSegment()
1167
+ a.query_name = "ot_read"
1168
+ a.query_sequence = "".join(ot_seq)
1169
+ a.flag = 0
1170
+ a.reference_id = 0
1171
+ a.reference_start = 0
1172
+ a.mapping_quality = 60
1173
+ a.cigartuples = [(0, len(seq))]
1174
+ a.set_tag("MD", str(len(seq)))
1175
+ a.set_tag("YD", "f")
1176
+ out_bam.write(a)
1177
+ b = pysam.AlignedSegment()
1178
+ b.query_name = "ob_read"
1179
+ b.query_sequence = "".join(ob_seq)
1180
+ b.flag = 0x10
1181
+ b.reference_id = 0
1182
+ b.reference_start = 0
1183
+ b.mapping_quality = 60
1184
+ b.cigartuples = [(0, len(seq))]
1185
+ b.set_tag("MD", str(len(seq)))
1186
+ b.set_tag("YD", "r")
1187
+ out_bam.write(b)
1188
+ pysam.index(str(bam_path))
1189
+
1190
+ result = functions.extract_methylation_data_from_bam(
1191
+ input_bam=str(bam_path),
1192
+ genome_methylation_embedding=emb,
1193
+ )
1194
+ assert result.matrix.shape[0] == 2
1195
+ # Both reads call both CpGs methylated, so we expect two reads × two CpGs
1196
+ # in the same two columns, all with value 1.
1197
+ coo = result.matrix.tocoo()
1198
+ ot_cols = sorted(int(c) for r, c in zip(coo.row, coo.col) if r == 0)
1199
+ ob_cols = sorted(int(c) for r, c in zip(coo.row, coo.col) if r == 1)
1200
+ assert ot_cols == ob_cols, f"OT and OB columns diverged: OT={ot_cols} OB={ob_cols}"
1201
+ assert list(result.matrix.data) == [1, 1, 1, 1]
1202
+
1203
+
1077
1204
  # ======================================================================
1078
1205
  # XB tag (gem3/Blueprint) extraction tests
1079
1206
  # ======================================================================
@@ -122,7 +122,7 @@ def test_inspect_end_to_end(tmp_path) -> None:
122
122
  assert result.exit_code == 0
123
123
  assert "test" in result.output # genome_name
124
124
  assert "CpG index CRC32:" in result.output
125
- assert "v2.5" in result.output
125
+ assert "v2.6" in result.output
126
126
 
127
127
 
128
128
  def test_format_size_bytes() -> None: