bam2tensor 2.5__tar.gz → 2.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. {bam2tensor-2.5 → bam2tensor-2.7}/.github/workflows/docs.yml +1 -1
  2. {bam2tensor-2.5 → bam2tensor-2.7}/.github/workflows/release.yml +3 -3
  3. {bam2tensor-2.5 → bam2tensor-2.7}/PKG-INFO +96 -2
  4. {bam2tensor-2.5 → bam2tensor-2.7}/README.md +95 -1
  5. {bam2tensor-2.5 → bam2tensor-2.7}/pyproject.toml +1 -1
  6. {bam2tensor-2.5 → bam2tensor-2.7}/src/bam2tensor/__init__.py +1 -1
  7. {bam2tensor-2.5 → bam2tensor-2.7}/src/bam2tensor/embedding.py +70 -23
  8. {bam2tensor-2.5 → bam2tensor-2.7}/src/bam2tensor/functions.py +42 -18
  9. {bam2tensor-2.5 → bam2tensor-2.7}/src/bam2tensor/metadata.py +29 -2
  10. {bam2tensor-2.5 → bam2tensor-2.7}/tests/test_embedding.py +221 -1
  11. {bam2tensor-2.5 → bam2tensor-2.7}/tests/test_functions.py +127 -0
  12. {bam2tensor-2.5 → bam2tensor-2.7}/tests/test_inspect.py +2 -2
  13. {bam2tensor-2.5 → bam2tensor-2.7}/uv.lock +131 -115
  14. {bam2tensor-2.5 → bam2tensor-2.7}/.darglint +0 -0
  15. {bam2tensor-2.5 → bam2tensor-2.7}/.editorconfig +0 -0
  16. {bam2tensor-2.5 → bam2tensor-2.7}/.gitattributes +0 -0
  17. {bam2tensor-2.5 → bam2tensor-2.7}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  18. {bam2tensor-2.5 → bam2tensor-2.7}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  19. {bam2tensor-2.5 → bam2tensor-2.7}/.github/actions/setup-env/action.yml +0 -0
  20. {bam2tensor-2.5 → bam2tensor-2.7}/.github/dependabot.yml +0 -0
  21. {bam2tensor-2.5 → bam2tensor-2.7}/.github/labels.yml +0 -0
  22. {bam2tensor-2.5 → bam2tensor-2.7}/.github/release-drafter.yml +0 -0
  23. {bam2tensor-2.5 → bam2tensor-2.7}/.github/workflows/constraints.txt +0 -0
  24. {bam2tensor-2.5 → bam2tensor-2.7}/.github/workflows/labeler.yml +0 -0
  25. {bam2tensor-2.5 → bam2tensor-2.7}/.github/workflows/tests.yml +0 -0
  26. {bam2tensor-2.5 → bam2tensor-2.7}/.gitignore +0 -0
  27. {bam2tensor-2.5 → bam2tensor-2.7}/.pre-commit-config.yaml +0 -0
  28. {bam2tensor-2.5 → bam2tensor-2.7}/CLAUDE.md +0 -0
  29. {bam2tensor-2.5 → bam2tensor-2.7}/CONTRIBUTING.md +0 -0
  30. {bam2tensor-2.5 → bam2tensor-2.7}/LICENSE +0 -0
  31. {bam2tensor-2.5 → bam2tensor-2.7}/SECURITY.md +0 -0
  32. {bam2tensor-2.5 → bam2tensor-2.7}/docs/Makefile +0 -0
  33. {bam2tensor-2.5 → bam2tensor-2.7}/docs/conf.py +0 -0
  34. {bam2tensor-2.5 → bam2tensor-2.7}/docs/contributing.md +0 -0
  35. {bam2tensor-2.5 → bam2tensor-2.7}/docs/index.md +0 -0
  36. {bam2tensor-2.5 → bam2tensor-2.7}/docs/license.md +0 -0
  37. {bam2tensor-2.5 → bam2tensor-2.7}/docs/logo/604669_dna turning into math, computer _xl-1024-v1-0.png +0 -0
  38. {bam2tensor-2.5 → bam2tensor-2.7}/docs/logo/bam2tensor-logo.afdesign +0 -0
  39. {bam2tensor-2.5 → bam2tensor-2.7}/docs/logo/bam2tensor-logo.png +0 -0
  40. {bam2tensor-2.5 → bam2tensor-2.7}/docs/make.bat +0 -0
  41. {bam2tensor-2.5 → bam2tensor-2.7}/docs/nano-banana-overview-shrunk.png +0 -0
  42. {bam2tensor-2.5 → bam2tensor-2.7}/docs/reference.md +0 -0
  43. {bam2tensor-2.5 → bam2tensor-2.7}/docs/templates/package.rst_t +0 -0
  44. {bam2tensor-2.5 → bam2tensor-2.7}/noxfile.py +0 -0
  45. {bam2tensor-2.5 → bam2tensor-2.7}/src/bam2tensor/__main__.py +0 -0
  46. {bam2tensor-2.5 → bam2tensor-2.7}/src/bam2tensor/inspect.py +0 -0
  47. {bam2tensor-2.5 → bam2tensor-2.7}/src/bam2tensor/py.typed +0 -0
  48. {bam2tensor-2.5 → bam2tensor-2.7}/src/bam2tensor/reference.py +0 -0
  49. {bam2tensor-2.5 → bam2tensor-2.7}/tests/__init__.py +0 -0
  50. {bam2tensor-2.5 → bam2tensor-2.7}/tests/test_duplication.py +0 -0
  51. {bam2tensor-2.5 → bam2tensor-2.7}/tests/test_fasta.fa +0 -0
  52. {bam2tensor-2.5 → bam2tensor-2.7}/tests/test_filters.py +0 -0
  53. {bam2tensor-2.5 → bam2tensor-2.7}/tests/test_main.py +0 -0
  54. {bam2tensor-2.5 → bam2tensor-2.7}/tests/test_metadata.py +0 -0
  55. {bam2tensor-2.5 → bam2tensor-2.7}/tests/test_reference.py +0 -0
@@ -47,7 +47,7 @@ jobs:
47
47
  uv run sphinx-build docs docs/_build
48
48
 
49
49
  - name: Upload artifact
50
- uses: actions/upload-pages-artifact@v4
50
+ uses: actions/upload-pages-artifact@v5
51
51
  with:
52
52
  path: "docs/_build"
53
53
 
@@ -67,16 +67,16 @@ jobs:
67
67
 
68
68
  - name: Publish package on PyPI
69
69
  if: steps.check-version.outputs.tag || steps.check-tag.outputs.tag
70
- uses: pypa/gh-action-pypi-publish@v1.13.0
70
+ uses: pypa/gh-action-pypi-publish@v1.14.0
71
71
 
72
72
  - name: Publish package on TestPyPI
73
73
  if: (!steps.check-version.outputs.tag && !steps.check-tag.outputs.tag)
74
- uses: pypa/gh-action-pypi-publish@v1.13.0
74
+ uses: pypa/gh-action-pypi-publish@v1.14.0
75
75
  with:
76
76
  repository-url: https://test.pypi.org/legacy/
77
77
 
78
78
  - name: Publish the release notes
79
- uses: release-drafter/release-drafter@v7.1.1
79
+ uses: release-drafter/release-drafter@v7.2.1
80
80
  with:
81
81
  publish: ${{ steps.check-version.outputs.tag != '' || steps.check-tag.outputs.tag != '' }}
82
82
  tag: ${{ steps.check-version.outputs.tag || steps.check-tag.outputs.tag }}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bam2tensor
3
- Version: 2.5
3
+ Version: 2.7
4
4
  Summary: Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation
5
5
  Project-URL: Homepage, https://github.com/mcwdsi/bam2tensor
6
6
  Project-URL: Repository, https://github.com/mcwdsi/bam2tensor
@@ -72,6 +72,7 @@ Description-Content-Type: text/markdown
72
72
  - [Custom Output Directory](#custom-output-directory)
73
73
  - [Using a Custom Genome](#using-a-custom-genome)
74
74
  - [Command-Line Options](#command-line-options)
75
+ - [Filtering Conversion Errors](#filtering-conversion-errors)
75
76
  - [Inspecting Output Files](#inspecting-output-files)
76
77
  - [Output Data Structure](#output-data-structure)
77
78
  - [Per-Read Fragment Length (TLEN)](#per-read-fragment-length-tlen)
@@ -99,6 +100,7 @@ Description-Content-Type: text/markdown
99
100
  - **Caching**: CpG site indexing is cached to accelerate repeated runs on the same genome
100
101
  - **Quality Filtering**: Configurable mapping quality thresholds
101
102
  - **Per-Read Fragment Length**: Stores BAM TLEN (template length) alongside the methylation tensor for joint fragment-methylation analysis
103
+ - **Conversion-Error Filters**: Optional per-read filters for incomplete bisulfite/EM-seq conversion (ported from `nebiolabs/mark-nonconverted-reads`) and EM-seq fragment-level over-conversion (Loyfer et al. 2026)
102
104
 
103
105
  ## Requirements
104
106
 
@@ -256,6 +258,25 @@ Options:
256
258
  determine CpG sites).
257
259
  --quality-limit INTEGER Quality filter for aligned reads (default =
258
260
  20)
261
+ --filter-non-converted Drop reads with >= --non-converted-threshold
262
+ retained non-CpG cytosines, the signature of
263
+ incomplete bisulfite/EM-seq conversion (port
264
+ of nebiolabs/mark-nonconverted-reads).
265
+ Default: off.
266
+ --non-converted-threshold INTEGER
267
+ Minimum count of retained non-CpG cytosines
268
+ to drop a read (default = 3, matches NEB
269
+ mark-nonconverted-reads).
270
+ --filter-em-overconversion Drop EM-seq reads whose covered CpGs are all
271
+ called unmethylated and cover at least --em-
272
+ overconversion-min-cpgs sites (heuristic for
273
+ the fragment-level over-conversion artifact
274
+ described in Loyfer et al. bioRxiv
275
+ 2026.03.24.713040). Default: off.
276
+ --em-overconversion-min-cpgs INTEGER
277
+ Minimum covered CpG count required before
278
+ the EM over-conversion filter will drop a
279
+ read (default = 3).
259
280
  --verbose Verbose output.
260
281
  --skip-cache De-novo generate CpG sites (slow).
261
282
  --debug Debug mode (extensive validity checking +
@@ -281,6 +302,10 @@ Options:
281
302
  | `--expected-chromosomes` | Comma-separated list of chromosome names to process. Chromosomes not in this list are skipped. Defaults to human autosomes + sex chromosomes. |
282
303
  | `--reference-fasta` | Path to the reference genome FASTA file. Must match the genome used for alignment. |
283
304
  | `--quality-limit` | Minimum mapping quality score (MAPQ) for reads to be included. Default is 20. |
305
+ | `--filter-non-converted` | Drop reads with at least `--non-converted-threshold` retained non-CpG cytosines (incomplete conversion). See [Filtering Conversion Errors](#filtering-conversion-errors). |
306
+ | `--non-converted-threshold` | Minimum number of retained non-CpG cytosines at which `--filter-non-converted` drops a read. Default is 3. |
307
+ | `--filter-em-overconversion` | Drop EM-seq reads whose covered CpGs are all unmethylated and cover ≥ `--em-overconversion-min-cpgs` sites. See [Filtering Conversion Errors](#filtering-conversion-errors). |
308
+ | `--em-overconversion-min-cpgs` | Minimum covered CpG count before the EM over-conversion filter will drop a read. Default is 3. |
284
309
  | `--verbose` | Enable detailed progress output including per-chromosome progress bars. |
285
310
  | `--skip-cache` | Force regeneration of CpG site cache. Useful if you've modified the reference or chromosome list. |
286
311
  | `--debug` | Enable extensive validation and debug output. Slower but useful for troubleshooting. |
@@ -289,6 +314,66 @@ Options:
289
314
  | `--download-reference` | Download and cache a known reference genome. Choices: `hg38`, `hg19`, `mm10`, `T2T-CHM13`. Replaces `--reference-fasta`. |
290
315
  | `--list-genomes` | List available reference genomes for `--download-reference` and exit. |
291
316
 
317
+ ## Filtering Conversion Errors
318
+
319
+ Bisulfite and EM-seq library preparation can produce two kinds of per-read conversion errors that bias downstream methylation calls. bam2tensor provides two opt-in filters to drop affected reads at extraction time. Both are **default-off**, apply per read, and are recorded in the output `metadata.json` so downstream consumers know which filters were applied.
320
+
321
+ ### `--filter-non-converted` — incomplete conversion
322
+
323
+ Ports the logic of [nebiolabs/mark-nonconverted-reads](https://github.com/nebiolabs/mark-nonconverted-reads). A read is dropped if it carries at least `--non-converted-threshold` (default 3) retained non-CpG cytosines, a signature of incomplete bisulfite or EM-seq conversion.
324
+
325
+ - **Bismark BAMs**: counted directly from the `XM` tag's uppercase `H`/`X`/`U` characters (retained cytosines in CHH/CHG/unknown contexts).
326
+ - **Biscuit / bwameth / gem3 BAMs**: counted by comparing the read to the reference via the `MD` tag (using pysam's `get_aligned_pairs(with_seq=True)`). SNPs — where the read's retained `C` sits over a reference base that isn't `C` — are excluded from the count, matching NEB's reference-validation step. No separate FASTA reload is required.
327
+
328
+ ### `--filter-em-overconversion` — EM-seq fragment-level over-conversion
329
+
330
+ A heuristic inspired by [Loyfer et al. (bioRxiv 2026.03.24.713040)](https://www.biorxiv.org/content/10.64898/2026.03.24.713040v1). That paper shows EM-seq reproducibly produces ~1–2.5% of multi-CpG fragments that appear fully unmethylated across every covered CpG — a fragment-level artifact absent from WGBS and Oxford Nanopore. This filter drops any read whose covered CpGs are **all** called unmethylated *and* cover at least `--em-overconversion-min-cpgs` sites (default 3, the regime where the EM-seq artifact is clearly separable from WGBS in Loyfer et al. Fig. 1C).
331
+
332
+ The filter is a blunt instrument: it will also drop genuinely fully-unmethylated biological fragments at unmethylated markers. Enable it only when your downstream application (e.g., cfDNA deconvolution at constitutively methylated loci) can tolerate that trade-off.
333
+
334
+ ### Usage
335
+
336
+ ```bash
337
+ bam2tensor \
338
+ --input-path sample.bam \
339
+ --reference-fasta GRCh38.fa \
340
+ --genome-name hg38 \
341
+ --filter-non-converted \
342
+ --filter-em-overconversion
343
+ ```
344
+
345
+ Filter parameters and enabled state are written to the output `metadata.json`:
346
+
347
+ ```json
348
+ {
349
+ "filters": {
350
+ "non_converted_reads": {"enabled": true, "threshold": 3},
351
+ "em_overconversion": {"enabled": true, "min_cpgs": 3}
352
+ }
353
+ }
354
+ ```
355
+
356
+ ### Reproducibility note
357
+
358
+ The two filters differ in whether they can be replayed downstream without the source BAM:
359
+
360
+ - **`--filter-em-overconversion` is reproducible from the `.npz` alone.** The heuristic is a pure function of each row's CpG state values. A downstream consumer who receives an unfiltered `.npz` can replay the filter at analysis time:
361
+
362
+ ```python
363
+ import scipy.sparse
364
+ mat = scipy.sparse.load_npz("sample.methylation.npz").tocsr()
365
+ min_cpgs = 3
366
+ kept_rows = []
367
+ for i in range(mat.shape[0]):
368
+ row = mat.getrow(i).toarray().ravel()
369
+ covered = row[(row == 0) | (row == 1)] # drop -1 no-data
370
+ is_overconv = len(covered) >= min_cpgs and (covered == 0).all()
371
+ if not is_overconv:
372
+ kept_rows.append(i)
373
+ ```
374
+
375
+ - **`--filter-non-converted` is *not* reproducible from the `.npz` alone.** It relies on retained non-CpG cytosines (or Bismark's `H`/`X`/`U`), which are never written to the matrix. If you need this filter, apply it at extraction time (or re-run bam2tensor against the original BAM).
376
+
292
377
  ## Inspecting Output Files
293
378
 
294
379
  Use `bam2tensor-inspect` to view a summary of any `.methylation.npz` file without writing Python:
@@ -302,11 +387,15 @@ sample.methylation.npz
302
387
  CpG sites: 28,217,448
303
388
  Data points: 12,847,322 (sparsity: 99.97%)
304
389
  Fragment len: median 167, mean 182, range [50, 600]
390
+ Filters: non-converted (>= 3 non-CpG Cs)
391
+ EM over-conversion (all-unmethylated, >= 3 CpGs)
305
392
  CpG index CRC32: a1b2c3d4
306
- bam2tensor: v2.4
393
+ bam2tensor: v2.5
307
394
  File size: 14.2 MB
308
395
  ```
309
396
 
397
+ When no filters were applied, the line reads `Filters: none`. Files produced by bam2tensor versions older than v2.5 omit the line entirely.
398
+
310
399
  You can pass multiple files at once:
311
400
 
312
401
  ```bash
@@ -368,6 +457,7 @@ Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP arc
368
457
  | `expected_chromosomes` | List of chromosomes included in the column mapping |
369
458
  | `total_cpg_sites` | Total number of CpG columns in the matrix |
370
459
  | `cpg_index_crc32` | CRC32 checksum of the CpG site positions (verifies identical column semantics) |
460
+ | `filters` | Nested dict recording which opt-in conversion-error filters were applied (`non_converted_reads`, `em_overconversion`) and their parameters. See [Filtering Conversion Errors](#filtering-conversion-errors). Added in v2.5. |
371
461
 
372
462
  This metadata is ignored by `scipy.sparse.load_npz`, so existing code continues to work. To read it:
373
463
 
@@ -570,6 +660,10 @@ extract_methylation_data_from_bam(
570
660
  input_bam: str, # Path to BAM file
571
661
  genome_methylation_embedding: GenomeMethylationEmbedding, # Embedding object
572
662
  quality_limit: int = 20, # Minimum MAPQ
663
+ filter_non_converted: bool = False, # Drop reads with retained non-CpG Cs
664
+ non_converted_threshold: int = 3, # Threshold for the above filter
665
+ filter_em_overconversion: bool = False, # Drop EM-seq fragment-level over-conversion reads
666
+ em_overconversion_min_cpgs: int = 3, # Min CpGs before applying the above filter
573
667
  verbose: bool = False, # Enable verbose output
574
668
  debug: bool = False # Enable debug output
575
669
  ) -> ExtractionResult
@@ -39,6 +39,7 @@
39
39
  - [Custom Output Directory](#custom-output-directory)
40
40
  - [Using a Custom Genome](#using-a-custom-genome)
41
41
  - [Command-Line Options](#command-line-options)
42
+ - [Filtering Conversion Errors](#filtering-conversion-errors)
42
43
  - [Inspecting Output Files](#inspecting-output-files)
43
44
  - [Output Data Structure](#output-data-structure)
44
45
  - [Per-Read Fragment Length (TLEN)](#per-read-fragment-length-tlen)
@@ -66,6 +67,7 @@
66
67
  - **Caching**: CpG site indexing is cached to accelerate repeated runs on the same genome
67
68
  - **Quality Filtering**: Configurable mapping quality thresholds
68
69
  - **Per-Read Fragment Length**: Stores BAM TLEN (template length) alongside the methylation tensor for joint fragment-methylation analysis
70
+ - **Conversion-Error Filters**: Optional per-read filters for incomplete bisulfite/EM-seq conversion (ported from `nebiolabs/mark-nonconverted-reads`) and EM-seq fragment-level over-conversion (Loyfer et al. 2026)
69
71
 
70
72
  ## Requirements
71
73
 
@@ -223,6 +225,25 @@ Options:
223
225
  determine CpG sites).
224
226
  --quality-limit INTEGER Quality filter for aligned reads (default =
225
227
  20)
228
+ --filter-non-converted Drop reads with >= --non-converted-threshold
229
+ retained non-CpG cytosines, the signature of
230
+ incomplete bisulfite/EM-seq conversion (port
231
+ of nebiolabs/mark-nonconverted-reads).
232
+ Default: off.
233
+ --non-converted-threshold INTEGER
234
+ Minimum count of retained non-CpG cytosines
235
+ to drop a read (default = 3, matches NEB
236
+ mark-nonconverted-reads).
237
+ --filter-em-overconversion Drop EM-seq reads whose covered CpGs are all
238
+ called unmethylated and cover at least --em-
239
+ overconversion-min-cpgs sites (heuristic for
240
+ the fragment-level over-conversion artifact
241
+ described in Loyfer et al. bioRxiv
242
+ 2026.03.24.713040). Default: off.
243
+ --em-overconversion-min-cpgs INTEGER
244
+ Minimum covered CpG count required before
245
+ the EM over-conversion filter will drop a
246
+ read (default = 3).
226
247
  --verbose Verbose output.
227
248
  --skip-cache De-novo generate CpG sites (slow).
228
249
  --debug Debug mode (extensive validity checking +
@@ -248,6 +269,10 @@ Options:
248
269
  | `--expected-chromosomes` | Comma-separated list of chromosome names to process. Chromosomes not in this list are skipped. Defaults to human autosomes + sex chromosomes. |
249
270
  | `--reference-fasta` | Path to the reference genome FASTA file. Must match the genome used for alignment. |
250
271
  | `--quality-limit` | Minimum mapping quality score (MAPQ) for reads to be included. Default is 20. |
272
+ | `--filter-non-converted` | Drop reads with at least `--non-converted-threshold` retained non-CpG cytosines (incomplete conversion). See [Filtering Conversion Errors](#filtering-conversion-errors). |
273
+ | `--non-converted-threshold` | Minimum number of retained non-CpG cytosines at which `--filter-non-converted` drops a read. Default is 3. |
274
+ | `--filter-em-overconversion` | Drop EM-seq reads whose covered CpGs are all unmethylated and cover ≥ `--em-overconversion-min-cpgs` sites. See [Filtering Conversion Errors](#filtering-conversion-errors). |
275
+ | `--em-overconversion-min-cpgs` | Minimum covered CpG count before the EM over-conversion filter will drop a read. Default is 3. |
251
276
  | `--verbose` | Enable detailed progress output including per-chromosome progress bars. |
252
277
  | `--skip-cache` | Force regeneration of CpG site cache. Useful if you've modified the reference or chromosome list. |
253
278
  | `--debug` | Enable extensive validation and debug output. Slower but useful for troubleshooting. |
@@ -256,6 +281,66 @@ Options:
256
281
  | `--download-reference` | Download and cache a known reference genome. Choices: `hg38`, `hg19`, `mm10`, `T2T-CHM13`. Replaces `--reference-fasta`. |
257
282
  | `--list-genomes` | List available reference genomes for `--download-reference` and exit. |
258
283
 
284
+ ## Filtering Conversion Errors
285
+
286
+ Bisulfite and EM-seq library preparation can produce two kinds of per-read conversion errors that bias downstream methylation calls. bam2tensor provides two opt-in filters to drop affected reads at extraction time. Both are **default-off**, apply per read, and are recorded in the output `metadata.json` so downstream consumers know which filters were applied.
287
+
288
+ ### `--filter-non-converted` — incomplete conversion
289
+
290
+ Ports the logic of [nebiolabs/mark-nonconverted-reads](https://github.com/nebiolabs/mark-nonconverted-reads). A read is dropped if it carries at least `--non-converted-threshold` (default 3) retained non-CpG cytosines, a signature of incomplete bisulfite or EM-seq conversion.
291
+
292
+ - **Bismark BAMs**: counted directly from the `XM` tag's uppercase `H`/`X`/`U` characters (retained cytosines in CHH/CHG/unknown contexts).
293
+ - **Biscuit / bwameth / gem3 BAMs**: counted by comparing the read to the reference via the `MD` tag (using pysam's `get_aligned_pairs(with_seq=True)`). SNPs — where the read's retained `C` sits over a reference base that isn't `C` — are excluded from the count, matching NEB's reference-validation step. No separate FASTA reload is required.
294
+
295
+ ### `--filter-em-overconversion` — EM-seq fragment-level over-conversion
296
+
297
+ A heuristic inspired by [Loyfer et al. (bioRxiv 2026.03.24.713040)](https://www.biorxiv.org/content/10.64898/2026.03.24.713040v1). That paper shows EM-seq reproducibly produces ~1–2.5% of multi-CpG fragments that appear fully unmethylated across every covered CpG — a fragment-level artifact absent from WGBS and Oxford Nanopore. This filter drops any read whose covered CpGs are **all** called unmethylated *and* cover at least `--em-overconversion-min-cpgs` sites (default 3, the regime where the EM-seq artifact is clearly separable from WGBS in Loyfer et al. Fig. 1C).
298
+
299
+ The filter is a blunt instrument: it will also drop genuinely fully-unmethylated biological fragments at unmethylated markers. Enable it only when your downstream application (e.g., cfDNA deconvolution at constitutively methylated loci) can tolerate that trade-off.
300
+
301
+ ### Usage
302
+
303
+ ```bash
304
+ bam2tensor \
305
+ --input-path sample.bam \
306
+ --reference-fasta GRCh38.fa \
307
+ --genome-name hg38 \
308
+ --filter-non-converted \
309
+ --filter-em-overconversion
310
+ ```
311
+
312
+ Filter parameters and enabled state are written to the output `metadata.json`:
313
+
314
+ ```json
315
+ {
316
+ "filters": {
317
+ "non_converted_reads": {"enabled": true, "threshold": 3},
318
+ "em_overconversion": {"enabled": true, "min_cpgs": 3}
319
+ }
320
+ }
321
+ ```
322
+
323
+ ### Reproducibility note
324
+
325
+ The two filters differ in whether they can be replayed downstream without the source BAM:
326
+
327
+ - **`--filter-em-overconversion` is reproducible from the `.npz` alone.** The heuristic is a pure function of each row's CpG state values. A downstream consumer who receives an unfiltered `.npz` can replay the filter at analysis time:
328
+
329
+ ```python
330
+ import scipy.sparse
331
+ mat = scipy.sparse.load_npz("sample.methylation.npz").tocsr()
332
+ min_cpgs = 3
333
+ kept_rows = []
334
+ for i in range(mat.shape[0]):
335
+ row = mat.getrow(i).toarray().ravel()
336
+ covered = row[(row == 0) | (row == 1)] # drop -1 no-data
337
+ is_overconv = len(covered) >= min_cpgs and (covered == 0).all()
338
+ if not is_overconv:
339
+ kept_rows.append(i)
340
+ ```
341
+
342
+ - **`--filter-non-converted` is *not* reproducible from the `.npz` alone.** It relies on retained non-CpG cytosines (or Bismark's `H`/`X`/`U`), which are never written to the matrix. If you need this filter, apply it at extraction time (or re-run bam2tensor against the original BAM).
343
+
259
344
  ## Inspecting Output Files
260
345
 
261
346
  Use `bam2tensor-inspect` to view a summary of any `.methylation.npz` file without writing Python:
@@ -269,11 +354,15 @@ sample.methylation.npz
269
354
  CpG sites: 28,217,448
270
355
  Data points: 12,847,322 (sparsity: 99.97%)
271
356
  Fragment len: median 167, mean 182, range [50, 600]
357
+ Filters: non-converted (>= 3 non-CpG Cs)
358
+ EM over-conversion (all-unmethylated, >= 3 CpGs)
272
359
  CpG index CRC32: a1b2c3d4
273
- bam2tensor: v2.4
360
+ bam2tensor: v2.5
274
361
  File size: 14.2 MB
275
362
  ```
276
363
 
364
+ When no filters were applied, the line reads `Filters: none`. Files produced by bam2tensor versions older than v2.5 omit the line entirely.
365
+
277
366
  You can pass multiple files at once:
278
367
 
279
368
  ```bash
@@ -335,6 +424,7 @@ Each `.methylation.npz` file includes a `metadata.json` entry inside the ZIP arc
335
424
  | `expected_chromosomes` | List of chromosomes included in the column mapping |
336
425
  | `total_cpg_sites` | Total number of CpG columns in the matrix |
337
426
  | `cpg_index_crc32` | CRC32 checksum of the CpG site positions (verifies identical column semantics) |
427
+ | `filters` | Nested dict recording which opt-in conversion-error filters were applied (`non_converted_reads`, `em_overconversion`) and their parameters. See [Filtering Conversion Errors](#filtering-conversion-errors). Added in v2.5. |
338
428
 
339
429
  This metadata is ignored by `scipy.sparse.load_npz`, so existing code continues to work. To read it:
340
430
 
@@ -537,6 +627,10 @@ extract_methylation_data_from_bam(
537
627
  input_bam: str, # Path to BAM file
538
628
  genome_methylation_embedding: GenomeMethylationEmbedding, # Embedding object
539
629
  quality_limit: int = 20, # Minimum MAPQ
630
+ filter_non_converted: bool = False, # Drop reads with retained non-CpG Cs
631
+ non_converted_threshold: int = 3, # Threshold for the above filter
632
+ filter_em_overconversion: bool = False, # Drop EM-seq fragment-level over-conversion reads
633
+ em_overconversion_min_cpgs: int = 3, # Min CpGs before applying the above filter
540
634
  verbose: bool = False, # Enable verbose output
541
635
  debug: bool = False # Enable debug output
542
636
  ) -> ExtractionResult
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "bam2tensor"
3
- version = "2.5"
3
+ version = "2.7"
4
4
  description = "Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation"
5
5
  authors = [{ name = "Nick Semenkovich", email = "semenko@alum.mit.edu" }]
6
6
  license = "MIT"
@@ -50,4 +50,4 @@ See Also:
50
50
  - https://mcwdsi.github.io/bam2tensor for full documentation
51
51
  """
52
52
 
53
- __version__ = "2.5"
53
+ __version__ = "2.7"
@@ -45,6 +45,9 @@ import numpy as np
45
45
  from tqdm import tqdm
46
46
  from Bio import SeqIO
47
47
 
48
+ from bam2tensor import __version__
49
+ from bam2tensor.metadata import compute_fasta_sha256
50
+
48
51
 
49
52
  class GenomeMethylationEmbedding:
50
53
  """Manages CpG site positions and coordinate conversions for a reference genome.
@@ -173,7 +176,12 @@ class GenomeMethylationEmbedding:
173
176
  window_size == self.window_size
174
177
  ), "Window size does not match cached window size!"
175
178
  except FileNotFoundError as e:
176
- if self.verbose:
179
+ # Stale-cache rejections (version or FASTA SHA-256 mismatch)
180
+ # raise FileNotFoundError too — always surface those so users
181
+ # are not silently regenerating a cache they thought was valid.
182
+ if os.path.exists(self.cache_file):
183
+ print(f"Discarding stale embedding cache: {e}")
184
+ elif self.verbose:
177
185
  print("Could not load methylation embedding from cache: " + str(e))
178
186
 
179
187
  if not cache_available:
@@ -224,6 +232,9 @@ class GenomeMethylationEmbedding:
224
232
  The cache file is named "{genome_name}.cache.json.gz" and contains:
225
233
  - genome_name: The genome identifier
226
234
  - fasta_source: Path to the original FASTA file
235
+ - fasta_sha256: SHA-256 of the FASTA file bytes (for cache validation)
236
+ - bam2tensor_version: Version that produced this cache
237
+ - total_cpg_sites: Total CpG count across all included chromosomes
227
238
  - expected_chromosomes: List of included chromosomes
228
239
  - window_size: The window_size parameter (for compatibility checking)
229
240
  - cpg_sites_dict: Dictionary of chromosome -> list of CpG positions
@@ -241,9 +252,13 @@ class GenomeMethylationEmbedding:
241
252
 
242
253
  assert len(self.cpg_sites_dict) > 0, "CpG sites dict is empty!"
243
254
 
255
+ total_cpg_sites = sum(len(v) for v in self.cpg_sites_dict.values())
244
256
  cache_data = {
245
257
  "genome_name": self.genome_name,
246
258
  "fasta_source": self.fasta_source,
259
+ "fasta_sha256": compute_fasta_sha256(self.fasta_source),
260
+ "bam2tensor_version": __version__,
261
+ "total_cpg_sites": total_cpg_sites,
247
262
  "expected_chromosomes": self.expected_chromosomes,
248
263
  "window_size": self.window_size,
249
264
  "cpg_sites_dict": self.cpg_sites_dict,
@@ -263,38 +278,66 @@ class GenomeMethylationEmbedding:
263
278
  restore all CpG site data. If successful, this avoids the slow
264
279
  FASTA parsing step.
265
280
 
281
+ Provenance is validated before the cached data is trusted: the
282
+ cache must have been written by the exact same version of
283
+ bam2tensor and must reference a FASTA file with the same SHA-256
284
+ as the current ``fasta_source``. A stale cache is rejected with
285
+ a ``FileNotFoundError`` so the caller falls through to a fresh
286
+ FASTA parse and overwrites the stale cache on save.
287
+
266
288
  Returns:
267
289
  True if the cache was successfully loaded.
268
290
 
269
291
  Raises:
270
- FileNotFoundError: If the cache file does not exist.
292
+ FileNotFoundError: If the cache file does not exist, or if
293
+ the cache is stale (version mismatch or FASTA SHA-256
294
+ mismatch).
271
295
 
272
296
  Note:
273
- After loading, the caller should verify that expected_chromosomes
274
- and window_size match the current configuration, as this method
275
- overwrites those attributes with cached values.
297
+ After loading, the caller should verify that
298
+ ``expected_chromosomes`` and ``window_size`` match the current
299
+ configuration, as this method overwrites those attributes
300
+ with cached values.
276
301
  """
277
302
 
278
- if os.path.exists(self.cache_file):
279
- if self.verbose:
280
- print(f"\tReading embedding from cache: {self.cache_file}")
303
+ if not os.path.exists(self.cache_file):
304
+ raise FileNotFoundError("No cache of embedding found.")
281
305
 
282
- # TODO: Add type hinting via TypedDicts?
283
- # e.g. https://stackoverflow.com/questions/51291722/define-a-jsonable-type-using-mypy-pep-526
284
- with gzip.open(self.cache_file, "rt") as f:
285
- self.cache_data = json.load(f)
306
+ if self.verbose:
307
+ print(f"\tReading embedding from cache: {self.cache_file}")
286
308
 
287
- # Load the cached data
288
- self.genome_name = self.cache_data["genome_name"]
289
- self.fasta_source = self.cache_data["fasta_source"]
290
- self.expected_chromosomes = self.cache_data["expected_chromosomes"]
291
- self.window_size = self.cache_data["window_size"]
292
- self.cpg_sites_dict = self.cache_data["cpg_sites_dict"]
309
+ with gzip.open(self.cache_file, "rt") as f:
310
+ self.cache_data = json.load(f)
293
311
 
294
- if self.verbose:
295
- print(f"\tCached genome fasta source: {self.fasta_source}")
296
- else:
297
- raise FileNotFoundError("No cache of embedding found.")
312
+ # Validate cache provenance: stale caches predating v2.7 used a
313
+ # case-sensitive CpG search that silently dropped roughly half
314
+ # the CpG sites in soft-masked FASTAs (e.g. UCSC's hg38.fa.gz).
315
+ cached_version = self.cache_data.get("bam2tensor_version")
316
+ if cached_version != __version__:
317
+ raise FileNotFoundError(
318
+ f"Stale cache {self.cache_file!r}: written by bam2tensor "
319
+ f"{cached_version!r}, current is {__version__!r}. "
320
+ "Regenerating."
321
+ )
322
+
323
+ cached_fasta_sha256 = self.cache_data.get("fasta_sha256")
324
+ current_fasta_sha256 = compute_fasta_sha256(self.fasta_source)
325
+ if cached_fasta_sha256 != current_fasta_sha256:
326
+ raise FileNotFoundError(
327
+ f"Stale cache {self.cache_file!r}: FASTA SHA-256 mismatch "
328
+ f"(cache={cached_fasta_sha256}, current={current_fasta_sha256}). "
329
+ "Regenerating."
330
+ )
331
+
332
+ # Load the cached data
333
+ self.genome_name = self.cache_data["genome_name"]
334
+ self.fasta_source = self.cache_data["fasta_source"]
335
+ self.expected_chromosomes = self.cache_data["expected_chromosomes"]
336
+ self.window_size = self.cache_data["window_size"]
337
+ self.cpg_sites_dict = self.cache_data["cpg_sites_dict"]
338
+
339
+ if self.verbose:
340
+ print(f"\tCached genome fasta source: {self.fasta_source}")
298
341
 
299
342
  return True
300
343
 
@@ -350,7 +393,11 @@ class GenomeMethylationEmbedding:
350
393
  if self.verbose:
351
394
  tqdm.write(f"\tSkipping chromosome {seqrecord.id}")
352
395
  continue
353
- sequence = seqrecord.seq
396
+ # Upper-case the sequence so soft-masked FASTAs (UCSC's default
397
+ # hg38.fa.gz uses lowercase for RepeatMasker/TRF regions) do not
398
+ # silently drop CpGs in repeats — that is roughly half of all
399
+ # CpGs in the human genome.
400
+ sequence = seqrecord.seq.upper()
354
401
 
355
402
  # Find all CpG sites
356
403
  # The pos+1 is because we want to store the 1-based position, because .bed is wild and arguably 1-based maybe:
@@ -706,25 +706,36 @@ def extract_methylation_data_from_bam(
706
706
 
707
707
  # get_aligned_pairs returns a list of tuples of (read_pos, ref_pos)
708
708
  # We filter this to only include the specific CpG sites from above
709
+ aligned_pairs = aligned_segment.get_aligned_pairs(matches_only=True)
709
710
  this_segment_cpgs = [
710
- e
711
- for e in aligned_segment.get_aligned_pairs(matches_only=True)
712
- if e[1] + 1 in cpgs_within_read_set
711
+ e for e in aligned_pairs if e[1] + 1 in cpgs_within_read_set
713
712
  ]
714
713
 
715
714
  # If no CpGs covered (after filtering for matches only), skip
716
715
  if not this_segment_cpgs:
717
716
  continue
718
717
 
719
- # Ok we're on the same strand as the methylation (right?)
720
- # Let's compare the possible CpGs in this interval to the reference and note status
721
- # A methylated C will be *unchanged* and read as C (pair G)
722
- # An unmethylated C will be *changed* and read as T (pair A)
723
- for query_pos, ref_pos in this_segment_cpgs:
724
- query_base = aligned_segment.query_sequence[query_pos] # type: ignore
725
- # query_base_raw = aligned_segment.get_forward_sequence()[query_pos] # raw off sequencer
726
- # query_base_no_offset = aligned_segment.query_alignment_sequence[query_pos] # this needs to be offset by the soft clip
718
+ # OT (forward parent): methylation-informative base sits on the
719
+ # top-strand C at ref_pos. BAM SEQ is reference-oriented, so
720
+ # C = methylated, T = unmethylated.
721
+ # OB (reverse parent): the original bottom-strand C lives at
722
+ # ref_pos + 1 (the G of the top-strand CG). After the aligner
723
+ # reverse-complements into reference orientation for BAM
724
+ # storage, that base reads G = methylated, A = unmethylated.
725
+ # At ref_pos itself, BAM always shows C (the unaffected
726
+ # bottom-strand G reverse-complemented), which is why reading
727
+ # ref_pos on OB reads collapses every CpG to "methylated".
728
+ query_sequence = aligned_segment.query_sequence
729
+ if bisulfite_parent_strand_is_reverse:
730
+ methylated_base, unmethylated_base = "G", "A"
731
+ # Indels at the CpG boundary mean ref_pos + 1 isn't always
732
+ # query_pos + 1 — go through a ref -> query map.
733
+ ref_to_query: dict[int, int] = {ref: q for q, ref in aligned_pairs}
734
+ else:
735
+ methylated_base, unmethylated_base = "C", "T"
736
+ ref_to_query = {}
727
737
 
738
+ for query_pos, ref_pos in this_segment_cpgs:
728
739
  read_cpg_cols.append(
729
740
  genome_methylation_embedding.genomic_position_to_embedding(
730
741
  chrom,
@@ -732,21 +743,34 @@ def extract_methylation_data_from_bam(
732
743
  )
733
744
  )
734
745
 
735
- if query_base == "C":
736
- # Methylated
746
+ if bisulfite_parent_strand_is_reverse:
747
+ target_query_pos = ref_to_query.get(ref_pos + 1)
748
+ if target_query_pos is None:
749
+ read_cpg_data.append(-1)
750
+ if debug:
751
+ print(f"\t{query_pos} {ref_pos} [Indel at OB target]")
752
+ continue
753
+ query_base = query_sequence[target_query_pos] # type: ignore[index]
754
+ else:
755
+ query_base = query_sequence[query_pos] # type: ignore[index]
756
+
757
+ if query_base == methylated_base:
737
758
  read_cpg_data.append(1)
738
759
  if debug:
739
- print(f"\t{query_pos} {ref_pos} C->{query_base} [Methylated]")
740
- elif query_base == "T":
760
+ print(
761
+ f"\t{query_pos} {ref_pos} {methylated_base}->{query_base} [Methylated]"
762
+ )
763
+ elif query_base == unmethylated_base:
741
764
  read_cpg_data.append(0)
742
- # Unmethylated
743
765
  if debug:
744
- print(f"\t{query_pos} {ref_pos} C->{query_base} [Unmethylated]")
766
+ print(
767
+ f"\t{query_pos} {ref_pos} {methylated_base}->{query_base} [Unmethylated]"
768
+ )
745
769
  else:
746
770
  read_cpg_data.append(-1)
747
771
  if debug:
748
772
  print(
749
- f"\t{query_pos} {ref_pos} C->{query_base} [Unknown! SNV? Indel?]"
773
+ f"\t{query_pos} {ref_pos} {methylated_base}->{query_base} [Unknown! SNV? Indel?]"
750
774
  )
751
775
 
752
776
  if filter_em_overconversion and is_em_overconversion_read(
@@ -27,17 +27,44 @@ Example:
27
27
  hg38
28
28
  """
29
29
 
30
+ import hashlib
30
31
  import io
31
32
  import json
32
33
  import zipfile
33
34
  import zlib
35
+ from typing import TYPE_CHECKING
34
36
 
35
37
  import numpy as np
36
38
 
37
- from bam2tensor.embedding import GenomeMethylationEmbedding
39
+ if TYPE_CHECKING:
40
+ # Avoid a runtime circular import: embedding.py imports compute_fasta_sha256
41
+ # from this module, and this module only needs the embedding type for
42
+ # annotations.
43
+ from bam2tensor.embedding import GenomeMethylationEmbedding
38
44
 
39
45
 
40
- def compute_cpg_index_crc32(embedding: GenomeMethylationEmbedding) -> str:
46
+ def compute_fasta_sha256(fasta_source: str) -> str:
47
+ """Compute the SHA-256 of a FASTA file's bytes on disk.
48
+
49
+ Used to stamp the CpG-site cache (see
50
+ :py:class:`bam2tensor.embedding.GenomeMethylationEmbedding`) so a
51
+ cache can be rejected when the underlying FASTA changes (e.g. a
52
+ user swaps a soft-masked build for a hard-masked one).
53
+
54
+ Args:
55
+ fasta_source: Path to the reference FASTA file.
56
+
57
+ Returns:
58
+ The hex-encoded SHA-256 digest of the file's bytes.
59
+ """
60
+ h = hashlib.sha256()
61
+ with open(fasta_source, "rb") as f:
62
+ for chunk in iter(lambda: f.read(1024 * 1024), b""):
63
+ h.update(chunk)
64
+ return h.hexdigest()
65
+
66
+
67
+ def compute_cpg_index_crc32(embedding: "GenomeMethylationEmbedding") -> str:
41
68
  """Compute a CRC32 checksum over the CpG site positions in an embedding.
42
69
 
43
70
  The checksum captures the exact column mapping of the sparse matrix: