bam2tensor 2.4__tar.gz → 2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {bam2tensor-2.4 → bam2tensor-2.5}/CLAUDE.md +1 -1
  2. {bam2tensor-2.4 → bam2tensor-2.5}/PKG-INFO +1 -1
  3. {bam2tensor-2.4 → bam2tensor-2.5}/pyproject.toml +1 -1
  4. {bam2tensor-2.4 → bam2tensor-2.5}/src/bam2tensor/__init__.py +1 -1
  5. {bam2tensor-2.4 → bam2tensor-2.5}/src/bam2tensor/__main__.py +76 -0
  6. {bam2tensor-2.4 → bam2tensor-2.5}/src/bam2tensor/functions.py +253 -40
  7. {bam2tensor-2.4 → bam2tensor-2.5}/src/bam2tensor/inspect.py +19 -0
  8. bam2tensor-2.5/tests/test_filters.py +568 -0
  9. {bam2tensor-2.4 → bam2tensor-2.5}/tests/test_inspect.py +3 -3
  10. {bam2tensor-2.4 → bam2tensor-2.5}/uv.lock +1 -1
  11. {bam2tensor-2.4 → bam2tensor-2.5}/.darglint +0 -0
  12. {bam2tensor-2.4 → bam2tensor-2.5}/.editorconfig +0 -0
  13. {bam2tensor-2.4 → bam2tensor-2.5}/.gitattributes +0 -0
  14. {bam2tensor-2.4 → bam2tensor-2.5}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
  15. {bam2tensor-2.4 → bam2tensor-2.5}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
  16. {bam2tensor-2.4 → bam2tensor-2.5}/.github/actions/setup-env/action.yml +0 -0
  17. {bam2tensor-2.4 → bam2tensor-2.5}/.github/dependabot.yml +0 -0
  18. {bam2tensor-2.4 → bam2tensor-2.5}/.github/labels.yml +0 -0
  19. {bam2tensor-2.4 → bam2tensor-2.5}/.github/release-drafter.yml +0 -0
  20. {bam2tensor-2.4 → bam2tensor-2.5}/.github/workflows/constraints.txt +0 -0
  21. {bam2tensor-2.4 → bam2tensor-2.5}/.github/workflows/docs.yml +0 -0
  22. {bam2tensor-2.4 → bam2tensor-2.5}/.github/workflows/labeler.yml +0 -0
  23. {bam2tensor-2.4 → bam2tensor-2.5}/.github/workflows/release.yml +0 -0
  24. {bam2tensor-2.4 → bam2tensor-2.5}/.github/workflows/tests.yml +0 -0
  25. {bam2tensor-2.4 → bam2tensor-2.5}/.gitignore +0 -0
  26. {bam2tensor-2.4 → bam2tensor-2.5}/.pre-commit-config.yaml +0 -0
  27. {bam2tensor-2.4 → bam2tensor-2.5}/CONTRIBUTING.md +0 -0
  28. {bam2tensor-2.4 → bam2tensor-2.5}/LICENSE +0 -0
  29. {bam2tensor-2.4 → bam2tensor-2.5}/README.md +0 -0
  30. {bam2tensor-2.4 → bam2tensor-2.5}/SECURITY.md +0 -0
  31. {bam2tensor-2.4 → bam2tensor-2.5}/docs/Makefile +0 -0
  32. {bam2tensor-2.4 → bam2tensor-2.5}/docs/conf.py +0 -0
  33. {bam2tensor-2.4 → bam2tensor-2.5}/docs/contributing.md +0 -0
  34. {bam2tensor-2.4 → bam2tensor-2.5}/docs/index.md +0 -0
  35. {bam2tensor-2.4 → bam2tensor-2.5}/docs/license.md +0 -0
  36. {bam2tensor-2.4 → bam2tensor-2.5}/docs/logo/604669_dna turning into math, computer _xl-1024-v1-0.png +0 -0
  37. {bam2tensor-2.4 → bam2tensor-2.5}/docs/logo/bam2tensor-logo.afdesign +0 -0
  38. {bam2tensor-2.4 → bam2tensor-2.5}/docs/logo/bam2tensor-logo.png +0 -0
  39. {bam2tensor-2.4 → bam2tensor-2.5}/docs/make.bat +0 -0
  40. {bam2tensor-2.4 → bam2tensor-2.5}/docs/nano-banana-overview-shrunk.png +0 -0
  41. {bam2tensor-2.4 → bam2tensor-2.5}/docs/reference.md +0 -0
  42. {bam2tensor-2.4 → bam2tensor-2.5}/docs/templates/package.rst_t +0 -0
  43. {bam2tensor-2.4 → bam2tensor-2.5}/noxfile.py +0 -0
  44. {bam2tensor-2.4 → bam2tensor-2.5}/src/bam2tensor/embedding.py +0 -0
  45. {bam2tensor-2.4 → bam2tensor-2.5}/src/bam2tensor/metadata.py +0 -0
  46. {bam2tensor-2.4 → bam2tensor-2.5}/src/bam2tensor/py.typed +0 -0
  47. {bam2tensor-2.4 → bam2tensor-2.5}/src/bam2tensor/reference.py +0 -0
  48. {bam2tensor-2.4 → bam2tensor-2.5}/tests/__init__.py +0 -0
  49. {bam2tensor-2.4 → bam2tensor-2.5}/tests/test_duplication.py +0 -0
  50. {bam2tensor-2.4 → bam2tensor-2.5}/tests/test_embedding.py +0 -0
  51. {bam2tensor-2.4 → bam2tensor-2.5}/tests/test_fasta.fa +0 -0
  52. {bam2tensor-2.4 → bam2tensor-2.5}/tests/test_functions.py +0 -0
  53. {bam2tensor-2.4 → bam2tensor-2.5}/tests/test_main.py +0 -0
  54. {bam2tensor-2.4 → bam2tensor-2.5}/tests/test_metadata.py +0 -0
  55. {bam2tensor-2.4 → bam2tensor-2.5}/tests/test_reference.py +0 -0
@@ -40,7 +40,7 @@ uv run mypy src
40
40
 
41
41
  ```
42
42
  src/bam2tensor/
43
- __init__.py # Package version (2.4)
43
+ __init__.py # Package version (2.5)
44
44
  __main__.py # Click CLI entry point (bam2tensor command)
45
45
  inspect.py # Inspect CLI entry point (bam2tensor-inspect command)
46
46
  embedding.py # GenomeMethylationEmbedding class (FASTA parsing, CpG indexing)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bam2tensor
3
- Version: 2.4
3
+ Version: 2.5
4
4
  Summary: Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation
5
5
  Project-URL: Homepage, https://github.com/mcwdsi/bam2tensor
6
6
  Project-URL: Repository, https://github.com/mcwdsi/bam2tensor
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "bam2tensor"
3
- version = "2.4"
3
+ version = "2.5"
4
4
  description = "Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation"
5
5
  authors = [{ name = "Nick Semenkovich", email = "semenko@alum.mit.edu" }]
6
6
  license = "MIT"
@@ -50,4 +50,4 @@ See Also:
50
50
  - https://mcwdsi.github.io/bam2tensor for full documentation
51
51
  """
52
52
 
53
- __version__ = "2.4"
53
+ __version__ = "2.5"
@@ -229,6 +229,43 @@ def validate_input_output(
229
229
  default=20,
230
230
  type=int,
231
231
  )
232
+ @click.option(
233
+ "--filter-non-converted",
234
+ help=(
235
+ "Drop reads with >= --non-converted-threshold retained non-CpG "
236
+ "cytosines, the signature of incomplete bisulfite/EM-seq conversion "
237
+ "(port of nebiolabs/mark-nonconverted-reads). Default: off."
238
+ ),
239
+ is_flag=True,
240
+ )
241
+ @click.option(
242
+ "--non-converted-threshold",
243
+ help=(
244
+ "Minimum count of retained non-CpG cytosines to drop a read "
245
+ "(default = 3, matches NEB mark-nonconverted-reads)."
246
+ ),
247
+ default=3,
248
+ type=int,
249
+ )
250
+ @click.option(
251
+ "--filter-em-overconversion",
252
+ help=(
253
+ "Drop EM-seq reads whose covered CpGs are all called unmethylated "
254
+ "and cover at least --em-overconversion-min-cpgs sites (heuristic "
255
+ "for the fragment-level over-conversion artifact described in "
256
+ "Loyfer et al. bioRxiv 2026.03.24.713040). Default: off."
257
+ ),
258
+ is_flag=True,
259
+ )
260
+ @click.option(
261
+ "--em-overconversion-min-cpgs",
262
+ help=(
263
+ "Minimum covered CpG count required before the EM over-conversion "
264
+ "filter will drop a read (default = 3)."
265
+ ),
266
+ default=3,
267
+ type=int,
268
+ )
232
269
  @click.option("--verbose", help="Verbose output.", is_flag=True)
233
270
  @click.option("--skip-cache", help="De-novo generate CpG sites (slow).", is_flag=True)
234
271
  @click.option(
@@ -263,6 +300,10 @@ def main(
263
300
  expected_chromosomes: str | None,
264
301
  reference_fasta: str | None,
265
302
  quality_limit: int,
303
+ filter_non_converted: bool,
304
+ non_converted_threshold: int,
305
+ filter_em_overconversion: bool,
306
+ em_overconversion_min_cpgs: int,
266
307
  verbose: bool,
267
308
  skip_cache: bool,
268
309
  debug: bool,
@@ -300,6 +341,17 @@ def main(
300
341
  ``--download-reference`` is used.
301
342
  quality_limit: Minimum mapping quality (MAPQ) threshold. Reads below
302
343
  this quality are excluded.
344
+ filter_non_converted: If True, drop reads with at least
345
+ ``non_converted_threshold`` retained non-CpG cytosines —
346
+ indicating incomplete bisulfite/EM-seq conversion.
347
+ non_converted_threshold: Threshold used by the non-converted
348
+ read filter.
349
+ filter_em_overconversion: If True, drop reads whose covered CpGs
350
+ are all called unmethylated and cover at least
351
+ ``em_overconversion_min_cpgs`` sites — heuristic for EM-seq
352
+ fragment-level over-conversion (Loyfer et al. 2026).
353
+ em_overconversion_min_cpgs: Minimum covered CpG count required
354
+ before the over-conversion filter will drop a read.
303
355
  verbose: If True, print detailed progress information.
304
356
  skip_cache: If True, regenerate the CpG site index even if a cache
305
357
  file exists.
@@ -382,6 +434,16 @@ def main(
382
434
  print(f" Reference: {reference_fasta}")
383
435
  print(f" Chromosomes: {chrom_display}")
384
436
  print(f" Quality limit: MAPQ >= {quality_limit}")
437
+ if filter_non_converted:
438
+ print(
439
+ f" Filters: non-converted reads (>= "
440
+ f"{non_converted_threshold} retained non-CpG Cs)"
441
+ )
442
+ if filter_em_overconversion:
443
+ print(
444
+ f" EM over-conversion (all-unmethylated, >= "
445
+ f"{em_overconversion_min_cpgs} CpGs)"
446
+ )
385
447
  if output_dir:
386
448
  print(f" Output dir: {output_dir}")
387
449
  else:
@@ -448,6 +510,10 @@ def main(
448
510
  input_bam=input_bam,
449
511
  genome_methylation_embedding=genome_methylation_embedding,
450
512
  quality_limit=quality_limit,
513
+ filter_non_converted=filter_non_converted,
514
+ non_converted_threshold=non_converted_threshold,
515
+ filter_em_overconversion=filter_em_overconversion,
516
+ em_overconversion_min_cpgs=em_overconversion_min_cpgs,
451
517
  verbose=verbose,
452
518
  debug=debug,
453
519
  )
@@ -476,6 +542,16 @@ def main(
476
542
  "expected_chromosomes": chrom_list,
477
543
  "total_cpg_sites": genome_methylation_embedding.total_cpg_sites,
478
544
  "cpg_index_crc32": cpg_crc32,
545
+ "filters": {
546
+ "non_converted_reads": {
547
+ "enabled": filter_non_converted,
548
+ "threshold": non_converted_threshold,
549
+ },
550
+ "em_overconversion": {
551
+ "enabled": filter_em_overconversion,
552
+ "min_cpgs": em_overconversion_min_cpgs,
553
+ },
554
+ },
479
555
  },
480
556
  )
481
557
  print(f" Output: {output_file}")
@@ -80,6 +80,146 @@ class ExtractionResult(NamedTuple):
80
80
  _SKIP_FLAGS = 0x400 | 0x200 | 0x100 | 0x800
81
81
 
82
82
 
83
+ def count_non_cpg_retained_xm(xm_tag: str) -> int:
84
+ """Count retained non-CpG cytosines in a Bismark XM methylation string.
85
+
86
+ Bismark's ``XM`` tag encodes per-base methylation context. Uppercase
87
+ letters indicate a cytosine that remained as ``C`` in the read
88
+ (i.e., was *not* converted by bisulfite/EM-seq treatment). ``H``,
89
+ ``X`` and ``U`` correspond to retained cytosines in CHH, CHG and
90
+ unknown-context positions respectively. A high count of these on a
91
+ single read is a strong signal of incomplete conversion.
92
+
93
+ Args:
94
+ xm_tag: The value of a read's Bismark ``XM`` tag.
95
+
96
+ Returns:
97
+ The count of ``H``, ``X`` and ``U`` characters in ``xm_tag``.
98
+
99
+ Example:
100
+ >>> count_non_cpg_retained_xm("..Z..hhh..HHH..z..")
101
+ 3
102
+ """
103
+ return xm_tag.count("H") + xm_tag.count("X") + xm_tag.count("U")
104
+
105
+
106
+ def count_non_cpg_retained_reference(
107
+ aligned_segment: pysam.AlignedSegment,
108
+ is_reverse_parent_strand: bool,
109
+ ) -> int:
110
+ """Count retained non-CpG bases validated against the reference.
111
+
112
+ For a correctly bisulfite- or EM-seq-converted read, every
113
+ non-CpG cytosine on the parent strand should have been converted.
114
+ On the forward-parent strand that means every non-CpG ``C`` in the
115
+ reference should appear as ``T`` in the read; on the reverse-parent
116
+ strand every non-CpG ``G`` should appear as ``A``. Positions where
117
+ the read still carries the unconverted base *and* the reference
118
+ genuinely has a ``C``/``G`` (i.e., the mismatch is not a SNP) count
119
+ as retained.
120
+
121
+ This is a faithful port of the logic in
122
+ ``nebiolabs/mark-nonconverted-reads``, re-using the read's existing
123
+ ``MD`` tag (via :py:meth:`pysam.AlignedSegment.get_aligned_pairs`
124
+ with ``with_seq=True``) instead of requiring a separate reference
125
+ FASTA.
126
+
127
+ Args:
128
+ aligned_segment: A pysam aligned read. Must carry an ``MD``
129
+ tag; BAMs produced by Bismark, Biscuit, bwameth and gem3
130
+ all set this tag by default.
131
+ is_reverse_parent_strand: ``True`` if the read derives from the
132
+ reverse (OB/CTOB) bisulfite parent strand, ``False`` for
133
+ the forward (OT/CTOT) strand.
134
+
135
+ Returns:
136
+ The number of reference-validated retained non-CpG
137
+ cytosines (or guanines, for the reverse parent strand). Returns
138
+ ``0`` when the read has no sequence or no ``MD`` tag is present.
139
+ """
140
+ if aligned_segment.query_sequence is None:
141
+ return 0
142
+
143
+ try:
144
+ pairs = aligned_segment.get_aligned_pairs(matches_only=True, with_seq=True)
145
+ except ValueError:
146
+ # MD tag missing — cannot validate against reference.
147
+ return 0
148
+
149
+ # Map ref_pos → reference base (uppercase) for CpG-context lookup.
150
+ # matches_only=True guarantees query_pos, ref_pos, ref_base are all set.
151
+ ref_pos_to_base = {rpos: rb.upper() for _, rpos, rb in pairs}
152
+
153
+ # On match, pysam returns ref_base uppercase (query matches ref).
154
+ # On mismatch (SNP), it returns lowercase. We only care about matches
155
+ # where ref is C/G — those are genuine retained, non-converted bases.
156
+ target = "G" if is_reverse_parent_strand else "C"
157
+
158
+ count = 0
159
+ for _, rpos, ref_base in pairs:
160
+ if ref_base != target:
161
+ # Not a match, or ref is not C/G. This rejects SNPs (lowercase)
162
+ # and converted positions (read has T/A, match has different base).
163
+ continue
164
+ # Exclude CpG context: on forward strand, next ref base == G;
165
+ # on reverse strand, previous ref base == C.
166
+ if is_reverse_parent_strand:
167
+ if ref_pos_to_base.get(rpos - 1) == "C":
168
+ continue
169
+ else:
170
+ if ref_pos_to_base.get(rpos + 1) == "G":
171
+ continue
172
+ count += 1
173
+
174
+ return count
175
+
176
+
177
+ def is_em_overconversion_read(
178
+ read_cpg_states: list[int],
179
+ min_cpgs: int,
180
+ ) -> bool:
181
+ """Identify reads flagged as EM-seq fragment-level over-conversion.
182
+
183
+ Loyfer et al. (bioRxiv 2026.03.24.713040) report that EM-seq
184
+ produces a reproducible ~1–2.5% of multi-CpG fragments that appear
185
+ fully unmethylated across every covered CpG, driven by failed TET
186
+ protection and subsequent APOBEC hyper-conversion of an entire
187
+ molecule. At constitutively methylated loci these reads are purely
188
+ technical. Without a per-region methylation prior, the simplest
189
+ correction consistent with their observation is: drop reads whose
190
+ covered CpGs are all called unmethylated *and* cover at least
191
+ ``min_cpgs`` sites (the paper's Fig. 1C regime where the artifact
192
+ diverges clearly from WGBS).
193
+
194
+ This heuristic also drops genuinely fully-unmethylated biological
195
+ fragments, so callers should opt in only when the downstream
196
+ application can tolerate that trade-off.
197
+
198
+ Args:
199
+ read_cpg_states: Per-CpG methylation state values for a single
200
+ read, in column-order, using the bam2tensor encoding
201
+ (``1``=methylated, ``0``=unmethylated, ``-1``=no data).
202
+ min_cpgs: Minimum number of covered CpGs required to apply the
203
+ filter. Reads with fewer covered CpGs are never flagged.
204
+
205
+ Returns:
206
+ ``True`` when the read has at least ``min_cpgs`` covered CpGs
207
+ and every covered CpG is called unmethylated (value ``0``).
208
+ ``-1`` (no-data) values do not count as unmethylated.
209
+
210
+ Example:
211
+ >>> is_em_overconversion_read([0, 0, 0], min_cpgs=3)
212
+ True
213
+ >>> is_em_overconversion_read([0, 0, 1], min_cpgs=3)
214
+ False
215
+ >>> is_em_overconversion_read([0, 0], min_cpgs=3)
216
+ False
217
+ """
218
+ if len(read_cpg_states) < min_cpgs:
219
+ return False
220
+ return all(state == 0 for state in read_cpg_states)
221
+
222
+
83
223
  def detect_aligner(input_bam: str, sample_size: int = 1000) -> str:
84
224
  """Detect the aligner used to produce a BAM file by checking read tags.
85
225
 
@@ -198,6 +338,10 @@ def extract_methylation_data_from_bam(
198
338
  input_bam: str,
199
339
  genome_methylation_embedding: GenomeMethylationEmbedding,
200
340
  quality_limit: int = 20,
341
+ filter_non_converted: bool = False,
342
+ non_converted_threshold: int = 3,
343
+ filter_em_overconversion: bool = False,
344
+ em_overconversion_min_cpgs: int = 3,
201
345
  verbose: bool = False,
202
346
  debug: bool = False,
203
347
  ) -> ExtractionResult:
@@ -216,6 +360,17 @@ def extract_methylation_data_from_bam(
216
360
  - For Biscuit/bwameth/gem3: only parent-strand reads are processed
217
361
  - For Bismark: all reads are processed (XM tag has pre-resolved calls)
218
362
 
363
+ Two additional, opt-in per-read filters are available:
364
+ - Non-converted reads (``filter_non_converted``): drops reads with
365
+ too many retained non-CpG cytosines, the hallmark of incomplete
366
+ bisulfite/EM-seq conversion. Ports the logic of
367
+ ``nebiolabs/mark-nonconverted-reads``.
368
+ - EM-seq fragment-level over-conversion
369
+ (``filter_em_overconversion``): drops reads whose covered CpGs
370
+ are all called unmethylated, a heuristic for the EM-seq
371
+ artifact described by Loyfer et al.
372
+ (bioRxiv 2026.03.24.713040).
373
+
219
374
  Two extraction paths are supported, detected automatically per-read:
220
375
 
221
376
  **Bismark path** (XM tag present):
@@ -238,6 +393,23 @@ def extract_methylation_data_from_bam(
238
393
  quality_limit: Minimum mapping quality (MAPQ) threshold for reads.
239
394
  Reads with MAPQ below this value are skipped. Default is 20,
240
395
  which excludes reads mapping to multiple locations equally well.
396
+ filter_non_converted: If True, drop reads that carry at least
397
+ ``non_converted_threshold`` retained non-CpG cytosines, a
398
+ signature of incomplete bisulfite/EM-seq conversion. Default
399
+ False.
400
+ non_converted_threshold: Minimum count of retained non-CpG
401
+ cytosines required for the non-converted filter to drop a
402
+ read. Matches the NEB ``mark-nonconverted-reads`` default of
403
+ 3.
404
+ filter_em_overconversion: If True, drop reads whose covered CpGs
405
+ are all called unmethylated and cover at least
406
+ ``em_overconversion_min_cpgs`` sites — the Loyfer et al.
407
+ EM-seq fragment-level over-conversion heuristic. Default
408
+ False.
409
+ em_overconversion_min_cpgs: Minimum covered CpG count required
410
+ before the over-conversion filter will drop a read. Matches
411
+ the regime in Loyfer et al. Fig. 1C where the EM-seq
412
+ artifact is clearly separable from WGBS.
241
413
  verbose: If True, display a progress bar and print the total read
242
414
  count. Useful for monitoring progress on large files.
243
415
  debug: If True, enable extensive validation and debug output.
@@ -354,6 +526,12 @@ def extract_methylation_data_from_bam(
354
526
  if aligned_segment.flag & _SKIP_FLAGS:
355
527
  continue
356
528
 
529
+ # Per-read buffers. We only flush these into the global
530
+ # coo_* arrays once the read passes all filters (including
531
+ # the post-CpG EM over-conversion filter).
532
+ read_cpg_cols: list[int] = []
533
+ read_cpg_data: list[int] = []
534
+
357
535
  # ============================================================
358
536
  # Bismark path: XM tag contains pre-resolved methylation calls.
359
537
  # No strand filtering needed — Bismark already resolved strand
@@ -363,6 +541,13 @@ def extract_methylation_data_from_bam(
363
541
  if aligned_segment.has_tag("XM"):
364
542
  xm_tag: str = aligned_segment.get_tag("XM") # type: ignore[assignment]
365
543
 
544
+ # Non-converted filter (Bismark): XM tag already encodes
545
+ # retained non-CpG cytosines as H/X/U. Apply before any
546
+ # CpG work so we bail as early as possible.
547
+ if filter_non_converted:
548
+ if count_non_cpg_retained_xm(xm_tag) >= non_converted_threshold:
549
+ continue
550
+
366
551
  # Find CpG sites covered by this read
367
552
  start_idx = bisect.bisect_left(
368
553
  cpg_sites, aligned_segment.reference_start + 1
@@ -385,7 +570,6 @@ def extract_methylation_data_from_bam(
385
570
  if debug:
386
571
  print(f"Query (Bismark): {aligned_segment.query_name}")
387
572
 
388
- has_cpg_data = False
389
573
  for query_pos, ref_pos in this_segment_cpgs:
390
574
  # Bounds check: XM tag should match query length, but be defensive
391
575
  if query_pos >= len(xm_tag):
@@ -393,39 +577,49 @@ def extract_methylation_data_from_bam(
393
577
 
394
578
  xm_char = xm_tag[query_pos]
395
579
  if xm_char == "Z":
396
- coo_data.append(1) # Methylated CpG
580
+ read_cpg_data.append(1) # Methylated CpG
397
581
  elif xm_char == "z":
398
- coo_data.append(0) # Unmethylated CpG
582
+ read_cpg_data.append(0) # Unmethylated CpG
399
583
  else:
400
584
  # Non-CpG context at a CpG site (shouldn't happen
401
585
  # normally, but possible with edge-case alignments)
402
- coo_data.append(-1)
586
+ read_cpg_data.append(-1)
403
587
 
404
- coo_row.append(read_number)
405
- coo_col.append(
588
+ read_cpg_cols.append(
406
589
  genome_methylation_embedding.genomic_position_to_embedding(
407
590
  chrom,
408
591
  ref_pos + 1,
409
592
  )
410
593
  )
411
- has_cpg_data = True
412
594
 
413
595
  if debug:
414
596
  print(f"\t{query_pos} {ref_pos} XM={xm_char}")
415
597
 
416
- if has_cpg_data:
598
+ if not read_cpg_data:
599
+ continue
600
+
601
+ if filter_em_overconversion and is_em_overconversion_read(
602
+ read_cpg_data, em_overconversion_min_cpgs
603
+ ):
417
604
  if debug:
418
- # Ensure each read is only seen once
419
- read_key = aligned_segment.query_name + ( # type: ignore
420
- "_1" if aligned_segment.is_read1 else "_2"
421
- )
422
- assert (
423
- read_key not in debug_read_name_to_row_number
424
- ), "Read seen twice!"
425
- debug_read_name_to_row_number[read_key] = read_number
426
- print("************************************************\n")
427
- tlen_list.append(aligned_segment.template_length)
428
- read_number += 1
605
+ print("\tEM over-conversion filter: dropping read.")
606
+ continue
607
+
608
+ if debug:
609
+ read_key = aligned_segment.query_name + ( # type: ignore
610
+ "_1" if aligned_segment.is_read1 else "_2"
611
+ )
612
+ assert (
613
+ read_key not in debug_read_name_to_row_number
614
+ ), "Read seen twice!"
615
+ debug_read_name_to_row_number[read_key] = read_number
616
+ print("************************************************\n")
617
+
618
+ coo_row.extend([read_number] * len(read_cpg_cols))
619
+ coo_col.extend(read_cpg_cols)
620
+ coo_data.extend(read_cpg_data)
621
+ tlen_list.append(aligned_segment.template_length)
622
+ read_number += 1
429
623
 
430
624
  continue # Skip the Biscuit/bwameth/gem3 path below
431
625
 
@@ -460,6 +654,22 @@ def extract_methylation_data_from_bam(
460
654
  print("\tNot on methylated strand, ignoring.")
461
655
  continue
462
656
 
657
+ # Non-converted filter (Biscuit/bwameth/gem3): count retained
658
+ # non-CpG Cs (forward parent) or Gs (reverse parent) validated
659
+ # against the reference via the MD tag. Applied after the
660
+ # strand check so we don't waste work on daughter-strand reads.
661
+ if filter_non_converted:
662
+ if (
663
+ count_non_cpg_retained_reference(
664
+ aligned_segment,
665
+ bool(bisulfite_parent_strand_is_reverse),
666
+ )
667
+ >= non_converted_threshold
668
+ ):
669
+ if debug:
670
+ print("\tNon-converted filter: dropping read.")
671
+ continue
672
+
463
673
  # Use bisect to find CpGs covered by this read
464
674
  # aligned_segment.reference_start is 0-based inclusive
465
675
  # aligned_segment.reference_end is 0-based exclusive
@@ -492,15 +702,6 @@ def extract_methylation_data_from_bam(
492
702
  "XB"
493
703
  ) # Bisulfite strand tag (YD for Biscuit/bwameth, XB for gem3)
494
704
 
495
- # Ensure each read is only seen once
496
- assert (
497
- aligned_segment.query_name not in debug_read_name_to_row_number
498
- ), "Read seen twice!"
499
- debug_read_name_to_row_number[
500
- aligned_segment.query_name # type: ignore
501
- + ("_1" if aligned_segment.is_read1 else "_2")
502
- ] = read_number
503
-
504
705
  # TODO: We ignore paired/unpaired read status for now. Should we treat paired reads / overlapping reads differently?
505
706
 
506
707
  # get_aligned_pairs returns a list of tuples of (read_pos, ref_pos)
@@ -524,12 +725,7 @@ def extract_methylation_data_from_bam(
524
725
  # query_base_raw = aligned_segment.get_forward_sequence()[query_pos] # raw off sequencer
525
726
  # query_base_no_offset = aligned_segment.query_alignment_sequence[query_pos] # this needs to be offset by the soft clip
526
727
 
527
- # Store the read # in our sparse array
528
- coo_row.append(read_number)
529
-
530
- # Store the CpG site in our sparse array
531
- # TODO: Object orient these inputs? -- lots of bad inheritence style here
532
- coo_col.append(
728
+ read_cpg_cols.append(
533
729
  genome_methylation_embedding.genomic_position_to_embedding(
534
730
  chrom,
535
731
  ref_pos + 1,
@@ -538,30 +734,47 @@ def extract_methylation_data_from_bam(
538
734
 
539
735
  if query_base == "C":
540
736
  # Methylated
541
- coo_data.append(1)
737
+ read_cpg_data.append(1)
542
738
  if debug:
543
739
  print(f"\t{query_pos} {ref_pos} C->{query_base} [Methylated]")
544
740
  elif query_base == "T":
545
- coo_data.append(0)
741
+ read_cpg_data.append(0)
546
742
  # Unmethylated
547
743
  if debug:
548
744
  print(f"\t{query_pos} {ref_pos} C->{query_base} [Unmethylated]")
549
745
  else:
550
- coo_data.append(-1) # or just 0?
746
+ read_cpg_data.append(-1)
551
747
  if debug:
552
748
  print(
553
749
  f"\t{query_pos} {ref_pos} C->{query_base} [Unknown! SNV? Indel?]"
554
750
  )
555
751
 
752
+ if filter_em_overconversion and is_em_overconversion_read(
753
+ read_cpg_data, em_overconversion_min_cpgs
754
+ ):
755
+ if debug:
756
+ print("\tEM over-conversion filter: dropping read.")
757
+ continue
758
+
759
+ if debug:
760
+ # Ensure each read is only seen once
761
+ assert (
762
+ aligned_segment.query_name not in debug_read_name_to_row_number
763
+ ), "Read seen twice!"
764
+ debug_read_name_to_row_number[
765
+ aligned_segment.query_name # type: ignore
766
+ + ("_1" if aligned_segment.is_read1 else "_2")
767
+ ] = read_number
768
+
769
+ coo_row.extend([read_number] * len(read_cpg_cols))
770
+ coo_col.extend(read_cpg_cols)
771
+ coo_data.extend(read_cpg_data)
556
772
  tlen_list.append(aligned_segment.template_length)
557
773
  read_number += 1
558
774
 
559
775
  if debug:
560
776
  print("************************************************\n")
561
777
 
562
- # query_bp = aligned_segment.query_sequence[pileupread.query_position]
563
- # reference_bp = aligned_segment.get_reference_sequence()[aligned_segment.reference_start - pileupcolumn.reference_pos].upper()
564
-
565
778
  ## IIRC there's still a critical edge here, where sometimes we raise ValueError('row index exceeds matrix dimensions')
566
779
 
567
780
  if debug:
@@ -116,6 +116,25 @@ def inspect_npz(npz_path: str) -> None:
116
116
  else:
117
117
  print(" Fragment len: all zero (single-end data)")
118
118
 
119
+ if meta and "filters" in meta:
120
+ filters = meta["filters"]
121
+ active = []
122
+ nc = filters.get("non_converted_reads", {})
123
+ if nc.get("enabled"):
124
+ active.append(f"non-converted (>= {nc.get('threshold')} non-CpG Cs)")
125
+ em = filters.get("em_overconversion", {})
126
+ if em.get("enabled"):
127
+ active.append(
128
+ f"EM over-conversion (all-unmethylated, >= "
129
+ f"{em.get('min_cpgs')} CpGs)"
130
+ )
131
+ if active:
132
+ print(f" Filters: {active[0]}")
133
+ for extra in active[1:]:
134
+ print(f" {extra}")
135
+ else:
136
+ print(" Filters: none")
137
+
119
138
  if meta and "cpg_index_crc32" in meta:
120
139
  print(f" CpG index CRC32: {meta['cpg_index_crc32']}")
121
140
  if meta and "bam2tensor_version" in meta: