bam2tensor 2.4__tar.gz → 2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bam2tensor-2.4 → bam2tensor-2.5}/CLAUDE.md +1 -1
- {bam2tensor-2.4 → bam2tensor-2.5}/PKG-INFO +1 -1
- {bam2tensor-2.4 → bam2tensor-2.5}/pyproject.toml +1 -1
- {bam2tensor-2.4 → bam2tensor-2.5}/src/bam2tensor/__init__.py +1 -1
- {bam2tensor-2.4 → bam2tensor-2.5}/src/bam2tensor/__main__.py +76 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/src/bam2tensor/functions.py +253 -40
- {bam2tensor-2.4 → bam2tensor-2.5}/src/bam2tensor/inspect.py +19 -0
- bam2tensor-2.5/tests/test_filters.py +568 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/tests/test_inspect.py +3 -3
- {bam2tensor-2.4 → bam2tensor-2.5}/uv.lock +1 -1
- {bam2tensor-2.4 → bam2tensor-2.5}/.darglint +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/.editorconfig +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/.gitattributes +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/.github/ISSUE_TEMPLATE/bug_report.yml +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/.github/ISSUE_TEMPLATE/feature_request.yml +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/.github/actions/setup-env/action.yml +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/.github/dependabot.yml +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/.github/labels.yml +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/.github/release-drafter.yml +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/.github/workflows/constraints.txt +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/.github/workflows/docs.yml +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/.github/workflows/labeler.yml +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/.github/workflows/release.yml +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/.github/workflows/tests.yml +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/.gitignore +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/.pre-commit-config.yaml +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/CONTRIBUTING.md +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/LICENSE +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/README.md +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/SECURITY.md +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/docs/Makefile +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/docs/conf.py +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/docs/contributing.md +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/docs/index.md +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/docs/license.md +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/docs/logo/604669_dna turning into math, computer _xl-1024-v1-0.png +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/docs/logo/bam2tensor-logo.afdesign +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/docs/logo/bam2tensor-logo.png +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/docs/make.bat +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/docs/nano-banana-overview-shrunk.png +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/docs/reference.md +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/docs/templates/package.rst_t +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/noxfile.py +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/src/bam2tensor/embedding.py +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/src/bam2tensor/metadata.py +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/src/bam2tensor/py.typed +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/src/bam2tensor/reference.py +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/tests/__init__.py +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/tests/test_duplication.py +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/tests/test_embedding.py +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/tests/test_fasta.fa +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/tests/test_functions.py +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/tests/test_main.py +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/tests/test_metadata.py +0 -0
- {bam2tensor-2.4 → bam2tensor-2.5}/tests/test_reference.py +0 -0
|
@@ -40,7 +40,7 @@ uv run mypy src
|
|
|
40
40
|
|
|
41
41
|
```
|
|
42
42
|
src/bam2tensor/
|
|
43
|
-
__init__.py # Package version (2.4)
|
|
43
|
+
__init__.py # Package version (2.5)
|
|
44
44
|
__main__.py # Click CLI entry point (bam2tensor command)
|
|
45
45
|
inspect.py # Inspect CLI entry point (bam2tensor-inspect command)
|
|
46
46
|
embedding.py # GenomeMethylationEmbedding class (FASTA parsing, CpG indexing)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bam2tensor
|
|
3
|
-
Version: 2.4
|
|
3
|
+
Version: 2.5
|
|
4
4
|
Summary: Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation
|
|
5
5
|
Project-URL: Homepage, https://github.com/mcwdsi/bam2tensor
|
|
6
6
|
Project-URL: Repository, https://github.com/mcwdsi/bam2tensor
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "bam2tensor"
|
|
3
|
-
version = "2.4"
|
|
3
|
+
version = "2.5"
|
|
4
4
|
description = "Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation"
|
|
5
5
|
authors = [{ name = "Nick Semenkovich", email = "semenko@alum.mit.edu" }]
|
|
6
6
|
license = "MIT"
|
|
@@ -229,6 +229,43 @@ def validate_input_output(
|
|
|
229
229
|
default=20,
|
|
230
230
|
type=int,
|
|
231
231
|
)
|
|
232
|
+
@click.option(
|
|
233
|
+
"--filter-non-converted",
|
|
234
|
+
help=(
|
|
235
|
+
"Drop reads with >= --non-converted-threshold retained non-CpG "
|
|
236
|
+
"cytosines, the signature of incomplete bisulfite/EM-seq conversion "
|
|
237
|
+
"(port of nebiolabs/mark-nonconverted-reads). Default: off."
|
|
238
|
+
),
|
|
239
|
+
is_flag=True,
|
|
240
|
+
)
|
|
241
|
+
@click.option(
|
|
242
|
+
"--non-converted-threshold",
|
|
243
|
+
help=(
|
|
244
|
+
"Minimum count of retained non-CpG cytosines to drop a read "
|
|
245
|
+
"(default = 3, matches NEB mark-nonconverted-reads)."
|
|
246
|
+
),
|
|
247
|
+
default=3,
|
|
248
|
+
type=int,
|
|
249
|
+
)
|
|
250
|
+
@click.option(
|
|
251
|
+
"--filter-em-overconversion",
|
|
252
|
+
help=(
|
|
253
|
+
"Drop EM-seq reads whose covered CpGs are all called unmethylated "
|
|
254
|
+
"and cover at least --em-overconversion-min-cpgs sites (heuristic "
|
|
255
|
+
"for the fragment-level over-conversion artifact described in "
|
|
256
|
+
"Loyfer et al. bioRxiv 2026.03.24.713040). Default: off."
|
|
257
|
+
),
|
|
258
|
+
is_flag=True,
|
|
259
|
+
)
|
|
260
|
+
@click.option(
|
|
261
|
+
"--em-overconversion-min-cpgs",
|
|
262
|
+
help=(
|
|
263
|
+
"Minimum covered CpG count required before the EM over-conversion "
|
|
264
|
+
"filter will drop a read (default = 3)."
|
|
265
|
+
),
|
|
266
|
+
default=3,
|
|
267
|
+
type=int,
|
|
268
|
+
)
|
|
232
269
|
@click.option("--verbose", help="Verbose output.", is_flag=True)
|
|
233
270
|
@click.option("--skip-cache", help="De-novo generate CpG sites (slow).", is_flag=True)
|
|
234
271
|
@click.option(
|
|
@@ -263,6 +300,10 @@ def main(
|
|
|
263
300
|
expected_chromosomes: str | None,
|
|
264
301
|
reference_fasta: str | None,
|
|
265
302
|
quality_limit: int,
|
|
303
|
+
filter_non_converted: bool,
|
|
304
|
+
non_converted_threshold: int,
|
|
305
|
+
filter_em_overconversion: bool,
|
|
306
|
+
em_overconversion_min_cpgs: int,
|
|
266
307
|
verbose: bool,
|
|
267
308
|
skip_cache: bool,
|
|
268
309
|
debug: bool,
|
|
@@ -300,6 +341,17 @@ def main(
|
|
|
300
341
|
``--download-reference`` is used.
|
|
301
342
|
quality_limit: Minimum mapping quality (MAPQ) threshold. Reads below
|
|
302
343
|
this quality are excluded.
|
|
344
|
+
filter_non_converted: If True, drop reads with at least
|
|
345
|
+
``non_converted_threshold`` retained non-CpG cytosines —
|
|
346
|
+
indicating incomplete bisulfite/EM-seq conversion.
|
|
347
|
+
non_converted_threshold: Threshold used by the non-converted
|
|
348
|
+
read filter.
|
|
349
|
+
filter_em_overconversion: If True, drop reads whose covered CpGs
|
|
350
|
+
are all called unmethylated and cover at least
|
|
351
|
+
``em_overconversion_min_cpgs`` sites — heuristic for EM-seq
|
|
352
|
+
fragment-level over-conversion (Loyfer et al. 2026).
|
|
353
|
+
em_overconversion_min_cpgs: Minimum covered CpG count required
|
|
354
|
+
before the over-conversion filter will drop a read.
|
|
303
355
|
verbose: If True, print detailed progress information.
|
|
304
356
|
skip_cache: If True, regenerate the CpG site index even if a cache
|
|
305
357
|
file exists.
|
|
@@ -382,6 +434,16 @@ def main(
|
|
|
382
434
|
print(f" Reference: {reference_fasta}")
|
|
383
435
|
print(f" Chromosomes: {chrom_display}")
|
|
384
436
|
print(f" Quality limit: MAPQ >= {quality_limit}")
|
|
437
|
+
if filter_non_converted:
|
|
438
|
+
print(
|
|
439
|
+
f" Filters: non-converted reads (>= "
|
|
440
|
+
f"{non_converted_threshold} retained non-CpG Cs)"
|
|
441
|
+
)
|
|
442
|
+
if filter_em_overconversion:
|
|
443
|
+
print(
|
|
444
|
+
f" EM over-conversion (all-unmethylated, >= "
|
|
445
|
+
f"{em_overconversion_min_cpgs} CpGs)"
|
|
446
|
+
)
|
|
385
447
|
if output_dir:
|
|
386
448
|
print(f" Output dir: {output_dir}")
|
|
387
449
|
else:
|
|
@@ -448,6 +510,10 @@ def main(
|
|
|
448
510
|
input_bam=input_bam,
|
|
449
511
|
genome_methylation_embedding=genome_methylation_embedding,
|
|
450
512
|
quality_limit=quality_limit,
|
|
513
|
+
filter_non_converted=filter_non_converted,
|
|
514
|
+
non_converted_threshold=non_converted_threshold,
|
|
515
|
+
filter_em_overconversion=filter_em_overconversion,
|
|
516
|
+
em_overconversion_min_cpgs=em_overconversion_min_cpgs,
|
|
451
517
|
verbose=verbose,
|
|
452
518
|
debug=debug,
|
|
453
519
|
)
|
|
@@ -476,6 +542,16 @@ def main(
|
|
|
476
542
|
"expected_chromosomes": chrom_list,
|
|
477
543
|
"total_cpg_sites": genome_methylation_embedding.total_cpg_sites,
|
|
478
544
|
"cpg_index_crc32": cpg_crc32,
|
|
545
|
+
"filters": {
|
|
546
|
+
"non_converted_reads": {
|
|
547
|
+
"enabled": filter_non_converted,
|
|
548
|
+
"threshold": non_converted_threshold,
|
|
549
|
+
},
|
|
550
|
+
"em_overconversion": {
|
|
551
|
+
"enabled": filter_em_overconversion,
|
|
552
|
+
"min_cpgs": em_overconversion_min_cpgs,
|
|
553
|
+
},
|
|
554
|
+
},
|
|
479
555
|
},
|
|
480
556
|
)
|
|
481
557
|
print(f" Output: {output_file}")
|
|
@@ -80,6 +80,146 @@ class ExtractionResult(NamedTuple):
|
|
|
80
80
|
_SKIP_FLAGS = 0x400 | 0x200 | 0x100 | 0x800
|
|
81
81
|
|
|
82
82
|
|
|
83
|
+
def count_non_cpg_retained_xm(xm_tag: str) -> int:
    """Tally retained non-CpG cytosine calls in a Bismark ``XM`` string.

    The ``XM`` tag carries one methylation-context character per read
    base. An uppercase letter marks a cytosine that is still a ``C`` in
    the read, i.e. one that bisulfite/EM-seq treatment did *not*
    convert. The three uppercase codes outside CpG context are ``H``
    (CHH), ``X`` (CHG) and ``U`` (unknown context); many of them on one
    read point to incomplete conversion.

    Args:
        xm_tag: The value of a read's Bismark ``XM`` tag.

    Returns:
        How many characters of ``xm_tag`` are ``H``, ``X`` or ``U``.
        Lowercase (converted) calls and CpG calls (``Z``/``z``) are
        not counted.

    Example:
        >>> count_non_cpg_retained_xm("..Z..hhh..HHH..z..")
        3
    """
    # Uppercase only: lowercase h/x/u are converted (unmethylated) calls.
    retained_codes = ("H", "X", "U")
    return sum(xm_tag.count(code) for code in retained_codes)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def count_non_cpg_retained_reference(
    aligned_segment: pysam.AlignedSegment,
    is_reverse_parent_strand: bool,
) -> int:
    """Count unconverted non-CpG bases on a read, checked against the reference.

    After complete bisulfite/EM-seq conversion, no non-CpG cytosine of
    the parent strand should survive: forward-parent reads show ``T``
    wherever the reference has a non-CpG ``C``, and reverse-parent reads
    show ``A`` wherever the reference has a non-CpG ``G``. A position is
    counted as *retained* when the read matches the reference at a
    ``C`` (forward) or ``G`` (reverse) that is not part of a CpG.

    The reference bases come from the read's own ``MD`` tag, via
    ``get_aligned_pairs(matches_only=True, with_seq=True)``, so no
    reference FASTA is needed. This follows the approach of
    ``nebiolabs/mark-nonconverted-reads``.

    Args:
        aligned_segment: A pysam aligned read carrying an ``MD`` tag.
        is_reverse_parent_strand: ``True`` for reads from the reverse
            (OB/CTOB) bisulfite parent strand, ``False`` for the
            forward (OT/CTOT) strand.

    Returns:
        The number of reference-validated retained non-CpG cytosines
        (guanines for the reverse parent strand). ``0`` is returned when
        the read has no sequence, or when fetching the aligned pairs
        raises ``ValueError`` (the ``MD`` tag is missing).

    Note:
        CpG context is looked up only among this read's aligned
        positions. If the neighbouring reference position is not in that
        set (for example at the edge of the alignment), the context
        check cannot exclude the base and it is counted as non-CpG.
    """
    if aligned_segment.query_sequence is None:
        return 0

    try:
        aligned = aligned_segment.get_aligned_pairs(matches_only=True, with_seq=True)
    except ValueError:
        # No MD tag: the reference bases cannot be reconstructed.
        return 0

    # For the strand in question: which reference base signals an
    # unconverted position, and where/what its CpG partner would be.
    # Forward: a C followed by G is CpG. Reverse: a G preceded by C is CpG.
    if is_reverse_parent_strand:
        retained_base, partner_offset, partner_base = "G", -1, "C"
    else:
        retained_base, partner_offset, partner_base = "C", 1, "G"

    # Upper-cased reference base per aligned reference position, used
    # purely for the neighbour (CpG context) lookup.
    reference_at = {}
    for _, ref_pos, base in aligned:
        reference_at[ref_pos] = base.upper()

    # pysam reports the reference base in uppercase when the read
    # matches it and in lowercase on a mismatch, so comparing against the
    # uppercase target keeps only positions where the read still shows
    # the unconverted base; converted positions and SNPs drop out.
    return sum(
        1
        for _, ref_pos, base in aligned
        if base == retained_base
        and reference_at.get(ref_pos + partner_offset) != partner_base
    )
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def is_em_overconversion_read(
    read_cpg_states: list[int],
    min_cpgs: int,
) -> bool:
    """Identify reads flagged as EM-seq fragment-level over-conversion.

    Loyfer et al. (bioRxiv 2026.03.24.713040) report that EM-seq
    produces a reproducible ~1-2.5% of multi-CpG fragments that appear
    fully unmethylated across every covered CpG. Without a per-region
    methylation prior, the simplest matching correction is to drop reads
    whose covered CpGs are all called unmethylated *and* that cover at
    least ``min_cpgs`` sites.

    This heuristic also drops genuinely fully-unmethylated biological
    fragments, so callers should opt in only when the downstream
    application can tolerate that trade-off.

    Args:
        read_cpg_states: Per-CpG methylation state values for a single
            read, using the bam2tensor encoding (``1``=methylated,
            ``0``=unmethylated, ``-1``=no data).
        min_cpgs: Minimum number of covered CpGs required to apply the
            filter. Reads with fewer covered CpGs are never flagged.

    Returns:
        ``True`` when the read has at least one and at least
        ``min_cpgs`` entries and every entry is ``0``. Any ``1`` or
        ``-1`` entry makes the result ``False``. A read with no covered
        CpGs is never flagged, whatever ``min_cpgs`` is.

    Example:
        >>> is_em_overconversion_read([0, 0, 0], min_cpgs=3)
        True
        >>> is_em_overconversion_read([0, 0, 1], min_cpgs=3)
        False
        >>> is_em_overconversion_read([0, 0], min_cpgs=3)
        False
        >>> is_em_overconversion_read([], min_cpgs=0)
        False
    """
    covered = len(read_cpg_states)
    # all() over an empty sequence is vacuously True, so with a
    # non-positive min_cpgs an empty read would otherwise be flagged even
    # though it carries no evidence of over-conversion.
    if covered == 0 or covered < min_cpgs:
        return False
    return all(state == 0 for state in read_cpg_states)
|
|
221
|
+
|
|
222
|
+
|
|
83
223
|
def detect_aligner(input_bam: str, sample_size: int = 1000) -> str:
|
|
84
224
|
"""Detect the aligner used to produce a BAM file by checking read tags.
|
|
85
225
|
|
|
@@ -198,6 +338,10 @@ def extract_methylation_data_from_bam(
|
|
|
198
338
|
input_bam: str,
|
|
199
339
|
genome_methylation_embedding: GenomeMethylationEmbedding,
|
|
200
340
|
quality_limit: int = 20,
|
|
341
|
+
filter_non_converted: bool = False,
|
|
342
|
+
non_converted_threshold: int = 3,
|
|
343
|
+
filter_em_overconversion: bool = False,
|
|
344
|
+
em_overconversion_min_cpgs: int = 3,
|
|
201
345
|
verbose: bool = False,
|
|
202
346
|
debug: bool = False,
|
|
203
347
|
) -> ExtractionResult:
|
|
@@ -216,6 +360,17 @@ def extract_methylation_data_from_bam(
|
|
|
216
360
|
- For Biscuit/bwameth/gem3: only parent-strand reads are processed
|
|
217
361
|
- For Bismark: all reads are processed (XM tag has pre-resolved calls)
|
|
218
362
|
|
|
363
|
+
Two additional, opt-in per-read filters are available:
|
|
364
|
+
- Non-converted reads (``filter_non_converted``): drops reads with
|
|
365
|
+
too many retained non-CpG cytosines, the hallmark of incomplete
|
|
366
|
+
bisulfite/EM-seq conversion. Ports the logic of
|
|
367
|
+
``nebiolabs/mark-nonconverted-reads``.
|
|
368
|
+
- EM-seq fragment-level over-conversion
|
|
369
|
+
(``filter_em_overconversion``): drops reads whose covered CpGs
|
|
370
|
+
are all called unmethylated, a heuristic for the EM-seq
|
|
371
|
+
artifact described by Loyfer et al.
|
|
372
|
+
(bioRxiv 2026.03.24.713040).
|
|
373
|
+
|
|
219
374
|
Two extraction paths are supported, detected automatically per-read:
|
|
220
375
|
|
|
221
376
|
**Bismark path** (XM tag present):
|
|
@@ -238,6 +393,23 @@ def extract_methylation_data_from_bam(
|
|
|
238
393
|
quality_limit: Minimum mapping quality (MAPQ) threshold for reads.
|
|
239
394
|
Reads with MAPQ below this value are skipped. Default is 20,
|
|
240
395
|
which excludes reads mapping to multiple locations equally well.
|
|
396
|
+
filter_non_converted: If True, drop reads that carry at least
|
|
397
|
+
``non_converted_threshold`` retained non-CpG cytosines, a
|
|
398
|
+
signature of incomplete bisulfite/EM-seq conversion. Default
|
|
399
|
+
False.
|
|
400
|
+
non_converted_threshold: Minimum count of retained non-CpG
|
|
401
|
+
cytosines required for the non-converted filter to drop a
|
|
402
|
+
read. Matches the NEB ``mark-nonconverted-reads`` default of
|
|
403
|
+
3.
|
|
404
|
+
filter_em_overconversion: If True, drop reads whose covered CpGs
|
|
405
|
+
are all called unmethylated and cover at least
|
|
406
|
+
``em_overconversion_min_cpgs`` sites — the Loyfer et al.
|
|
407
|
+
EM-seq fragment-level over-conversion heuristic. Default
|
|
408
|
+
False.
|
|
409
|
+
em_overconversion_min_cpgs: Minimum covered CpG count required
|
|
410
|
+
before the over-conversion filter will drop a read. Matches
|
|
411
|
+
the regime in Loyfer et al. Fig. 1C where the EM-seq
|
|
412
|
+
artifact is clearly separable from WGBS.
|
|
241
413
|
verbose: If True, display a progress bar and print the total read
|
|
242
414
|
count. Useful for monitoring progress on large files.
|
|
243
415
|
debug: If True, enable extensive validation and debug output.
|
|
@@ -354,6 +526,12 @@ def extract_methylation_data_from_bam(
|
|
|
354
526
|
if aligned_segment.flag & _SKIP_FLAGS:
|
|
355
527
|
continue
|
|
356
528
|
|
|
529
|
+
# Per-read buffers. We only flush these into the global
|
|
530
|
+
# coo_* arrays once the read passes all filters (including
|
|
531
|
+
# the post-CpG EM over-conversion filter).
|
|
532
|
+
read_cpg_cols: list[int] = []
|
|
533
|
+
read_cpg_data: list[int] = []
|
|
534
|
+
|
|
357
535
|
# ============================================================
|
|
358
536
|
# Bismark path: XM tag contains pre-resolved methylation calls.
|
|
359
537
|
# No strand filtering needed — Bismark already resolved strand
|
|
@@ -363,6 +541,13 @@ def extract_methylation_data_from_bam(
|
|
|
363
541
|
if aligned_segment.has_tag("XM"):
|
|
364
542
|
xm_tag: str = aligned_segment.get_tag("XM") # type: ignore[assignment]
|
|
365
543
|
|
|
544
|
+
# Non-converted filter (Bismark): XM tag already encodes
|
|
545
|
+
# retained non-CpG cytosines as H/X/U. Apply before any
|
|
546
|
+
# CpG work so we bail as early as possible.
|
|
547
|
+
if filter_non_converted:
|
|
548
|
+
if count_non_cpg_retained_xm(xm_tag) >= non_converted_threshold:
|
|
549
|
+
continue
|
|
550
|
+
|
|
366
551
|
# Find CpG sites covered by this read
|
|
367
552
|
start_idx = bisect.bisect_left(
|
|
368
553
|
cpg_sites, aligned_segment.reference_start + 1
|
|
@@ -385,7 +570,6 @@ def extract_methylation_data_from_bam(
|
|
|
385
570
|
if debug:
|
|
386
571
|
print(f"Query (Bismark): {aligned_segment.query_name}")
|
|
387
572
|
|
|
388
|
-
has_cpg_data = False
|
|
389
573
|
for query_pos, ref_pos in this_segment_cpgs:
|
|
390
574
|
# Bounds check: XM tag should match query length, but be defensive
|
|
391
575
|
if query_pos >= len(xm_tag):
|
|
@@ -393,39 +577,49 @@ def extract_methylation_data_from_bam(
|
|
|
393
577
|
|
|
394
578
|
xm_char = xm_tag[query_pos]
|
|
395
579
|
if xm_char == "Z":
|
|
396
|
-
|
|
580
|
+
read_cpg_data.append(1) # Methylated CpG
|
|
397
581
|
elif xm_char == "z":
|
|
398
|
-
|
|
582
|
+
read_cpg_data.append(0) # Unmethylated CpG
|
|
399
583
|
else:
|
|
400
584
|
# Non-CpG context at a CpG site (shouldn't happen
|
|
401
585
|
# normally, but possible with edge-case alignments)
|
|
402
|
-
|
|
586
|
+
read_cpg_data.append(-1)
|
|
403
587
|
|
|
404
|
-
|
|
405
|
-
coo_col.append(
|
|
588
|
+
read_cpg_cols.append(
|
|
406
589
|
genome_methylation_embedding.genomic_position_to_embedding(
|
|
407
590
|
chrom,
|
|
408
591
|
ref_pos + 1,
|
|
409
592
|
)
|
|
410
593
|
)
|
|
411
|
-
has_cpg_data = True
|
|
412
594
|
|
|
413
595
|
if debug:
|
|
414
596
|
print(f"\t{query_pos} {ref_pos} XM={xm_char}")
|
|
415
597
|
|
|
416
|
-
if
|
|
598
|
+
if not read_cpg_data:
|
|
599
|
+
continue
|
|
600
|
+
|
|
601
|
+
if filter_em_overconversion and is_em_overconversion_read(
|
|
602
|
+
read_cpg_data, em_overconversion_min_cpgs
|
|
603
|
+
):
|
|
417
604
|
if debug:
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
605
|
+
print("\tEM over-conversion filter: dropping read.")
|
|
606
|
+
continue
|
|
607
|
+
|
|
608
|
+
if debug:
|
|
609
|
+
read_key = aligned_segment.query_name + ( # type: ignore
|
|
610
|
+
"_1" if aligned_segment.is_read1 else "_2"
|
|
611
|
+
)
|
|
612
|
+
assert (
|
|
613
|
+
read_key not in debug_read_name_to_row_number
|
|
614
|
+
), "Read seen twice!"
|
|
615
|
+
debug_read_name_to_row_number[read_key] = read_number
|
|
616
|
+
print("************************************************\n")
|
|
617
|
+
|
|
618
|
+
coo_row.extend([read_number] * len(read_cpg_cols))
|
|
619
|
+
coo_col.extend(read_cpg_cols)
|
|
620
|
+
coo_data.extend(read_cpg_data)
|
|
621
|
+
tlen_list.append(aligned_segment.template_length)
|
|
622
|
+
read_number += 1
|
|
429
623
|
|
|
430
624
|
continue # Skip the Biscuit/bwameth/gem3 path below
|
|
431
625
|
|
|
@@ -460,6 +654,22 @@ def extract_methylation_data_from_bam(
|
|
|
460
654
|
print("\tNot on methylated strand, ignoring.")
|
|
461
655
|
continue
|
|
462
656
|
|
|
657
|
+
# Non-converted filter (Biscuit/bwameth/gem3): count retained
|
|
658
|
+
# non-CpG Cs (forward parent) or Gs (reverse parent) validated
|
|
659
|
+
# against the reference via the MD tag. Applied after the
|
|
660
|
+
# strand check so we don't waste work on daughter-strand reads.
|
|
661
|
+
if filter_non_converted:
|
|
662
|
+
if (
|
|
663
|
+
count_non_cpg_retained_reference(
|
|
664
|
+
aligned_segment,
|
|
665
|
+
bool(bisulfite_parent_strand_is_reverse),
|
|
666
|
+
)
|
|
667
|
+
>= non_converted_threshold
|
|
668
|
+
):
|
|
669
|
+
if debug:
|
|
670
|
+
print("\tNon-converted filter: dropping read.")
|
|
671
|
+
continue
|
|
672
|
+
|
|
463
673
|
# Use bisect to find CpGs covered by this read
|
|
464
674
|
# aligned_segment.reference_start is 0-based inclusive
|
|
465
675
|
# aligned_segment.reference_end is 0-based exclusive
|
|
@@ -492,15 +702,6 @@ def extract_methylation_data_from_bam(
|
|
|
492
702
|
"XB"
|
|
493
703
|
) # Bisulfite strand tag (YD for Biscuit/bwameth, XB for gem3)
|
|
494
704
|
|
|
495
|
-
# Ensure each read is only seen once
|
|
496
|
-
assert (
|
|
497
|
-
aligned_segment.query_name not in debug_read_name_to_row_number
|
|
498
|
-
), "Read seen twice!"
|
|
499
|
-
debug_read_name_to_row_number[
|
|
500
|
-
aligned_segment.query_name # type: ignore
|
|
501
|
-
+ ("_1" if aligned_segment.is_read1 else "_2")
|
|
502
|
-
] = read_number
|
|
503
|
-
|
|
504
705
|
# TODO: We ignore paired/unpaired read status for now. Should we treat paired reads / overlapping reads differently?
|
|
505
706
|
|
|
506
707
|
# get_aligned_pairs returns a list of tuples of (read_pos, ref_pos)
|
|
@@ -524,12 +725,7 @@ def extract_methylation_data_from_bam(
|
|
|
524
725
|
# query_base_raw = aligned_segment.get_forward_sequence()[query_pos] # raw off sequencer
|
|
525
726
|
# query_base_no_offset = aligned_segment.query_alignment_sequence[query_pos] # this needs to be offset by the soft clip
|
|
526
727
|
|
|
527
|
-
|
|
528
|
-
coo_row.append(read_number)
|
|
529
|
-
|
|
530
|
-
# Store the CpG site in our sparse array
|
|
531
|
-
# TODO: Object orient these inputs? -- lots of bad inheritence style here
|
|
532
|
-
coo_col.append(
|
|
728
|
+
read_cpg_cols.append(
|
|
533
729
|
genome_methylation_embedding.genomic_position_to_embedding(
|
|
534
730
|
chrom,
|
|
535
731
|
ref_pos + 1,
|
|
@@ -538,30 +734,47 @@ def extract_methylation_data_from_bam(
|
|
|
538
734
|
|
|
539
735
|
if query_base == "C":
|
|
540
736
|
# Methylated
|
|
541
|
-
|
|
737
|
+
read_cpg_data.append(1)
|
|
542
738
|
if debug:
|
|
543
739
|
print(f"\t{query_pos} {ref_pos} C->{query_base} [Methylated]")
|
|
544
740
|
elif query_base == "T":
|
|
545
|
-
|
|
741
|
+
read_cpg_data.append(0)
|
|
546
742
|
# Unmethylated
|
|
547
743
|
if debug:
|
|
548
744
|
print(f"\t{query_pos} {ref_pos} C->{query_base} [Unmethylated]")
|
|
549
745
|
else:
|
|
550
|
-
|
|
746
|
+
read_cpg_data.append(-1)
|
|
551
747
|
if debug:
|
|
552
748
|
print(
|
|
553
749
|
f"\t{query_pos} {ref_pos} C->{query_base} [Unknown! SNV? Indel?]"
|
|
554
750
|
)
|
|
555
751
|
|
|
752
|
+
if filter_em_overconversion and is_em_overconversion_read(
|
|
753
|
+
read_cpg_data, em_overconversion_min_cpgs
|
|
754
|
+
):
|
|
755
|
+
if debug:
|
|
756
|
+
print("\tEM over-conversion filter: dropping read.")
|
|
757
|
+
continue
|
|
758
|
+
|
|
759
|
+
if debug:
|
|
760
|
+
# Ensure each read is only seen once
|
|
761
|
+
assert (
|
|
762
|
+
aligned_segment.query_name not in debug_read_name_to_row_number
|
|
763
|
+
), "Read seen twice!"
|
|
764
|
+
debug_read_name_to_row_number[
|
|
765
|
+
aligned_segment.query_name # type: ignore
|
|
766
|
+
+ ("_1" if aligned_segment.is_read1 else "_2")
|
|
767
|
+
] = read_number
|
|
768
|
+
|
|
769
|
+
coo_row.extend([read_number] * len(read_cpg_cols))
|
|
770
|
+
coo_col.extend(read_cpg_cols)
|
|
771
|
+
coo_data.extend(read_cpg_data)
|
|
556
772
|
tlen_list.append(aligned_segment.template_length)
|
|
557
773
|
read_number += 1
|
|
558
774
|
|
|
559
775
|
if debug:
|
|
560
776
|
print("************************************************\n")
|
|
561
777
|
|
|
562
|
-
# query_bp = aligned_segment.query_sequence[pileupread.query_position]
|
|
563
|
-
# reference_bp = aligned_segment.get_reference_sequence()[aligned_segment.reference_start - pileupcolumn.reference_pos].upper()
|
|
564
|
-
|
|
565
778
|
## IIRC there's still a critical edge here, where sometimes we raise ValueError('row index exceeds matrix dimensions')
|
|
566
779
|
|
|
567
780
|
if debug:
|
|
@@ -116,6 +116,25 @@ def inspect_npz(npz_path: str) -> None:
|
|
|
116
116
|
else:
|
|
117
117
|
print(" Fragment len: all zero (single-end data)")
|
|
118
118
|
|
|
119
|
+
if meta and "filters" in meta:
|
|
120
|
+
filters = meta["filters"]
|
|
121
|
+
active = []
|
|
122
|
+
nc = filters.get("non_converted_reads", {})
|
|
123
|
+
if nc.get("enabled"):
|
|
124
|
+
active.append(f"non-converted (>= {nc.get('threshold')} non-CpG Cs)")
|
|
125
|
+
em = filters.get("em_overconversion", {})
|
|
126
|
+
if em.get("enabled"):
|
|
127
|
+
active.append(
|
|
128
|
+
f"EM over-conversion (all-unmethylated, >= "
|
|
129
|
+
f"{em.get('min_cpgs')} CpGs)"
|
|
130
|
+
)
|
|
131
|
+
if active:
|
|
132
|
+
print(f" Filters: {active[0]}")
|
|
133
|
+
for extra in active[1:]:
|
|
134
|
+
print(f" {extra}")
|
|
135
|
+
else:
|
|
136
|
+
print(" Filters: none")
|
|
137
|
+
|
|
119
138
|
if meta and "cpg_index_crc32" in meta:
|
|
120
139
|
print(f" CpG index CRC32: {meta['cpg_index_crc32']}")
|
|
121
140
|
if meta and "bam2tensor_version" in meta:
|