PyPI - bam2tensor - Versions diffs - 2.4__tar.gz → 2.5__tar.gz - Mend

bam2tensor 2.4__tar.gz → 2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

{bam2tensor-2.4 → bam2tensor-2.5}/CLAUDE.md RENAMED Viewed

@@ -40,7 +40,7 @@ uv run mypy src
 ```
 src/bam2tensor/
-  __init__.py      # Package version (2.4)
+  __init__.py      # Package version (2.5)
   __main__.py      # Click CLI entry point (bam2tensor command)
   inspect.py       # Inspect CLI entry point (bam2tensor-inspect command)
   embedding.py     # GenomeMethylationEmbedding class (FASTA parsing, CpG indexing)

{bam2tensor-2.4 → bam2tensor-2.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: bam2tensor
-Version: 2.4
+Version: 2.5
 Summary: Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation
 Project-URL: Homepage, https://github.com/mcwdsi/bam2tensor
 Project-URL: Repository, https://github.com/mcwdsi/bam2tensor

{bam2tensor-2.4 → bam2tensor-2.5}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "bam2tensor"
-version = "2.4"
+version = "2.5"
 description = "Convert bisulfite-seq and EM-seq BAM files to sparse tensor representations of DNA methylation"
 authors = [{ name = "Nick Semenkovich", email = "semenko@alum.mit.edu" }]
 license = "MIT"

{bam2tensor-2.4 → bam2tensor-2.5}/src/bam2tensor/__init__.py RENAMED Viewed

@@ -50,4 +50,4 @@ See Also:
     - https://mcwdsi.github.io/bam2tensor for full documentation
 """
-__version__ = "2.4"
+__version__ = "2.5"

{bam2tensor-2.4 → bam2tensor-2.5}/src/bam2tensor/__main__.py RENAMED Viewed

@@ -229,6 +229,43 @@ def validate_input_output(
     default=20,
     type=int,
 )
+@click.option(
+    "--filter-non-converted",
+    help=(
+        "Drop reads with >= --non-converted-threshold retained non-CpG "
+        "cytosines, the signature of incomplete bisulfite/EM-seq conversion "
+        "(port of nebiolabs/mark-nonconverted-reads). Default: off."
+    ),
+    is_flag=True,
+)
+@click.option(
+    "--non-converted-threshold",
+    help=(
+        "Minimum count of retained non-CpG cytosines to drop a read "
+        "(default = 3, matches NEB mark-nonconverted-reads)."
+    ),
+    default=3,
+    type=int,
+)
+@click.option(
+    "--filter-em-overconversion",
+    help=(
+        "Drop EM-seq reads whose covered CpGs are all called unmethylated "
+        "and cover at least --em-overconversion-min-cpgs sites (heuristic "
+        "for the fragment-level over-conversion artifact described in "
+        "Loyfer et al. bioRxiv 2026.03.24.713040). Default: off."
+    ),
+    is_flag=True,
+)
+@click.option(
+    "--em-overconversion-min-cpgs",
+    help=(
+        "Minimum covered CpG count required before the EM over-conversion "
+        "filter will drop a read (default = 3)."
+    ),
+    default=3,
+    type=int,
+)
 @click.option("--verbose", help="Verbose output.", is_flag=True)
 @click.option("--skip-cache", help="De-novo generate CpG sites (slow).", is_flag=True)
 @click.option(
@@ -263,6 +300,10 @@ def main(
     expected_chromosomes: str | None,
     reference_fasta: str | None,
     quality_limit: int,
+    filter_non_converted: bool,
+    non_converted_threshold: int,
+    filter_em_overconversion: bool,
+    em_overconversion_min_cpgs: int,
     verbose: bool,
     skip_cache: bool,
     debug: bool,
@@ -300,6 +341,17 @@ def main(
             ``--download-reference`` is used.
         quality_limit: Minimum mapping quality (MAPQ) threshold. Reads below
             this quality are excluded.
+        filter_non_converted: If True, drop reads with at least
+            ``non_converted_threshold`` retained non-CpG cytosines —
+            indicating incomplete bisulfite/EM-seq conversion.
+        non_converted_threshold: Threshold used by the non-converted
+            read filter.
+        filter_em_overconversion: If True, drop reads whose covered CpGs
+            are all called unmethylated and cover at least
+            ``em_overconversion_min_cpgs`` sites — heuristic for EM-seq
+            fragment-level over-conversion (Loyfer et al. 2026).
+        em_overconversion_min_cpgs: Minimum covered CpG count required
+            before the over-conversion filter will drop a read.
         verbose: If True, print detailed progress information.
         skip_cache: If True, regenerate the CpG site index even if a cache
             file exists.
@@ -382,6 +434,16 @@ def main(
     print(f"  Reference:     {reference_fasta}")
     print(f"  Chromosomes:   {chrom_display}")
     print(f"  Quality limit: MAPQ >= {quality_limit}")
+    if filter_non_converted:
+        print(
+            f"  Filters:       non-converted reads (>= "
+            f"{non_converted_threshold} retained non-CpG Cs)"
+        )
+    if filter_em_overconversion:
+        print(
+            f"                 EM over-conversion (all-unmethylated, >= "
+            f"{em_overconversion_min_cpgs} CpGs)"
+        )
     if output_dir:
         print(f"  Output dir:    {output_dir}")
     else:
@@ -448,6 +510,10 @@ def main(
                 input_bam=input_bam,
                 genome_methylation_embedding=genome_methylation_embedding,
                 quality_limit=quality_limit,
+                filter_non_converted=filter_non_converted,
+                non_converted_threshold=non_converted_threshold,
+                filter_em_overconversion=filter_em_overconversion,
+                em_overconversion_min_cpgs=em_overconversion_min_cpgs,
                 verbose=verbose,
                 debug=debug,
             )
@@ -476,6 +542,16 @@ def main(
                 "expected_chromosomes": chrom_list,
                 "total_cpg_sites": genome_methylation_embedding.total_cpg_sites,
                 "cpg_index_crc32": cpg_crc32,
+                "filters": {
+                    "non_converted_reads": {
+                        "enabled": filter_non_converted,
+                        "threshold": non_converted_threshold,
+                    },
+                    "em_overconversion": {
+                        "enabled": filter_em_overconversion,
+                        "min_cpgs": em_overconversion_min_cpgs,
+                    },
+                },
             },
         )
         print(f"  Output:        {output_file}")

{bam2tensor-2.4 → bam2tensor-2.5}/src/bam2tensor/functions.py RENAMED Viewed

@@ -80,6 +80,146 @@ class ExtractionResult(NamedTuple):
 _SKIP_FLAGS = 0x400 | 0x200 | 0x100 | 0x800
+def count_non_cpg_retained_xm(xm_tag: str) -> int:
+    """Count retained non-CpG cytosines in a Bismark XM methylation string.
+    Bismark's ``XM`` tag encodes per-base methylation context. Uppercase
+    letters indicate a cytosine that remained as ``C`` in the read
+    (i.e., was *not* converted by bisulfite/EM-seq treatment). ``H``,
+    ``X`` and ``U`` correspond to retained cytosines in CHH, CHG and
+    unknown-context positions respectively. A high count of these on a
+    single read is a strong signal of incomplete conversion.
+    Args:
+        xm_tag: The value of a read's Bismark ``XM`` tag.
+    Returns:
+        The count of ``H``, ``X`` and ``U`` characters in ``xm_tag``.
+    Example:
+        >>> count_non_cpg_retained_xm("..Z..hhh..HHH..z..")
+        3
+    """
+    return xm_tag.count("H") + xm_tag.count("X") + xm_tag.count("U")
+def count_non_cpg_retained_reference(
+    aligned_segment: pysam.AlignedSegment,
+    is_reverse_parent_strand: bool,
+) -> int:
+    """Count retained non-CpG bases validated against the reference.
+    For a correctly bisulfite- or EM-seq-converted read, every
+    non-CpG cytosine on the parent strand should have been converted.
+    On the forward-parent strand that means every non-CpG ``C`` in the
+    reference should appear as ``T`` in the read; on the reverse-parent
+    strand every non-CpG ``G`` should appear as ``A``. Positions where
+    the read still carries the unconverted base *and* the reference
+    genuinely has a ``C``/``G`` there (i.e., the read base matches the
+    reference rather than being a SNP) count as retained.
+    This is a faithful port of the logic in
+    ``nebiolabs/mark-nonconverted-reads``, re-using the read's existing
+    ``MD`` tag (via :py:meth:`pysam.AlignedSegment.get_aligned_pairs`
+    with ``with_seq=True``) instead of requiring a separate reference
+    FASTA.
+    Args:
+        aligned_segment: A pysam aligned read. Must carry an ``MD``
+            tag; BAMs produced by Bismark, Biscuit, bwameth and gem3
+            all set this tag by default.
+        is_reverse_parent_strand: ``True`` if the read derives from the
+            reverse (OB/CTOB) bisulfite parent strand, ``False`` for
+            the forward (OT/CTOT) strand.
+    Returns:
+        The number of reference-validated retained non-CpG
+        cytosines (or guanines, for the reverse parent strand). Returns
+        ``0`` when the read has no sequence or no ``MD`` tag is present.
+    """
+    if aligned_segment.query_sequence is None:
+        return 0
+    try:
+        pairs = aligned_segment.get_aligned_pairs(matches_only=True, with_seq=True)
+    except ValueError:
+        # MD tag missing — cannot validate against reference.
+        return 0
+    # Map ref_pos → reference base (uppercase) for CpG-context lookup.
+    # matches_only=True guarantees query_pos, ref_pos, ref_base are all set.
+    ref_pos_to_base = {rpos: rb.upper() for _, rpos, rb in pairs}
+    # On match, pysam returns ref_base uppercase (query matches ref).
+    # On mismatch it returns lowercase; that covers SNPs and converted
+    # bases alike (ref C read as T, ref G read as A). We only care about
+    # matches where ref is C/G — those are genuine retained, non-converted
+    # bases.
+    target = "G" if is_reverse_parent_strand else "C"
+    count = 0
+    for _, rpos, ref_base in pairs:
+        if ref_base != target:
+            # Not an exact match on the target base: either the reference
+            # base is something else, or the read mismatches it (lowercase
+            # ref_base), which rejects both SNPs and converted positions.
+            continue
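+        # Exclude CpG context: on forward strand, next ref base == G;
+        # on reverse strand, previous ref base == C. The neighbour is
+        # looked up among this read's matched positions only, so a base
+        # whose neighbour is not aligned (read edge, deletion) is treated
+        # as non-CpG.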
+        if is_reverse_parent_strand:
+            if ref_pos_to_base.get(rpos - 1) == "C":
+                continue
+        else:
+            if ref_pos_to_base.get(rpos + 1) == "G":
+                continue
+        count += 1
+    return count
+def is_em_overconversion_read(
+    read_cpg_states: list[int],
+    min_cpgs: int,
+) -> bool:
+    """Identify reads flagged as EM-seq fragment-level over-conversion.
+    Loyfer et al. (bioRxiv 2026.03.24.713040) report that EM-seq
+    produces a reproducible ~1–2.5% of multi-CpG fragments that appear
+    fully unmethylated across every covered CpG, driven by failed TET
+    protection and subsequent APOBEC hyper-conversion of an entire
+    molecule. At constitutively methylated loci these reads are purely
+    technical. Without a per-region methylation prior, the simplest
+    correction consistent with their observation is: drop reads whose
+    covered CpGs are all called unmethylated *and* cover at least
+    ``min_cpgs`` sites (the paper's Fig. 1C regime where the artifact
+    diverges clearly from WGBS).
+    This heuristic also drops genuinely fully-unmethylated biological
+    fragments, so callers should opt in only when the downstream
+    application can tolerate that trade-off.
+    Args:
+        read_cpg_states: Per-CpG methylation state values for a single
+            read, in column-order, using the bam2tensor encoding
+            (``1``=methylated, ``0``=unmethylated, ``-1``=no data).
+        min_cpgs: Minimum number of covered CpGs required to apply the
+            filter. Reads with fewer covered CpGs are never flagged.
+    Returns:
+        ``True`` when ``read_cpg_states`` has at least ``min_cpgs``
+        entries and every entry is ``0`` (unmethylated). A single ``1``
+        or ``-1`` (no-data) entry prevents the read from being flagged.
+    Example:
+        >>> is_em_overconversion_read([0, 0, 0], min_cpgs=3)
+        True
+        >>> is_em_overconversion_read([0, 0, 1], min_cpgs=3)
+        False
+        >>> is_em_overconversion_read([0, 0], min_cpgs=3)
+        False
+    """
+    if len(read_cpg_states) < min_cpgs:
+        return False
+    return all(state == 0 for state in read_cpg_states)
 def detect_aligner(input_bam: str, sample_size: int = 1000) -> str:
     """Detect the aligner used to produce a BAM file by checking read tags.
@@ -198,6 +338,10 @@ def extract_methylation_data_from_bam(
     input_bam: str,
     genome_methylation_embedding: GenomeMethylationEmbedding,
     quality_limit: int = 20,
+    filter_non_converted: bool = False,
+    non_converted_threshold: int = 3,
+    filter_em_overconversion: bool = False,
+    em_overconversion_min_cpgs: int = 3,
     verbose: bool = False,
     debug: bool = False,
 ) -> ExtractionResult:
@@ -216,6 +360,17 @@ def extract_methylation_data_from_bam(
         - For Biscuit/bwameth/gem3: only parent-strand reads are processed
         - For Bismark: all reads are processed (XM tag has pre-resolved calls)
+    Two additional, opt-in per-read filters are available:
+        - Non-converted reads (``filter_non_converted``): drops reads with
+          too many retained non-CpG cytosines, the hallmark of incomplete
+          bisulfite/EM-seq conversion. Ports the logic of
+          ``nebiolabs/mark-nonconverted-reads``.
+        - EM-seq fragment-level over-conversion
+          (``filter_em_overconversion``): drops reads whose covered CpGs
+          are all called unmethylated, a heuristic for the EM-seq
+          artifact described by Loyfer et al.
+          (bioRxiv 2026.03.24.713040).
     Two extraction paths are supported, detected automatically per-read:
     **Bismark path** (XM tag present):
@@ -238,6 +393,23 @@ def extract_methylation_data_from_bam(
         quality_limit: Minimum mapping quality (MAPQ) threshold for reads.
             Reads with MAPQ below this value are skipped. Default is 20,
             which excludes reads mapping to multiple locations equally well.
+        filter_non_converted: If True, drop reads that carry at least
+            ``non_converted_threshold`` retained non-CpG cytosines, a
+            signature of incomplete bisulfite/EM-seq conversion. Default
+            False.
+        non_converted_threshold: Minimum count of retained non-CpG
+            cytosines required for the non-converted filter to drop a
+            read. Matches the NEB ``mark-nonconverted-reads`` default of
+            3.
+        filter_em_overconversion: If True, drop reads whose covered CpGs
+            are all called unmethylated and cover at least
+            ``em_overconversion_min_cpgs`` sites — the Loyfer et al.
+            EM-seq fragment-level over-conversion heuristic. Default
+            False.
+        em_overconversion_min_cpgs: Minimum covered CpG count required
+            before the over-conversion filter will drop a read. Matches
+            the regime in Loyfer et al. Fig. 1C where the EM-seq
+            artifact is clearly separable from WGBS.
         verbose: If True, display a progress bar and print the total read
             count. Useful for monitoring progress on large files.
         debug: If True, enable extensive validation and debug output.
@@ -354,6 +526,12 @@ def extract_methylation_data_from_bam(
             if aligned_segment.flag & _SKIP_FLAGS:
                 continue
+            # Per-read buffers. We only flush these into the global
+            # coo_* arrays once the read passes all filters (including
+            # the post-CpG EM over-conversion filter).
+            read_cpg_cols: list[int] = []
+            read_cpg_data: list[int] = []
             # ============================================================
             # Bismark path: XM tag contains pre-resolved methylation calls.
             # No strand filtering needed — Bismark already resolved strand
@@ -363,6 +541,13 @@ def extract_methylation_data_from_bam(
             if aligned_segment.has_tag("XM"):
                 xm_tag: str = aligned_segment.get_tag("XM")  # type: ignore[assignment]
+                # Non-converted filter (Bismark): XM tag already encodes
+                # retained non-CpG cytosines as H/X/U. Apply before any
+                # CpG work so we bail as early as possible.
+                if filter_non_converted:
+                    if count_non_cpg_retained_xm(xm_tag) >= non_converted_threshold:
+                        continue
                 # Find CpG sites covered by this read
                 start_idx = bisect.bisect_left(
                     cpg_sites, aligned_segment.reference_start + 1
@@ -385,7 +570,6 @@ def extract_methylation_data_from_bam(
                 if debug:
                     print(f"Query (Bismark): {aligned_segment.query_name}")
-                has_cpg_data = False
                 for query_pos, ref_pos in this_segment_cpgs:
                     # Bounds check: XM tag should match query length, but be defensive
                     if query_pos >= len(xm_tag):
@@ -393,39 +577,49 @@ def extract_methylation_data_from_bam(
                     xm_char = xm_tag[query_pos]
                     if xm_char == "Z":
-                        coo_data.append(1)  # Methylated CpG
+                        read_cpg_data.append(1)  # Methylated CpG
                     elif xm_char == "z":
-                        coo_data.append(0)  # Unmethylated CpG
+                        read_cpg_data.append(0)  # Unmethylated CpG
                     else:
                         # Non-CpG context at a CpG site (shouldn't happen
                         # normally, but possible with edge-case alignments)
-                        coo_data.append(-1)
+                        read_cpg_data.append(-1)
-                    coo_row.append(read_number)
-                    coo_col.append(
+                    read_cpg_cols.append(
                         genome_methylation_embedding.genomic_position_to_embedding(
                             chrom,
                             ref_pos + 1,
                         )
                     )
-                    has_cpg_data = True
                     if debug:
                         print(f"\t{query_pos} {ref_pos} XM={xm_char}")
-                if has_cpg_data:
+                if not read_cpg_data:
+                    continue
+                if filter_em_overconversion and is_em_overconversion_read(
+                    read_cpg_data, em_overconversion_min_cpgs
+                ):
                     if debug:
-                        # Ensure each read is only seen once
-                        read_key = aligned_segment.query_name + (  # type: ignore
-                            "_1" if aligned_segment.is_read1 else "_2"
-                        )
-                        assert (
-                            read_key not in debug_read_name_to_row_number
-                        ), "Read seen twice!"
-                        debug_read_name_to_row_number[read_key] = read_number
-                        print("************************************************\n")
-                    tlen_list.append(aligned_segment.template_length)
-                    read_number += 1
+                        print("\tEM over-conversion filter: dropping read.")
+                    continue
+                if debug:
+                    read_key = aligned_segment.query_name + (  # type: ignore
+                        "_1" if aligned_segment.is_read1 else "_2"
+                    )
+                    assert (
+                        read_key not in debug_read_name_to_row_number
+                    ), "Read seen twice!"
+                    debug_read_name_to_row_number[read_key] = read_number
+                    print("************************************************\n")
+                coo_row.extend([read_number] * len(read_cpg_cols))
+                coo_col.extend(read_cpg_cols)
+                coo_data.extend(read_cpg_data)
+                tlen_list.append(aligned_segment.template_length)
+                read_number += 1
                 continue  # Skip the Biscuit/bwameth/gem3 path below
@@ -460,6 +654,22 @@ def extract_methylation_data_from_bam(
                     print("\tNot on methylated strand, ignoring.")
                 continue
+            # Non-converted filter (Biscuit/bwameth/gem3): count retained
+            # non-CpG Cs (forward parent) or Gs (reverse parent) validated
+            # against the reference via the MD tag. Applied after the
+            # strand check so we don't waste work on daughter-strand reads.
+            if filter_non_converted:
+                if (
+                    count_non_cpg_retained_reference(
+                        aligned_segment,
+                        bool(bisulfite_parent_strand_is_reverse),
+                    )
+                    >= non_converted_threshold
+                ):
+                    if debug:
+                        print("\tNon-converted filter: dropping read.")
+                    continue
             # Use bisect to find CpGs covered by this read
             # aligned_segment.reference_start is 0-based inclusive
             # aligned_segment.reference_end is 0-based exclusive
@@ -492,15 +702,6 @@ def extract_methylation_data_from_bam(
                     "XB"
                 )  # Bisulfite strand tag (YD for Biscuit/bwameth, XB for gem3)
-                # Ensure each read is only seen once
-                assert (
-                    aligned_segment.query_name not in debug_read_name_to_row_number
-                ), "Read seen twice!"
-                debug_read_name_to_row_number[
-                    aligned_segment.query_name  # type: ignore
-                    + ("_1" if aligned_segment.is_read1 else "_2")
-                ] = read_number
             # TODO: We ignore paired/unpaired read status for now. Should we treat paired reads / overlapping reads differently?
             # get_aligned_pairs returns a list of tuples of (read_pos, ref_pos)
@@ -524,12 +725,7 @@ def extract_methylation_data_from_bam(
                 # query_base_raw = aligned_segment.get_forward_sequence()[query_pos] # raw off sequencer
                 # query_base_no_offset = aligned_segment.query_alignment_sequence[query_pos] # this needs to be offset by the soft clip
-                # Store the read # in our sparse array
-                coo_row.append(read_number)
-                # Store the CpG site in our sparse array
-                # TODO: Object orient these inputs? -- lots of bad inheritence style here
-                coo_col.append(
+                read_cpg_cols.append(
                     genome_methylation_embedding.genomic_position_to_embedding(
                         chrom,
                         ref_pos + 1,
@@ -538,30 +734,47 @@ def extract_methylation_data_from_bam(
                 if query_base == "C":
                     # Methylated
-                    coo_data.append(1)
+                    read_cpg_data.append(1)
                     if debug:
                         print(f"\t{query_pos} {ref_pos} C->{query_base} [Methylated]")
                 elif query_base == "T":
-                    coo_data.append(0)
+                    read_cpg_data.append(0)
                     # Unmethylated
                     if debug:
                         print(f"\t{query_pos} {ref_pos} C->{query_base} [Unmethylated]")
                 else:
-                    coo_data.append(-1)  # or just 0?
+                    read_cpg_data.append(-1)
                     if debug:
                         print(
                             f"\t{query_pos} {ref_pos} C->{query_base} [Unknown! SNV? Indel?]"
                         )
+            if filter_em_overconversion and is_em_overconversion_read(
+                read_cpg_data, em_overconversion_min_cpgs
+            ):
+                if debug:
+                    print("\tEM over-conversion filter: dropping read.")
+                continue
+            if debug:
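+                # Ensure each read is only seen once.
+                # NOTE(review): the membership test below uses the bare
+                # query_name, while keys are stored with a "_1"/"_2"
+                # suffix, so it will not match the key stored for an
+                # earlier occurrence of the same read — confirm and align
+                # the two.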
+                assert (
+                    aligned_segment.query_name not in debug_read_name_to_row_number
+                ), "Read seen twice!"
+                debug_read_name_to_row_number[
+                    aligned_segment.query_name  # type: ignore
+                    + ("_1" if aligned_segment.is_read1 else "_2")
+                ] = read_number
+            coo_row.extend([read_number] * len(read_cpg_cols))
+            coo_col.extend(read_cpg_cols)
+            coo_data.extend(read_cpg_data)
             tlen_list.append(aligned_segment.template_length)
             read_number += 1
             if debug:
                 print("************************************************\n")
-                # query_bp = aligned_segment.query_sequence[pileupread.query_position]
-                # reference_bp = aligned_segment.get_reference_sequence()[aligned_segment.reference_start - pileupcolumn.reference_pos].upper()
     ## IIRC there's still a critical edge here, where sometimes we raise ValueError('row index exceeds matrix dimensions')
     if debug:

{bam2tensor-2.4 → bam2tensor-2.5}/src/bam2tensor/inspect.py RENAMED Viewed

@@ -116,6 +116,25 @@ def inspect_npz(npz_path: str) -> None:
         else:
             print("  Fragment len:    all zero (single-end data)")
+    if meta and "filters" in meta:
+        filters = meta["filters"]
+        active = []
+        nc = filters.get("non_converted_reads", {})
+        if nc.get("enabled"):
+            active.append(f"non-converted (>= {nc.get('threshold')} non-CpG Cs)")
+        em = filters.get("em_overconversion", {})
+        if em.get("enabled"):
+            active.append(
+                f"EM over-conversion (all-unmethylated, >= "
+                f"{em.get('min_cpgs')} CpGs)"
+            )
+        if active:
+            print(f"  Filters:         {active[0]}")
+            for extra in active[1:]:
+                print(f"                   {extra}")
+        else:
+            print("  Filters:         none")
     if meta and "cpg_index_crc32" in meta:
         print(f"  CpG index CRC32: {meta['cpg_index_crc32']}")
     if meta and "bam2tensor_version" in meta:

bam2tensor 2.4__tar.gz → 2.5__tar.gz

bam2tensor 2.4__tar.gz → 2.5__tar.gz