pymethyl2sam 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,166 @@
1
+ """Genome loader for FASTA files."""
2
+
3
+ import os
4
+ from typing import Any, Dict, Optional, List
5
+
6
+ from ..utils.logging import setup_logging
7
+
8
+ logger = setup_logging(__name__)
9
+
10
+
11
class GenomeLoader:
    """Handles loading genome sequences from FASTA files."""

    def __init__(self):
        """Initialize genome loader."""

    def load_genome(
        self, genome_file: str, chromosome: Optional[str] = None
    ) -> Dict[str, Any]:
        """Load genome sequences from FASTA file.

        Record names are the first whitespace-delimited token of each header
        line. Sequence lines are upper-cased and concatenated. A record with
        no sequence lines is not stored, and a repeated record name replaces
        the earlier entry.

        Args:
            genome_file: Path to FASTA file
            chromosome: Specific chromosome to load (optional)

        Returns:
            Dictionary with keys ``file``, ``sequences`` (name -> sequence)
            and ``total_length``.

        Raises:
            FileNotFoundError: If ``genome_file`` does not exist.
            ValueError: If the requested chromosome is absent, or the file
                contains no sequences at all.
        """
        if not os.path.exists(genome_file):
            raise FileNotFoundError(f"Genome file not found: {genome_file}")

        logger.info(f"Loading genome from: {genome_file}")

        sequences = {}
        current_chr = None
        current_seq = []

        with open(genome_file, "r", encoding="utf-8") as file_handle:
            for line in file_handle:
                line = line.strip()

                if line.startswith(">"):
                    # Save previous sequence
                    if current_chr and current_seq:
                        sequences[current_chr] = "".join(current_seq)

                    # Parse header
                    header = line[1:]  # Remove '>'
                    current_chr = self._parse_chromosome_name(header)
                    current_seq = []

                    # Skip if specific chromosome requested and this isn't it
                    if chromosome and current_chr != chromosome:
                        current_chr = None
                        current_seq = []

                elif current_chr and line:
                    current_seq.append(line.upper())

        # Save last sequence
        if current_chr and current_seq:
            sequences[current_chr] = "".join(current_seq)

        if chromosome and chromosome not in sequences:
            raise ValueError(f"Chromosome {chromosome} not found in genome file")

        if not sequences:
            raise ValueError(f"No sequences found in genome file: {genome_file}")

        genome_data = {
            "file": genome_file,
            "sequences": sequences,
            "total_length": sum(len(seq) for seq in sequences.values()),
        }

        logger.info(
            f"Loaded {len(sequences)} sequences with total length {genome_data['total_length']}"
        )
        return genome_data

    def _parse_chromosome_name(self, header: str) -> str:
        """Parse chromosome name from FASTA header.

        Args:
            header: FASTA header line with the leading '>' already removed

        Returns:
            First whitespace-delimited token, or "unknown" for a blank header
        """
        # Simple parsing - take first word after '>'
        # This can be enhanced for different header formats
        parts = header.split()
        if parts:
            return parts[0]
        return "unknown"

    def get_chromosome_list(self, genome_file: str) -> List[str]:
        """Get list of chromosomes in genome file.

        Args:
            genome_file: Path to FASTA file

        Returns:
            List of chromosome names in file order (duplicates are kept)
        """
        chromosomes = []

        with open(genome_file, "r", encoding="utf-8") as file_handle:
            for line in file_handle:
                if line.startswith(">"):
                    header = line[1:].strip()
                    chromosomes.append(self._parse_chromosome_name(header))

        return chromosomes

    def get_sequence_length(self, genome_file: str, chromosome: str) -> int:
        """Get length of a specific chromosome.

        Args:
            genome_file: Path to FASTA file
            chromosome: Chromosome name

        Returns:
            Sequence length

        Raises:
            FileNotFoundError: If ``genome_file`` does not exist.
            ValueError: If ``chromosome`` is not present in the file.
        """
        # load_genome raises when the chromosome is missing, so the lookup
        # below cannot fail and no "not found" fallback value is needed.
        genome_data = self.load_genome(genome_file, chromosome)
        return len(genome_data["sequences"][chromosome])

    def validate_genome_file(self, genome_file: str) -> Dict[str, Any]:
        """Validate genome file and return statistics.

        Args:
            genome_file: Path to FASTA file

        Returns:
            Dictionary with keys ``file_exists``, ``total_sequences``,
            ``total_length``, ``chromosomes`` and ``errors``. Problems found
            while reading the file are reported through ``errors`` rather
            than raised.
        """
        validation_results = {
            "file_exists": False,
            "total_sequences": 0,
            "total_length": 0,
            "chromosomes": [],
            "errors": [],
        }

        if not os.path.exists(genome_file):
            validation_results["errors"].append(f"File not found: {genome_file}")
            return validation_results

        validation_results["file_exists"] = True

        try:
            genome_data = self.load_genome(genome_file)
        except (OSError, ValueError, RuntimeError) as ex:
            # load_genome signals bad content with ValueError (this also
            # covers UnicodeDecodeError) and unreadable paths with OSError;
            # catching only RuntimeError let every real failure escape.
            validation_results["errors"].append(
                f"Error validating genome file: {str(ex)}"
            )
            return validation_results

        validation_results["total_sequences"] = len(genome_data["sequences"])
        validation_results["total_length"] = genome_data["total_length"]
        validation_results["chromosomes"] = list(genome_data["sequences"].keys())
        return validation_results
@@ -0,0 +1,15 @@
1
+ """Orchestration of read and methylation simulation."""
2
+
3
+ from ..core.reference_genome import ConstantReferenceGenome
4
+ from .simulator import (
5
+ MethylationSimulator,
6
+ SequencedChromosome,
7
+ SequencedRegion,
8
+ )
9
+
10
+ __all__ = [
11
+ "MethylationSimulator",
12
+ "SequencedChromosome",
13
+ "SequencedRegion",
14
+ "ConstantReferenceGenome",
15
+ ]
@@ -0,0 +1,208 @@
1
+ """Simulator class for generating aligned reads from simulated CpG methylation data."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from dataclasses import dataclass, field
7
+ from importlib.metadata import version, PackageNotFoundError
8
+ from random import seed as random_seed, random
9
+ from typing import Generator, List, Optional, Set
10
+
11
+ import pysam
12
+ from pysam.libcalignedsegment import AlignedSegment
13
+ from pysam.libcalignmentfile import AlignmentFile, AlignmentHeader
14
+
15
+ from pymethyl2sam.core.genomics import GenomicInterval, Chromosome, MethylationSite
16
+ from pymethyl2sam.core.reference_genome import ReferenceGenomeProvider
17
+ from pymethyl2sam.core.sequencing import ReadTemplate, ReadGenerator, ReadQuality
18
+ from pymethyl2sam.utils.pysam import make_sam_dict, generate_mm_tag
19
+
20
+ logger = logging.getLogger(__name__)
21
+ logging.basicConfig(level=logging.INFO)
22
+
23
+
24
@dataclass(frozen=True, init=False)
class SequencedRegion:
    """A genomic interval paired with the generator that produces its reads."""

    interval: GenomicInterval
    read_generator: ReadGenerator

    def __init__(self, start: int, end: int, read_generator: ReadGenerator):
        """Build the region from plain coordinates.

        Args:
            start: Interval start passed to ``GenomicInterval``.
            end: Interval end passed to ``GenomicInterval``.
            read_generator: Read generation strategy for this region.
        """
        # The third GenomicInterval argument is always False here
        # (presumably the reverse-strand flag - TODO confirm).
        region_interval = GenomicInterval(start, end, False)

        # The dataclass is frozen, so normal attribute assignment is blocked;
        # go through object.__setattr__ instead.
        for attribute, value in (
            ("interval", region_interval),
            ("read_generator", read_generator),
        ):
            object.__setattr__(self, attribute, value)
34
+
35
+
36
@dataclass(frozen=True)
class SequencedChromosome(Chromosome):
    """Chromosome with the regions to sequence and its methylation sites."""

    regions: List[SequencedRegion]
    cpg_sites: List[MethylationSite] = field(default_factory=list)

    def __post_init__(self):
        """Reject methylation sites whose contexts overlap.

        Raises:
            ValueError: If a site starts before the previous site (ordered by
                position) has ended.
        """
        previous = None
        for current in sorted(self.cpg_sites, key=lambda site: site.position):
            if previous is not None:
                previous_end = previous.position + len(previous.context)
                if current.position < previous_end:
                    raise ValueError(
                        f"Overlapping methylation sites detected:\n"
                        f" Site 1: pos={previous.position}, context='{previous.context}'\n"
                        f" Site 2: pos={current.position}, context='{current.context}'"
                    )
            previous = current

    def generate_reads(self) -> Generator[ReadTemplate, None, None]:
        """Yield read templates region by region, in the order of ``regions``."""
        for region in self.regions:
            logger.debug(f"Generating reads for region: {region}")
            region_reads = region.read_generator.generate_reads(
                region.interval, self.cpg_sites
            )
            yield from region_reads
61
+
62
+
63
@dataclass(frozen=True)
class SimulatedRead(GenomicInterval):
    """Represents a simulated read with sequence, methylation sites, and quality."""

    chrom: str
    sequence: str
    methylated_sites: Set[MethylationSite]
    quality: ReadQuality

    @staticmethod
    def from_template(
        template: ReadTemplate, chrom: str, reference_genome: ReferenceGenomeProvider
    ) -> SimulatedRead:
        """Construct a simulated read from a template and reference sequence.

        The reference sequence for the template is upper-cased, every local
        site's context is written into it, and each site is then sampled as
        methylated according to its ``methylation_prob``.

        Args:
            template: Read template supplying coordinates, strand, quality
                and the read-local methylation sites.
            chrom: Chromosome name used for the reference lookup.
            reference_genome: Provider of the reference sequence.

        Returns:
            The simulated read with its sampled methylated sites.
        """
        sequence = bytearray(
            reference_genome.get_sequence(chrom, template).upper(), "ascii"
        )
        sampled_sites: Set[MethylationSite] = set()

        for site in template.local_methylated_sites:
            SimulatedRead._apply_site(sequence, site)
            # Sites with probability >= 1.0 short-circuit and draw no random
            # number, which keeps seeded runs reproducible.
            if site.methylation_prob >= 1.0 or random() < site.methylation_prob:
                sampled_sites.add(site)

        return SimulatedRead(
            chrom=chrom,
            start=template.start,
            end=template.end,
            is_reverse=template.is_reverse,
            sequence=sequence.decode("ascii"),
            quality=template.quality,
            methylated_sites=sampled_sites,
        )

    @staticmethod
    def _apply_site(sequence: bytearray, site: MethylationSite) -> None:
        """Apply a methylation site's context to the sequence in-place.

        Only the part of the context that falls inside the read is written,
        and the length of ``sequence`` never changes.
        NOTE(review): assumes ``site.position`` is relative to the read start
        and may be negative when the site begins before the read - confirm
        against the read generator.
        """
        context = site.context.encode("ascii")
        start = max(0, site.position)
        end = min(site.position + len(context), len(sequence))
        if start >= end:
            # Site lies entirely outside the read. A slice assignment with
            # end < start would insert bytes and lengthen the read.
            return
        # Skip the leading context bases that precede the read start so the
        # bases written line up with their true positions.
        offset = start - site.position
        sequence[start:end] = context[offset : offset + (end - start)]

    def to_aligned_segment(self, header: AlignmentHeader) -> AlignedSegment:
        """Convert this simulated read into a pysam AlignedSegment.

        The segment gets an ``MD`` tag holding ``str(self.length)`` and an
        ``NM`` tag of 0. An ``MM`` tag is added only when the read has both
        sampled methylated sites and a non-empty sequence.

        Args:
            header: Alignment header the segment is bound to.

        Returns:
            The populated aligned segment.
        """
        segment = AlignedSegment.from_dict(
            make_sam_dict(
                chrome=self.chrom,
                start=self.start,
                sequence=self.sequence,
                quality=self.quality,
                is_reverse=self.is_reverse,
            ),
            header=header,
        )
        segment.set_tag("MD", str(self.length), value_type="Z")
        segment.set_tag("NM", 0, value_type="i")

        if self.methylated_sites and self.sequence:
            mm_tag = generate_mm_tag(
                self.sequence,
                self.methylated_sites,
                self.is_reverse,
            )
            segment.set_tag("MM", mm_tag, value_type="Z")

        return segment
128
+
129
+
130
@dataclass
class MethylationSimulator:
    """Main simulator class for generating methylation-aware reads."""

    chromosomes: List[SequencedChromosome]
    reference_genome: ReferenceGenomeProvider

    def simulate_reads(
        self,
        output_file: str,
        seed: Optional[int] = None,
        is_sorted: bool = True,
    ) -> None:
        """Simulate reads and write to a BAM file.

        Args:
            output_file: Path to output BAM file
            seed: Random seed for reproducibility (seeds the global
                ``random`` module generator)
            is_sorted: Whether to sort the BAM file after writing
        """
        if seed is not None:
            random_seed(seed)

        # Reads are written in generation order. Only declare the file as
        # coordinate-sorted when the sort step below will actually run.
        header = self.create_header("coordinate" if is_sorted else "unsorted")
        with AlignmentFile(output_file, "wb", header=header) as out_bam:
            logger.info("Beginning read simulation...")
            for chrom in self.chromosomes:
                logger.debug(f"Simulating reads for chromosome: {chrom.name}")
                for template in chrom.generate_reads():
                    read = SimulatedRead.from_template(
                        template=template,
                        chrom=chrom.name,
                        reference_genome=self.reference_genome,
                    )
                    out_bam.write(read.to_aligned_segment(header))
            logger.info("Read simulation complete.")

        if is_sorted:
            logger.info(f"Sorting output file: {output_file}")
            # Sorts in place: the same path is both input and output.
            pysam.sort(
                "-o", output_file, output_file, "--write-index", catch_stdout=False
            )

    def create_header(self, sort_order: str = "coordinate") -> AlignmentHeader:
        """Create a SAM/BAM header based on the simulated chromosomes.

        Args:
            sort_order: Value written to the ``@HD SO`` field. Defaults to
                ``"coordinate"``, the value previously hard-coded.

        Returns:
            Header with one ``SQ`` entry per chromosome and a ``PG`` entry
            carrying the installed package version, or "unknown" when the
            package metadata cannot be found.
        """
        try:
            tool_version = version("pymethyl2sam")
        except PackageNotFoundError:
            tool_version = "unknown"

        header_dict = {
            "HD": {"VN": "1.6", "SO": sort_order},
            "SQ": [
                {"SN": chrom.name, "LN": chrom.length} for chrom in self.chromosomes
            ],
            "RG": [],
            "PG": [
                {
                    "ID": "pymethyl2sam",
                    "VN": tool_version,
                    "CL": "pymethyl2sam",
                }
            ],
        }
        return AlignmentHeader.from_dict(header_dict)

    @property
    def total_reads(self) -> int:
        """Total number of reads across all chromosomes and regions."""
        return sum(
            region.read_generator.strategy.total_reads
            for chrom in self.chromosomes
            for region in chrom.regions
        )

    @property
    def total_methylation_sites(self) -> int:
        """Total number of methylation sites across all chromosomes."""
        return sum(len(chrom.cpg_sites) for chrom in self.chromosomes)
@@ -0,0 +1,67 @@
1
+ from collections import defaultdict
2
+ from typing import Dict, Any, Set, Tuple
3
+
4
+ from pysam import SamtoolsError
5
+ from pysam.libcalignmentfile import AlignmentFile
6
+
7
+ from pymethyl2sam.utils import setup_logging
8
+ from pymethyl2sam.utils.pysam import get_read_to_reference_mapping, ReadPosition
9
+
10
+ logger = setup_logging(__name__)
11
+
12
+
13
def get_simulation_summary(bam_path: str) -> Dict[str, Any]:
    """Summarise the reads and methylation calls stored in a BAM file.

    Args:
        bam_path: Path to the BAM file to inspect.

    Returns:
        Dictionary with:
          - ``total_reads``: every record seen, mapped or not
          - ``reads_with_methylation``: mapped reads carrying an ``MM`` tag
            with at least one modified base
          - ``chromosome_count``: number of distinct reference names seen
          - ``regions_per_chromosome``: mapped-read count per reference name
          - ``total_methylation_sites``: distinct (chromosome, reference
            position) pairs carrying a modification

        File, parsing and samtools errors are logged rather than raised.
        In that case the per-read counters hold whatever was gathered before
        the failure, while ``chromosome_count`` and
        ``total_methylation_sites`` keep their initial value of 0.
    """
    stats: Dict[str, Any] = {
        "total_reads": 0,
        "reads_with_methylation": 0,
        "chromosome_count": 0,
        "regions_per_chromosome": defaultdict(int),
        "total_methylation_sites": 0,
    }

    methylation_sites: Set[Tuple[str, int]] = set()

    logger.info(f"Opening BAM file: {bam_path}")

    try:
        with AlignmentFile(bam_path, "rb") as bamfile:
            chromosomes: Set[str] = set()
            logger.info("Processing reads...")

            for read in bamfile.fetch(until_eof=True):
                stats["total_reads"] += 1

                if read.is_unmapped:
                    continue

                chrom = bamfile.get_reference_name(read.reference_id)
                chromosomes.add(chrom)
                stats["regions_per_chromosome"][chrom] += 1

                if not read.has_tag("MM"):
                    continue
                modified_bases = read.modified_bases
                if not modified_bases:
                    continue

                read2ref = get_read_to_reference_mapping(read)
                stats["reads_with_methylation"] += 1

                # NOTE(review): reverse-strand positions are shifted by one
                # before the lookup - presumably to land on the cytosine of
                # the CpG on the forward reference; confirm.
                reverse_strand_offset = 1 if read.is_reverse else 0

                for mods in modified_bases.values():
                    for pos, _ in mods:
                        ref_pos = read2ref.get(pos - reverse_strand_offset)
                        # A read base with no reference coordinate maps to
                        # None; counting it would add a bogus (chrom, None)
                        # site.
                        if ref_pos is not None:
                            methylation_sites.add((chrom, ref_pos))

        stats["total_methylation_sites"] = len(methylation_sites)
        stats["chromosome_count"] = len(chromosomes)
        logger.info("Finished processing BAM file.")

    except (FileNotFoundError, PermissionError) as io_err:
        logger.error(f"File error: {bam_path}", exc_info=io_err)
    except (ValueError, TypeError, KeyError, AttributeError) as data_err:
        logger.error("Data format or logic error while parsing BAM.", exc_info=data_err)
    except (SamtoolsError, EOFError) as bam_err:
        logger.error("BAM file appears corrupted or unreadable.", exc_info=bam_err)

    return stats
@@ -0,0 +1,6 @@
1
+ """Logging, constants, shared helpers."""
2
+
3
+ from .constants import *
4
+ from .logging import setup_logging
5
+
6
+ __all__ = ["setup_logging"]
@@ -0,0 +1,55 @@
1
+ """Constants used throughout the package."""
2
+
3
+ from typing import Dict
4
+
5
# Base complementation table for str.translate; characters outside these
# eight pass through unchanged.
_ORIG_BASES = "ACGTacgt"
_COMP_BASES = "TGCAtgca"
COMPLEMENT_TABLE: Dict[int, int] = str.maketrans(_ORIG_BASES, _COMP_BASES)

# Fixed fields used when composing simulated read names
INSTRUMENT = "A00000"
RUN_NUMBER = 1
FLOWCELL_ID = "ABCD1234"

# Strand orientations
STRANDS = ["+", "-"]

# SAM/BAM tag names
MM_TAG = "MM"  # Methylation modification
ML_TAG = "ML"  # Methylation level

# Default values
DEFAULT_COVERAGE = 10.0
DEFAULT_READ_LENGTH = 100
DEFAULT_ERROR_RATE = 0.005
DEFAULT_METHYLATION_RATIO = 0.7
DEFAULT_BASE_QUALITY = 30

# File extensions
SUPPORTED_GENOME_FORMATS = [".fasta", ".fa", ".fna"]
SUPPORTED_TEMPLATE_FORMATS = [".yaml", ".yml", ".json"]
SUPPORTED_CONFIG_FORMATS = [".yaml", ".yml", ".json"]

# Coordinate system
COORDINATE_SYSTEM = "zero-based"
INTERVAL_FORMAT = "half-open"  # [start, end)

# Error types
ERROR_TYPES = ["mismatch", "insertion", "deletion"]

# Quality score ranges
MIN_QUALITY_SCORE = 0
MAX_QUALITY_SCORE = 93

# SAM/BAM flags, one bit each
SAM_FLAG_PAIRED = 1 << 0
SAM_FLAG_PROPER_PAIR = 1 << 1
SAM_FLAG_UNMAP = 1 << 2
SAM_FLAG_MUNMAP = 1 << 3
SAM_FLAG_REVERSE = 1 << 4
SAM_FLAG_MREVERSE = 1 << 5
SAM_FLAG_READ1 = 1 << 6
SAM_FLAG_READ2 = 1 << 7
SAM_FLAG_SECONDARY = 1 << 8
SAM_FLAG_QCFAIL = 1 << 9
SAM_FLAG_DUP = 1 << 10
SAM_FLAG_SUPPLEMENTARY = 1 << 11
@@ -0,0 +1,60 @@
1
+ """Logging setup and configuration."""
2
+
3
+ import logging
4
+ import sys
5
+ from typing import Optional
6
+
7
+
8
def setup_logging(name: str, level: str = "INFO") -> logging.Logger:
    """Return a logger that writes formatted records to stdout.

    A logger that already has handlers is returned untouched, so repeated
    calls never stack handlers; ``level`` is ignored in that case.

    Args:
        name: Logger name (usually __name__)
        level: Name of a ``logging`` level constant, case-insensitive

    Returns:
        Configured logger
    """
    logger = logging.getLogger(name)

    if not logger.handlers:
        numeric_level = getattr(logging, level.upper())
        logger.setLevel(numeric_level)

        # Console output goes to stdout instead of logging's stderr default.
        console = logging.StreamHandler(sys.stdout)
        console.setLevel(numeric_level)
        console.setFormatter(
            logging.Formatter(
                "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
                datefmt="%Y-%m-%d %H:%M:%S",
            )
        )
        logger.addHandler(console)

    return logger
42
+
43
+
44
def configure_logging(level: str = "INFO", log_file: Optional[str] = None):
    """Configure the root logger through ``logging.basicConfig``.

    Records go to stdout and, when ``log_file`` is given, to that file as
    well; otherwise a ``NullHandler`` fills the second handler slot.

    Args:
        level: Name of a ``logging`` level constant, case-insensitive
        log_file: Optional log file path
    """
    root_level = getattr(logging, level.upper())

    console_handler = logging.StreamHandler(sys.stdout)
    if log_file:
        secondary_handler = logging.FileHandler(log_file)
    else:
        secondary_handler = logging.NullHandler()

    logging.basicConfig(
        level=root_level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        handlers=[console_handler, secondary_handler],
    )
@@ -0,0 +1,96 @@
1
+ from random import randint
2
+ from typing import List, Dict
3
+ from typing import Tuple, Set
4
+
5
+ from pysam.libcalignedsegment import AlignedSegment
6
+ from pysam.libcutils import qualities_to_qualitystring
7
+
8
+ from . import COMPLEMENT_TABLE, RUN_NUMBER, INSTRUMENT, FLOWCELL_ID
9
+ from ..core.genomics import MethylationSite
10
+ from ..core.sequencing import ReadQuality
11
+
12
+ ReadPosition = ReferencePosition = int
13
+ AlignedPairs = List[Tuple[ReadPosition, ReferencePosition]]
14
+
15
+
16
def get_read_to_reference_mapping(read: AlignedSegment) -> dict:
    """Map each read position to its aligned reference position.

    Args:
        read: Aligned segment to inspect.

    Returns:
        Dictionary keyed by read position. The value is the reference
        position, or ``None`` when the read base has no reference coordinate.
        Pairs without a read position are left out, because they describe no
        read base.
    """
    pairs: AlignedPairs = read.get_aligned_pairs(matches_only=False)
    # Reference position 0 is a real coordinate, so it must be kept as-is:
    # a truthiness test on it would turn it into None.
    return {read_pos: ref for read_pos, ref in pairs if read_pos is not None}
19
+
20
+
21
def make_sam_dict(
    chrome: str,
    sequence: str,
    start: int,
    is_reverse: bool,
    quality: ReadQuality,
) -> Dict[str, str]:
    """Assemble the string-valued field dictionary for one aligned read.

    Args:
        chrome: Reference name the read is placed on.
        sequence: Read bases; the CIGAR is a single match run of this length.
        start: Start coordinate; written out as ``start + 1``.
        is_reverse: Selects SAM flag 16 instead of 0.
        quality: Supplies ``sequencing_score`` (repeated for every base) and
            ``mapping_score``.

    Returns:
        Dictionary of SAM fields, every value a string, with a freshly
        generated read name.
    """
    # The name is generated first so random draws happen in the same order.
    name_field = _generate_read_name()
    base_count = len(sequence)

    if is_reverse:
        flag_field = str(16)
    else:
        flag_field = str(0)

    position_field = str(start + 1)
    per_base_scores = [quality.sequencing_score] * base_count
    quality_field = qualities_to_qualitystring(per_base_scores)

    sam_fields = {
        "name": name_field,
        "flag": flag_field,
        "ref_name": chrome,
        "ref_pos": position_field,
        "next_ref_name": "*",
        "next_ref_pos": "0",
        "map_quality": str(quality.mapping_score),
        "length": "0",
        "cigar": f"{base_count}M",
        "seq": sequence,
        "qual": quality_field,
    }
    return sam_fields
46
+
47
+
48
def _generate_read_name() -> str:
    """Build a colon-separated read name with random lane, tile and x/y fields.

    The instrument, run number and flowcell come from module constants. The
    run number is zero-padded to three digits; tile, x and y to four.
    """
    # Draw order (lane, tile, x, y) is kept so seeded runs stay reproducible.
    lane = randint(1, 4)
    tile = randint(1101, 1200)
    x_coord = randint(0, 3000)
    y_coord = randint(0, 3000)

    template = "{}:{:03d}:{}:{}:{:04d}:{:04d}:{:04d}"
    return template.format(
        INSTRUMENT, RUN_NUMBER, FLOWCELL_ID, lane, tile, x_coord, y_coord
    )
63
+
64
+
65
def generate_mm_tag(
    fw_sequence: str, methylated_sites: Set[MethylationSite], is_reverse_strand: bool
) -> str:
    """Generate a SAM MM tag value for methylated cytosines.

    Args:
        fw_sequence: Read sequence in forward orientation.
        methylated_sites: Sites whose cytosine position is obtained through
            ``get_cytosine_position(is_reverse_strand)``.
        is_reverse_strand: When true, cytosines are counted on the reverse
            complement of ``fw_sequence``, site positions are mirrored onto
            it, and the strand marker is ``-`` instead of ``+``.

    Returns:
        ``C<strand>m`` followed by one ``,<skip>`` entry per methylated
        cytosine and a closing ``;``. Each skip is the number of unmethylated
        cytosines since the previous methylated one. When no site lands on a
        cytosine the result has no entries at all (for example ``C+m;``).
        Sites that do not land on a cytosine are ignored.
    """
    target_base = "C"
    strand_tag = "-" if is_reverse_strand else "+"

    if is_reverse_strand:
        sequence = fw_sequence.translate(COMPLEMENT_TABLE)[::-1]
        last_index = len(sequence) - 1
        # Mirror forward coordinates onto the reverse-complemented sequence.
        methylated_positions = {
            last_index - site.get_cytosine_position(is_reverse_strand)
            for site in methylated_sites
        }
    else:
        sequence = fw_sequence
        methylated_positions = {
            site.get_cytosine_position(is_reverse_strand) for site in methylated_sites
        }

    offsets = []
    skipped = 0  # unmethylated target bases since the last methylated one
    for index, base in enumerate(sequence):
        if base != target_base:
            continue
        if index in methylated_positions:
            offsets.append(skipped)
            skipped = 0
        else:
            skipped += 1

    # Each entry carries its own leading comma so an empty list yields
    # "C+m;" rather than the malformed "C+m,;".
    offset_fields = "".join(f",{offset}" for offset in offsets)
    return f"{target_base}{strand_tag}m{offset_fields};"