pymethyl2sam 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ """
2
+ PyMethyl2Sam - A Python library for methylation data processing and BAM file generation.
3
+
4
+ Copyright (c) 2025
5
+ """
6
+
7
+ from importlib.metadata import version
8
+
9
+ __version__ = version("pymethyl2sam")
10
+
11
+ # Import core components
12
+ from .core.genomics import *
13
+ from .core.sequencing import *
14
+ from .core.errors import *
15
+ from .simulator import MethylationSimulator
16
+
17
+ __all__ = ["MethylationSimulator"]
@@ -0,0 +1,6 @@
1
+ """Core domain logic for methylation modeling, sequencing, and errors."""
2
+
3
+ from .errors import ErrorModel
4
+ from .genomics import MethylationSite
5
+
6
+ __all__ = ["MethylationSite", "ErrorModel"]
@@ -0,0 +1,201 @@
1
+ """Error model for simulating sequencing errors and mutations."""
2
+
3
+ import random
4
+ from dataclasses import dataclass
5
+ from typing import Dict, List, Tuple
6
+
7
+
8
@dataclass
class ErrorParameters:
    """Rates controlling how often each kind of sequencing error is simulated.

    Every rate is validated on construction and must lie in the closed
    interval [0, 1]; the first offending rate raises ``ValueError``.
    """

    mismatch_rate: float  # weight of base substitutions
    insertion_rate: float  # weight of inserted bases
    deletion_rate: float  # weight of removed bases

    def __post_init__(self):
        """Check each rate in declaration order.

        Raises:
            ValueError: If a rate is outside [0, 1].
        """
        labelled_rates = (
            ("Mismatch", self.mismatch_rate),
            ("Insertion", self.insertion_rate),
            ("Deletion", self.deletion_rate),
        )
        for label, rate in labelled_rates:
            # Chained comparison kept so NaN is rejected as well.
            if not 0 <= rate <= 1:
                raise ValueError(f"{label} rate must be between 0 and 1")
24
+
25
+
26
class ErrorModel:
    """Simulate sequencing errors, methylation-call errors and read artifacts.

    All randomness is drawn from the module-level ``random`` generator, so
    seeding it makes every method reproducible.
    """

    def __init__(self, parameters: "ErrorParameters"):
        """Initialize error model.

        Args:
            parameters: Error simulation parameters. Only the attributes
                ``mismatch_rate``, ``insertion_rate`` and ``deletion_rate``
                are read.
        """
        self.parameters = parameters
        # Substitution candidates keyed by upper-case base. A base never maps
        # to itself, so a "mismatch" always changes the nucleotide.
        self._mismatch_bases = {
            "A": ["C", "G", "T"],
            "C": ["A", "G", "T"],
            "G": ["A", "C", "T"],
            "T": ["A", "C", "G"],
            "N": ["A", "C", "G", "T"],  # Handle 'N' bases
        }

    def introduce_errors(self, sequence: str) -> Tuple[str, List[Dict]]:
        """Introduce sequencing errors into a sequence.

        At every position one random draw decides whether an error happens
        (probability = sum of the three configured rates); a second draw
        picks the error type in proportion to the rates.

        Args:
            sequence: Original DNA sequence

        Returns:
            Tuple of (modified_sequence, error_log). Each log entry is a dict
            with ``position`` and ``type`` plus type-specific keys
            (``original``/``new``, ``inserted`` or ``deleted``). Positions
            are indices into the sequence as modified so far, not into the
            input sequence.
        """
        modified_sequence = list(sequence)
        error_log = []
        i = 0
        total_error_rate = (
            self.parameters.mismatch_rate
            + self.parameters.insertion_rate
            + self.parameters.deletion_rate
        )

        while i < len(modified_sequence):
            # First, determine if any error should occur at this position
            if random.random() < total_error_rate:
                error_type = self._choose_error_type()
                base = modified_sequence[i]

                if error_type == "mismatch":
                    # Look the base up case-insensitively: soft-masked
                    # references use lower-case bases, and a lower-case "a"
                    # must not be "mismatched" to "A" (the same nucleotide).
                    new_base = random.choice(
                        self._mismatch_bases.get(
                            base.upper(), ["A", "C", "G", "T"]
                        )
                    )
                    modified_sequence[i] = new_base
                    error_log.append(
                        {
                            "position": i,
                            "type": "mismatch",
                            "original": base,
                            "new": new_base,
                        }
                    )
                elif error_type == "insertion":
                    inserted_base = random.choice(["A", "C", "G", "T"])
                    modified_sequence.insert(i, inserted_base)
                    error_log.append(
                        {"position": i, "type": "insertion", "inserted": inserted_base}
                    )
                    i += 1  # Skip the newly inserted base to avoid cascading errors
                elif error_type == "deletion":
                    deleted_base = modified_sequence.pop(i)
                    error_log.append(
                        {"position": i, "type": "deletion", "deleted": deleted_base}
                    )
                    continue  # Loop again at the same index `i` on the now-shorter list
            i += 1
        return "".join(modified_sequence), error_log

    def _choose_error_type(self) -> str:
        """Choose an error type based on the relative rates of configured errors.

        Returns:
            Error type ("mismatch", "insertion", or "deletion"), or "none"
            when all three rates are zero.
        """
        total_rate = (
            self.parameters.mismatch_rate
            + self.parameters.insertion_rate
            + self.parameters.deletion_rate
        )
        if total_rate == 0:
            return "none"  # No errors to choose from

        # Normalize rates to create a probability distribution
        norm_mismatch = self.parameters.mismatch_rate / total_rate
        norm_insertion = self.parameters.insertion_rate / total_rate
        rand = random.random()

        if rand < norm_mismatch:
            return "mismatch"
        if rand < norm_mismatch + norm_insertion:
            return "insertion"
        return "deletion"

    @staticmethod
    def calculate_quality_scores(
        sequence_length: int, base_quality: int = 30
    ) -> List[int]:
        """Calculate quality scores for a sequence.

        Args:
            sequence_length: Length of the sequence
            base_quality: Base quality score (Phred scale)

        Returns:
            List of ``sequence_length`` copies of ``base_quality``.

        Raises:
            ValueError: If ``sequence_length`` is negative.
        """
        if sequence_length < 0:
            raise ValueError("Sequence length cannot be negative.")
        return [base_quality] * sequence_length

    def introduce_methylation_errors(
        self, methylation_sites: List[Dict], error_rate: float = 0.01
    ) -> List[Dict]:
        """Introduce errors in methylation detection.

        Args:
            methylation_sites: List of methylation site dictionaries
            error_rate: Rate of methylation detection errors

        Returns:
            Modified list of methylation sites without altering the original.
            Every returned dict carries a boolean ``error`` key; when an
            error is drawn, ``detected`` is inverted (a missing ``detected``
            is treated as ``True``).

        Raises:
            ValueError: If ``error_rate`` is outside [0, 1].
        """
        if not 0 <= error_rate <= 1:
            raise ValueError("Methylation error rate must be between 0 and 1")

        modified_sites = []
        for site in methylation_sites:
            site_copy = site.copy()  # Avoid modifying the original list of dicts
            if random.random() < error_rate:
                # Flip methylation state
                site_copy["detected"] = not site_copy.get("detected", True)
                site_copy["error"] = True
            else:
                site_copy["error"] = False
            modified_sites.append(site_copy)
        return modified_sites

    def simulate_sequencing_artifacts(
        self, sequence: str, position: int
    ) -> Tuple[str, Dict]:
        """Simulate sequencing artifacts at a specific position.

        One of three artifact types is picked uniformly at random:
        ``stutter`` duplicates the base, ``dropout`` removes it, and
        ``amplification_bias`` replaces a non-A/T base with A or T.

        Args:
            sequence: DNA sequence
            position: Position to introduce artifact

        Returns:
            Tuple of (modified_sequence, artifact_info). ``artifact_info``
            always has ``type`` and ``position``; the remaining keys depend
            on the artifact type.

        Raises:
            ValueError: If ``position`` is not a valid index of ``sequence``.
        """
        if not 0 <= position < len(sequence):
            raise ValueError("Position must be a valid index in the sequence.")

        artifact_types = ["stutter", "dropout", "amplification_bias"]
        artifact_type = random.choice(artifact_types)
        modified_sequence = list(sequence)
        artifact_info = {"type": artifact_type, "position": position}

        if artifact_type == "stutter":
            base = modified_sequence[position]
            modified_sequence.insert(position, base)
            artifact_info["repeated_base"] = base
        elif artifact_type == "dropout":
            deleted_base = modified_sequence.pop(position)
            artifact_info["deleted_base"] = deleted_base
        elif artifact_type == "amplification_bias":
            base_to_change = modified_sequence[position]
            bias_bases = ["A", "T"]  # Example bias
            # A base that is already A or T is left untouched; in that case
            # no original/biased keys are added to artifact_info.
            if base_to_change not in bias_bases:
                new_base = random.choice(bias_bases)
                modified_sequence[position] = new_base
                artifact_info["original_base"] = base_to_change
                artifact_info["biased_base"] = new_base
        return "".join(modified_sequence), artifact_info
@@ -0,0 +1,116 @@
1
+ from __future__ import annotations
2
+
3
+ import random
4
+ from dataclasses import dataclass
5
+ from enum import Enum
6
+
7
+
8
+ # Methylation modifications
9
class ModificationType(Enum):
    """Cytosine modification kinds a methylation site can carry.

    Each member's value is the short text code of the modification, so a
    member can be looked up from its code, e.g. ``ModificationType("5mC")``.
    """

    C5MC = "5mC"  # 5-methylcytosine
    C5HMC = "5hmC"  # 5-hydroxymethylcytosine
    C5FC = "5fC"  # 5-formylcytosine
    C5CAC = "5caC"  # 5-carboxylcytosine
14
+
15
+
16
class StrandOrientation(Enum):
    """Strand a read is taken from: forward, backward, or chosen at random."""

    FORWARD = "+"
    BACKWARD = "-"
    RANDOM = "?"

    def to_is_reverse(self) -> bool:
        """Translate the orientation into an ``is_reverse`` flag.

        Returns:
            ``False`` for FORWARD, ``True`` for BACKWARD, and a fresh random
            boolean on every call for RANDOM.

        Raises:
            ValueError: If the member is none of the three known ones.
        """
        fixed_strands = {
            StrandOrientation.FORWARD: False,
            StrandOrientation.BACKWARD: True,
        }
        if self in fixed_strands:
            return fixed_strands[self]
        if self is StrandOrientation.RANDOM:
            return random.choice([True, False])
        raise ValueError(f"Unsupported strand orientation: {self}")

    def __repr__(self):
        """Show the bare strand symbol ("+", "-" or "?")."""
        return self.value
32
+
33
+
34
@dataclass(frozen=True)
class Chromosome:
    """Immutable record pairing a chromosome name with its length."""

    name: str  # chromosome identifier
    length: int  # NOTE(review): presumably the length in bases; not validated here
42
+
43
+
44
@dataclass(frozen=True, order=True)
class MethylationSite:
    """A modifiable cytosine site and its probability of being methylated.

    Instances are immutable and ordered field by field, starting with
    ``position``.
    """

    position: int  # Zero-based position in reference
    context: str = "CG"  # Methylation context (e.g., "CG", "CHG")
    modification: ModificationType = ModificationType("5mC")
    methylation_prob: float = 0.5

    def __post_init__(self):
        """Validate methylation site parameters.

        Raises:
            ValueError: If the probability is outside [0, 1], the position
                is negative, or the context is empty.
        """
        prob = self.methylation_prob
        if not 0.0 <= prob <= 1.0:
            raise ValueError(f"methylation_prob must be in [0, 1], got {prob}")
        if self.position < 0:
            raise ValueError("Position must be non-negative")
        if not self.context:
            raise ValueError("Context cannot be empty")

    def with_position(self, new_position: int) -> MethylationSite:
        """Return a copy of this site relocated to ``new_position``."""
        return MethylationSite(
            new_position,
            self.context,
            self.modification,
            self.methylation_prob,
        )

    def get_cytosine_position(self, is_reverse_strand: bool):
        """Return the cytosine coordinate for the requested strand.

        On the reverse strand the cytosine sits one base after ``position``;
        on the forward strand it is ``position`` itself.
        """
        if is_reverse_strand:
            return self.position + 1
        return self.position
76
+
77
+
78
@dataclass(frozen=True)
class GenomicInterval:
    """
    A stranded stretch of a genome in half-open coordinates [start, end).

    Attributes:
        start (int): First position covered (inclusive).
        end (int): Position just past the last one covered (exclusive).
        is_reverse (bool): Strand flag stored with the interval.
    """

    start: int  # inclusive
    end: int  # exclusive (half-open)
    is_reverse: bool

    def __post_init__(self):
        """Reject negative starts and intervals that end before they begin."""
        lower, upper = self.start, self.end
        if lower < 0:
            raise ValueError("Start position must be non-negative")
        if upper < lower:
            raise ValueError("End position must not be smaller than start position")

    @property
    def length(self) -> int:
        """Number of positions covered, i.e. ``end - start``."""
        return self.end - self.start

    def contains(self, position: int) -> bool:
        """
        Tell whether ``position`` falls inside the interval.

        Args:
            position (int): Genomic position to check.

        Returns:
            bool: True when ``start <= position < end``.
        """
        # pylint: disable=W0511
        # TODO: validate consistency on BW strand
        at_or_after_start = self.start <= position
        return at_or_after_start and position < self.end
@@ -0,0 +1,87 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import os
5
+ from abc import ABC, abstractmethod
6
+ from functools import lru_cache
7
+ from urllib.request import urlretrieve
8
+ from urllib.error import URLError, HTTPError
9
+ from typing import Optional
10
+
11
+ from pysam.libcfaidx import FastxFile
12
+ from pysam import SamtoolsError # more specific error
13
+
14
+ from pymethyl2sam.core.genomics import GenomicInterval
15
+
16
+
17
+ # pylint: disable=R0903
18
class ReferenceGenomeProvider(ABC):
    """Interface for objects that can supply reference sequence for a region."""

    @abstractmethod
    def get_sequence(self, chromosome: str, region: GenomicInterval) -> str:
        """Return the reference bases of ``chromosome`` covered by ``region``.

        Args:
            chromosome: Name of the chromosome to read from.
            region: Interval whose sequence is requested.

        Returns:
            The sequence for the region; concrete providers define the source.
        """
22
+
23
+
24
+ # pylint: disable=R0903
25
class ConstantReferenceGenome(ReferenceGenomeProvider):
    """Provider whose reference is the base "A" at every position."""

    def get_sequence(self, chromosome: str, region: GenomicInterval) -> str:
        """Return ``region.length`` copies of "A"; ``chromosome`` is ignored."""
        base_count = region.length
        return "A" * base_count
28
+
29
+
30
+ # pylint: disable=R0903
31
class Hg38ReferenceGenome(ReferenceGenomeProvider):
    """Reference provider backed by per-chromosome gzipped FASTA files.

    A file named ``<chromosome>.fa.gz`` is downloaded from ``base_url`` into
    ``cache_dir`` the first time a chromosome is requested. Successfully
    parsed sequences are also kept in memory on the instance.
    """

    # Upper bound on chromosome sequences held in memory per instance.
    _MAX_CACHED_CHROMOSOMES = 32

    def __init__(
        self,
        base_url: str = "http://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/",
        cache_dir: str = "hg38_cache",
    ):
        """Remember the download location and create the cache directory.

        Args:
            base_url: URL prefix to which ``<chromosome>.fa.gz`` is appended.
            cache_dir: Directory where downloaded FASTA files are stored.
        """
        self.base_url = base_url
        self.cache_dir = cache_dir
        # chromosome name -> full sequence. Kept per instance (rather than in
        # an lru_cache on the method) so instances can be garbage collected
        # and failed lookups are never memoised.
        self._sequence_cache = {}
        os.makedirs(self.cache_dir, exist_ok=True)

    def get_sequence(self, chromosome: str, region: GenomicInterval) -> str:
        """Public method to get a sequence slice with boundary checks.

        Args:
            chromosome: Chromosome name; also the FASTA record name looked up.
            region: Half-open interval to extract.

        Returns:
            The bases ``[region.start, region.end)`` of the chromosome.

        Raises:
            FileNotFoundError: If the chromosome cannot be downloaded or read.
            ValueError: If ``region.end`` lies beyond the chromosome end.
        """
        chromosome_sequence = self._get_full_chromosome_sequence(chromosome)
        if chromosome_sequence is None:
            raise FileNotFoundError(
                f"Could not retrieve sequence for chromosome '{chromosome}'. "
                f"Expected file: {chromosome}.fa.gz in {self.cache_dir}"
            )

        chromosome_length = len(chromosome_sequence)
        if region.end > chromosome_length:
            raise ValueError(
                f"Requested interval {region.start}-{region.end} is outside the "
                f"bounds of chromosome '{chromosome}' (length: {chromosome_length})."
            )

        return chromosome_sequence[region.start : region.end]

    def _get_full_chromosome_sequence(self, chromosome: str) -> Optional[str]:
        """
        Downloads, reads, and caches the entire sequence for a given chromosome.

        Returns:
            The full sequence, or ``None`` when the FASTA file cannot be read
            or holds no record named ``chromosome``. ``None`` results are not
            cached, so a later call retries.

        Raises:
            FileNotFoundError: If the FASTA file has to be downloaded and the
                download fails.
        """
        cached = self._sequence_cache.get(chromosome)
        if cached is not None:
            return cached

        fasta_filename = f"{chromosome}.fa.gz"
        fasta_path = os.path.join(self.cache_dir, fasta_filename)

        if not os.path.exists(fasta_path):
            self._download_fasta(fasta_filename, fasta_path)

        try:
            with FastxFile(fasta_path) as ref:
                for entry in ref:
                    if entry.name == chromosome:
                        logging.info("Caching full sequence for %s", entry.name)
                        self._remember(chromosome, entry.sequence)
                        return entry.sequence
        except (OSError, SamtoolsError) as e:
            logging.error("Error reading FASTA file '%s': %s", fasta_path, e)

        return None

    def _download_fasta(self, fasta_filename: str, fasta_path: str) -> None:
        """Fetch ``fasta_filename`` so that ``fasta_path`` is complete or absent."""
        # Download next to the final path and rename afterwards: an
        # interrupted transfer must not leave a truncated file that later
        # calls would mistake for a valid cache entry.
        partial_path = fasta_path + ".part"
        logging.info(
            "Downloading %s to cache directory '%s'...",
            fasta_filename,
            self.cache_dir,
        )
        try:
            urlretrieve(self.base_url + fasta_filename, partial_path)
        except (URLError, HTTPError) as e:
            try:
                os.remove(partial_path)
            except OSError:
                # Best-effort cleanup; the download error below is what matters.
                pass
            raise FileNotFoundError(
                f"Failed to download {fasta_filename}: {e.reason}"
            ) from e
        os.replace(partial_path, fasta_path)

    def _remember(self, chromosome: str, sequence: str) -> None:
        """Store a parsed sequence, evicting the oldest entry when full."""
        if len(self._sequence_cache) >= self._MAX_CACHED_CHROMOSOMES:
            # dicts keep insertion order, so the first key is the oldest.
            del self._sequence_cache[next(iter(self._sequence_cache))]
        self._sequence_cache[chromosome] = sequence
@@ -0,0 +1,221 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import dataclass
5
+ from random import randint
6
+ from typing import Iterable
7
+
8
+ from pymethyl2sam.core.genomics import (
9
+ MethylationSite,
10
+ StrandOrientation,
11
+ GenomicInterval,
12
+ )
13
+
14
+
15
@dataclass(frozen=True)
class ReadQuality:
    """Sequencing and mapping quality values attached to a read.

    ``sequencing_score`` must lie in [0, 50] and ``mapping_score`` in
    [0, 255]; both bounds are inclusive.
    """

    sequencing_score: int = 32
    mapping_score: int = 100

    def __post_init__(self):
        """Validate both scores, sequencing first.

        Raises:
            ValueError: If a score is outside its allowed range.
        """
        upper_bounds = (("sequencing_score", 50), ("mapping_score", 255))
        for field_name, upper in upper_bounds:
            if not 0 <= getattr(self, field_name) <= upper:
                raise ValueError(f"{field_name} must be in the range [0, {upper}].")
27
+
28
+
29
@dataclass(frozen=True, order=True)
class ReadTemplate(GenomicInterval):
    """A genomic interval enriched with what is needed to emit one read.

    Adds the read's quality values and the methylation sites that fall
    inside it to the inherited ``start``/``end``/``is_reverse`` fields.
    """

    quality: ReadQuality  # sequencing and mapping scores for the read
    local_methylated_sites: list[MethylationSite]  # sites attached to this read
35
+
36
+
37
@dataclass(frozen=True)
class ReadGenerationStrategy(ABC):
    """Contract every read-placement strategy has to fulfil."""

    @abstractmethod
    def generate_read_intervals(
        self, region: GenomicInterval, read_length: int
    ) -> Iterable[GenomicInterval]:
        """Produce the intervals of the reads to generate for ``region``.

        Args:
            region: Region the reads are generated for.
            read_length: Length given to every produced interval.
        """

    @property
    @abstractmethod
    def total_reads(self) -> int:
        """Number of reads the strategy produces per region."""
51
+
52
+
53
@dataclass(frozen=True)
class PatternedReadData:
    """One entry of a fixed read pattern: where the read starts and its strand."""

    offset: int  # start of the read, measured from the start of the region
    orientation: StrandOrientation  # strand the read is generated on
59
+
60
+
61
@dataclass(frozen=True)
class PatternStrategy(ReadGenerationStrategy):
    """Strategy that places reads at explicitly listed offsets and strands."""

    data: Iterable[PatternedReadData]

    def __post_init__(self):
        """Materialise one-shot iterables so the pattern can be reused.

        ``data`` is read both by ``total_reads`` and on every call to
        ``generate_read_intervals``; a generator would be exhausted by the
        first use. Lists and tuples are stored unchanged.
        """
        if not isinstance(self.data, (list, tuple)):
            # Frozen dataclass: bypass the blocked attribute assignment.
            object.__setattr__(self, "data", tuple(self.data))

    def generate_read_intervals(
        self, region: GenomicInterval, read_length: int
    ) -> Iterable[GenomicInterval]:
        """Yield one interval per pattern entry.

        Each interval starts at ``region.start + offset`` and spans
        ``read_length`` positions. A RANDOM orientation is resolved anew for
        every yielded interval.
        """
        for read in self.data:
            start = region.start + read.offset
            yield GenomicInterval(
                start=start,
                end=start + read_length,
                is_reverse=read.orientation.to_is_reverse(),
            )

    @property
    def total_reads(self) -> int:
        """Number of entries in the pattern."""
        return len(self.data)

    @staticmethod
    def from_offset_orientation_pairs(
        offsets: Iterable[int], orientations: Iterable[StrandOrientation | str]
    ) -> PatternStrategy:
        """Create a PatternStrategy from paired offsets and orientations.

        Args:
            offsets: Iterable of offset positions
            orientations: Iterable of strand orientations (can be strings or StrandOrientation)

        Returns:
            PatternStrategy configured with the provided data. Pairing stops
            at the shorter of the two iterables.
        """
        data = [
            PatternedReadData(
                offset=offset,
                orientation=(
                    orientation
                    if isinstance(orientation, StrandOrientation)
                    else StrandOrientation(orientation)
                ),
            )
            for offset, orientation in zip(offsets, orientations)
        ]
        return PatternStrategy(data)

    @staticmethod
    def from_offsets(
        offsets: Iterable[int], orientation: StrandOrientation
    ) -> PatternStrategy:
        """Create a PatternStrategy from offsets with a single orientation.

        Args:
            offsets: Iterable of offset positions
            orientation: Single strand orientation for all reads

        Returns:
            PatternStrategy configured with the provided data
        """
        data = [
            PatternedReadData(
                offset=offset,
                orientation=orientation,
            )
            for offset in offsets
        ]
        return PatternStrategy(data)
127
+
128
+
129
@dataclass(frozen=True)
class RandomStrategy(ReadGenerationStrategy):
    """Strategy for generating reads at random positions."""

    reads_per_region: int = 200
    orientation: StrandOrientation = StrandOrientation.RANDOM

    def __post_init__(self):
        """Require a positive read count.

        Raises:
            ValueError: If ``reads_per_region`` is zero or negative.
        """
        if self.reads_per_region <= 0:
            raise ValueError("reads_per_region must be a positive integer.")

    def generate_read_intervals(
        self, region: GenomicInterval, read_length: int
    ) -> Iterable[GenomicInterval]:
        """Yield ``reads_per_region`` intervals that overlap ``region``.

        Start positions are drawn uniformly so that each read shares at
        least one base with the region: the earliest start ends on the
        region's first base, the latest start sits on its last base. Starts
        are clamped at 0.

        Args:
            region: Region the reads must overlap.
            read_length: Length of every generated interval.
        """
        min_start = max(region.start - read_length + 1, 0)
        # The last start that still overlaps the half-open region is
        # region.end - 1. The max() guard keeps the range valid for an empty
        # region starting at 0.
        max_start = max(region.end - 1, min_start)
        for _ in range(self.reads_per_region):
            start = randint(min_start, max_start)
            yield GenomicInterval(
                start=start,
                end=start + read_length,
                is_reverse=self.orientation.to_is_reverse(),
            )

    @property
    def total_reads(self) -> int:
        """Number of reads generated per region."""
        return self.reads_per_region
156
+
157
+
158
@dataclass(frozen=True)
class EmptyStrategy(ReadGenerationStrategy):
    """Placeholder strategy: no reads are ever produced."""

    def generate_read_intervals(
        self, region: GenomicInterval, read_length: int
    ) -> Iterable[GenomicInterval]:
        """Return a new empty list; both arguments are ignored."""
        return list()

    @property
    def total_reads(self) -> int:
        """Always zero."""
        return 0
170
+
171
+
172
@dataclass(frozen=True)
class ReadGenerator:
    """Generates reads using a specified strategy."""

    strategy: ReadGenerationStrategy
    read_length: int = 150
    quality: ReadQuality = ReadQuality()

    def __post_init__(self):
        """Require a positive read length.

        Raises:
            ValueError: If ``read_length`` is zero or negative.
        """
        if self.read_length <= 0:
            raise ValueError("read_length must be a positive integer.")

    def generate_reads(
        self,
        region: GenomicInterval,
        cpg_sites: Iterable[MethylationSite],
    ) -> Iterable[ReadTemplate]:
        """Yield one read template per interval produced by the strategy.

        Args:
            region: Region handed to the strategy.
            cpg_sites: Methylation sites in reference coordinates. May be any
                iterable, including a generator.

        Yields:
            ReadTemplate carrying the interval's coordinates and strand, this
            generator's quality, and the sites contained in the interval with
            positions rebased to the interval start.
        """
        # The sites are scanned once per read, so a one-shot iterable has to
        # be materialised first or every read after the first gets no sites.
        sites = cpg_sites if isinstance(cpg_sites, (list, tuple)) else tuple(cpg_sites)
        for interval in self.strategy.generate_read_intervals(region, self.read_length):
            local_methylated_sites = [
                site.with_position(site.position - interval.start)
                for site in sites
                if interval.contains(site.position)
            ]
            yield ReadTemplate(
                start=interval.start,
                end=interval.end,
                is_reverse=interval.is_reverse,
                quality=self.quality,
                local_methylated_sites=local_methylated_sites,
            )

    def __repr__(self):
        """Return a string representation of the ReadGenerator.

        Fields are rendered as ``name:value`` with underscores in names
        replaced by spaces.
        """

        def format_value(val) -> str:
            """Format a value for display in the representation."""
            # A single-element list is shown as its only element.
            if isinstance(val, list) and len(val) == 1:
                val = val[0]
            # Short printable ASCII strings are shown without quotes.
            if (
                isinstance(val, str)
                and val.isascii()
                and 0 < len(val) <= 3
                and val.isprintable()
            ):
                return val
            return repr(val)

        parts = [
            f"{key.replace('_', ' ')}:{format_value(value)}"
            for key, value in self.__dict__.items()
        ]
        return f"{self.__class__.__name__}({', '.join(parts)})"
@@ -0,0 +1,5 @@
1
+ """Optional parsers and I/O for FASTA, BED, YAML, JSON."""
2
+
3
+ from .genome_loader import GenomeLoader
4
+
5
+ __all__ = ["GenomeLoader"]