pymethyl2sam 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pymethyl2sam/__init__.py +17 -0
- pymethyl2sam/core/__init__.py +6 -0
- pymethyl2sam/core/errors.py +201 -0
- pymethyl2sam/core/genomics.py +116 -0
- pymethyl2sam/core/reference_genome.py +87 -0
- pymethyl2sam/core/sequencing.py +221 -0
- pymethyl2sam/io/__init__.py +5 -0
- pymethyl2sam/io/genome_loader.py +166 -0
- pymethyl2sam/simulator/__init__.py +15 -0
- pymethyl2sam/simulator/simulator.py +208 -0
- pymethyl2sam/simulator/summary.py +67 -0
- pymethyl2sam/utils/__init__.py +6 -0
- pymethyl2sam/utils/constants.py +55 -0
- pymethyl2sam/utils/logging.py +60 -0
- pymethyl2sam/utils/pysam.py +96 -0
- pymethyl2sam-0.1.2.dist-info/METADATA +267 -0
- pymethyl2sam-0.1.2.dist-info/RECORD +19 -0
- pymethyl2sam-0.1.2.dist-info/WHEEL +5 -0
- pymethyl2sam-0.1.2.dist-info/top_level.txt +1 -0
pymethyl2sam/__init__.py
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
"""
|
2
|
+
PyMethyl2Sam - A Python library for methylation data processing and BAM file generation.
|
3
|
+
|
4
|
+
Copyright (c) 2025
|
5
|
+
"""
|
6
|
+
|
7
|
+
from importlib.metadata import PackageNotFoundError, version

try:
    __version__ = version("pymethyl2sam")
except PackageNotFoundError:
    # No installed distribution metadata is available (for example when the
    # package directory is put on sys.path without being installed). Fall back
    # to a placeholder instead of making ``import pymethyl2sam`` itself fail.
    __version__ = "unknown"

# Import core components
from .core.genomics import *
from .core.sequencing import *
from .core.errors import *
from .simulator import MethylationSimulator

# Only the simulator is exported by ``from pymethyl2sam import *``; the names
# pulled in by the star imports above stay reachable as package attributes.
__all__ = ["MethylationSimulator"]
|
@@ -0,0 +1,201 @@
|
|
1
|
+
"""Error model for simulating sequencing errors and mutations."""
|
2
|
+
|
3
|
+
import random
|
4
|
+
from dataclasses import dataclass
|
5
|
+
from typing import Dict, List, Tuple
|
6
|
+
|
7
|
+
|
8
|
+
@dataclass
class ErrorParameters:
    """Rates for the three simulated error kinds.

    Each rate must lie in the closed interval [0, 1]; construction raises
    ``ValueError`` otherwise.
    """

    mismatch_rate: float
    insertion_rate: float
    deletion_rate: float

    def __post_init__(self):
        """Reject any rate outside [0, 1], checking mismatch, insertion, deletion in turn."""
        labelled_rates = (
            ("Mismatch", self.mismatch_rate),
            ("Insertion", self.insertion_rate),
            ("Deletion", self.deletion_rate),
        )
        for label, rate in labelled_rates:
            if not 0 <= rate <= 1:
                raise ValueError(f"{label} rate must be between 0 and 1")
|
24
|
+
|
25
|
+
|
26
|
+
class ErrorModel:
    """Handles sequencing error simulation and mutations.

    Every random draw goes through the module-level ``random`` generator, so
    seeding that generator makes the results repeatable.
    """

    def __init__(self, parameters: "ErrorParameters"):
        """Initialize error model.

        Args:
            parameters: Error simulation parameters
        """
        self.parameters = parameters
        # Substitution candidates, keyed by the upper-case original base.
        self._mismatch_bases = {
            "A": ["C", "G", "T"],
            "C": ["A", "G", "T"],
            "G": ["A", "C", "T"],
            "T": ["A", "C", "G"],
            "N": ["A", "C", "G", "T"],  # Handle 'N' bases
        }

    def introduce_errors(self, sequence: str) -> Tuple[str, List[Dict]]:
        """Introduce sequencing errors into a sequence.

        Each base of the input is considered exactly once. With probability
        equal to the sum of the three configured rates an error is applied at
        that base, and its kind is picked by ``_choose_error_type``.

        Args:
            sequence: Original DNA sequence

        Returns:
            Tuple of (modified_sequence, error_log). Every log entry is a dict
            with ``"position"`` (index into the sequence as modified so far),
            ``"type"`` and the bases involved.
        """
        modified_sequence = list(sequence)
        error_log = []
        total_error_rate = (
            self.parameters.mismatch_rate
            + self.parameters.insertion_rate
            + self.parameters.deletion_rate
        )

        i = 0
        while i < len(modified_sequence):
            # First, determine if any error should occur at this position
            if random.random() < total_error_rate:
                error_type = self._choose_error_type()
                base = modified_sequence[i]

                if error_type == "mismatch":
                    # Look the base up case-insensitively: a lower-case
                    # (soft-masked) base must not be "substituted" by its own
                    # upper-case form, which the four-base fallback allowed.
                    new_base = random.choice(
                        self._mismatch_bases.get(base.upper(), ["A", "C", "G", "T"])
                    )
                    modified_sequence[i] = new_base
                    error_log.append(
                        {
                            "position": i,
                            "type": "mismatch",
                            "original": base,
                            "new": new_base,
                        }
                    )
                elif error_type == "insertion":
                    inserted_base = random.choice(["A", "C", "G", "T"])
                    modified_sequence.insert(i, inserted_base)
                    error_log.append(
                        {"position": i, "type": "insertion", "inserted": inserted_base}
                    )
                    i += 1  # Skip the newly inserted base to avoid cascading errors
                elif error_type == "deletion":
                    deleted_base = modified_sequence.pop(i)
                    error_log.append(
                        {"position": i, "type": "deletion", "deleted": deleted_base}
                    )
                    continue  # Loop again at the same index `i` on the now-shorter list
            i += 1
        return "".join(modified_sequence), error_log

    def _choose_error_type(self) -> str:
        """Choose an error type based on the relative rates of configured errors.

        Returns:
            Error type ("mismatch", "insertion", or "deletion"), or "none"
            when all three rates are zero.
        """
        total_rate = (
            self.parameters.mismatch_rate
            + self.parameters.insertion_rate
            + self.parameters.deletion_rate
        )
        if total_rate == 0:
            return "none"  # No errors to choose from

        # Normalize rates to create a probability distribution
        norm_mismatch = self.parameters.mismatch_rate / total_rate
        norm_insertion = self.parameters.insertion_rate / total_rate
        rand = random.random()

        if rand < norm_mismatch:
            return "mismatch"
        # When deletions are disabled the remaining mass belongs to insertions;
        # the explicit check keeps float rounding in the normalised sum from
        # ever selecting a deletion with a configured rate of zero.
        if (
            self.parameters.deletion_rate == 0
            or rand < norm_mismatch + norm_insertion
        ):
            return "insertion"
        return "deletion"

    @staticmethod
    def calculate_quality_scores(
        sequence_length: int, base_quality: int = 30
    ) -> List[int]:
        """Calculate quality scores for a sequence.

        Args:
            sequence_length: Length of the sequence
            base_quality: Base quality score (Phred scale)

        Returns:
            List of ``sequence_length`` copies of ``base_quality``

        Raises:
            ValueError: If ``sequence_length`` is negative.
        """
        if sequence_length < 0:
            raise ValueError("Sequence length cannot be negative.")
        return [base_quality] * sequence_length

    def introduce_methylation_errors(
        self, methylation_sites: List[Dict], error_rate: float = 0.01
    ) -> List[Dict]:
        """Introduce errors in methylation detection.

        Args:
            methylation_sites: List of methylation site dictionaries
            error_rate: Rate of methylation detection errors

        Returns:
            Modified list of methylation sites without altering the original.
            Every returned dict carries an ``"error"`` flag; flagged sites have
            their ``"detected"`` value inverted (a missing value counts as True).

        Raises:
            ValueError: If ``error_rate`` is outside [0, 1].
        """
        if not 0 <= error_rate <= 1:
            raise ValueError("Methylation error rate must be between 0 and 1")

        modified_sites = []
        for site in methylation_sites:
            site_copy = site.copy()  # Avoid modifying the original list of dicts
            if random.random() < error_rate:
                # Flip methylation state
                site_copy["detected"] = not site_copy.get("detected", True)
                site_copy["error"] = True
            else:
                site_copy["error"] = False
            modified_sites.append(site_copy)
        return modified_sites

    def simulate_sequencing_artifacts(
        self, sequence: str, position: int
    ) -> Tuple[str, Dict]:
        """Simulate sequencing artifacts at a specific position.

        One of "stutter" (duplicate the base), "dropout" (remove the base) or
        "amplification_bias" (replace a non-A/T base with A or T) is picked at
        random.

        Args:
            sequence: DNA sequence
            position: Position to introduce artifact

        Returns:
            Tuple of (modified_sequence, artifact_info)

        Raises:
            ValueError: If ``position`` is not a valid index into ``sequence``.
        """
        if not 0 <= position < len(sequence):
            raise ValueError("Position must be a valid index in the sequence.")

        artifact_types = ["stutter", "dropout", "amplification_bias"]
        artifact_type = random.choice(artifact_types)
        modified_sequence = list(sequence)
        artifact_info = {"type": artifact_type, "position": position}

        if artifact_type == "stutter":
            base = modified_sequence[position]
            modified_sequence.insert(position, base)
            artifact_info["repeated_base"] = base
        elif artifact_type == "dropout":
            deleted_base = modified_sequence.pop(position)
            artifact_info["deleted_base"] = deleted_base
        elif artifact_type == "amplification_bias":
            base_to_change = modified_sequence[position]
            bias_bases = ["A", "T"]  # Example bias
            # A base that is already A or T is left untouched; the returned
            # info then carries only the type and position.
            if base_to_change not in bias_bases:
                new_base = random.choice(bias_bases)
                modified_sequence[position] = new_base
                artifact_info["original_base"] = base_to_change
                artifact_info["biased_base"] = new_base
        return "".join(modified_sequence), artifact_info
|
@@ -0,0 +1,116 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import random
|
4
|
+
from dataclasses import dataclass
|
5
|
+
from enum import Enum
|
6
|
+
|
7
|
+
|
8
|
+
# Methylation modifications
|
9
|
+
class ModificationType(Enum):
    """Cytosine modification kinds, valued by their usual short names."""

    C5MC = "5mC"  # 5-methylcytosine
    C5HMC = "5hmC"  # 5-hydroxymethylcytosine
    C5FC = "5fC"  # 5-formylcytosine
    C5CAC = "5caC"  # 5-carboxylcytosine
|
14
|
+
|
15
|
+
|
16
|
+
class StrandOrientation(Enum):
    """Strand selection for a read; ``RANDOM`` defers the choice to call time."""

    FORWARD = "+"
    BACKWARD = "-"
    RANDOM = "?"

    def to_is_reverse(self) -> bool:
        """Translate this orientation into an ``is_reverse`` flag.

        ``FORWARD`` maps to False and ``BACKWARD`` to True. ``RANDOM`` draws a
        fresh value from the module-level ``random`` generator on every call.

        Raises:
            ValueError: For any member not handled above.
        """
        if self is StrandOrientation.RANDOM:
            return random.choice([True, False])
        if self in (StrandOrientation.FORWARD, StrandOrientation.BACKWARD):
            return self is StrandOrientation.BACKWARD
        raise ValueError(f"Unsupported strand orientation: {self}")

    def __repr__(self):
        """Return the bare symbol ("+", "-" or "?") rather than the default enum repr."""
        return self.value
|
32
|
+
|
33
|
+
|
34
|
+
@dataclass(frozen=True)
class Chromosome:
    """Immutable record pairing a chromosome name with its length."""

    name: str
    length: int
|
42
|
+
|
43
|
+
|
44
|
+
@dataclass(frozen=True, order=True)
class MethylationSite:
    """A modification site with the probability that it is methylated.

    Instances are immutable and ordered field by field, starting with
    ``position``.
    """

    position: int  # Zero-based position in reference
    context: str = "CG"  # Methylation context (e.g., "CG", "CHG")
    modification: ModificationType = ModificationType.C5MC
    methylation_prob: float = 0.5

    def __post_init__(self):
        """Validate probability, position and context, in that order."""
        prob = self.methylation_prob
        if not 0.0 <= prob <= 1.0:
            raise ValueError(f"methylation_prob must be in [0, 1], got {prob}")
        if self.position < 0:
            raise ValueError("Position must be non-negative")
        if not self.context:
            raise ValueError("Context cannot be empty")

    def with_position(self, new_position: int) -> MethylationSite:
        """Return a copy of this site located at ``new_position``.

        The copy goes through normal construction, so ``new_position`` is
        validated like any other position.
        """
        return MethylationSite(
            position=new_position,
            context=self.context,
            modification=self.modification,
            methylation_prob=self.methylation_prob,
        )

    def get_cytosine_position(self, is_reverse_strand: bool):
        """Return ``position`` for the forward strand and ``position + 1`` for the reverse strand."""
        strand_shift = 1 if is_reverse_strand else 0
        return self.position + strand_shift
|
76
|
+
|
77
|
+
|
78
|
+
@dataclass(frozen=True)
class GenomicInterval:
    """
    Half-open genomic interval ``[start, end)`` with a strand flag.

    Attributes:
        start (int): Start coordinate (inclusive).
        end (int): End coordinate (exclusive).
        is_reverse (bool): Strand flag stored alongside the coordinates.
    """

    start: int  # inclusive
    end: int  # exclusive (half-open)
    is_reverse: bool

    def __post_init__(self):
        """Reject a negative start or an end that lies before the start."""
        lower, upper = self.start, self.end
        if lower < 0:
            raise ValueError("Start position must be non-negative")
        if upper < lower:
            raise ValueError("End position must not be smaller than start position")

    @property
    def length(self) -> int:
        """Number of positions covered, i.e. ``end - start``."""
        span = self.end - self.start
        return span

    def contains(self, position: int) -> bool:
        """
        Tell whether ``position`` falls inside ``[start, end)``.

        Args:
            position (int): Genomic position to check.

        Returns:
            bool: True if ``start <= position < end``.
        """
        # pylint: disable=W0511
        # TODO: validate consistency on BW strand
        at_or_after_start = self.start <= position
        return at_or_after_start and position < self.end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import logging
|
4
|
+
import os
|
5
|
+
from abc import ABC, abstractmethod
|
6
|
+
from functools import lru_cache
|
7
|
+
from urllib.request import urlretrieve
|
8
|
+
from urllib.error import URLError, HTTPError
|
9
|
+
from typing import Optional
|
10
|
+
|
11
|
+
from pysam.libcfaidx import FastxFile
|
12
|
+
from pysam import SamtoolsError # more specific error
|
13
|
+
|
14
|
+
from pymethyl2sam.core.genomics import GenomicInterval
|
15
|
+
|
16
|
+
|
17
|
+
# pylint: disable=R0903
|
18
|
+
class ReferenceGenomeProvider(ABC):
    """Interface for objects that supply reference sequence for a region."""

    @abstractmethod
    def get_sequence(self, chromosome: str, region: GenomicInterval) -> str:
        """Get the reference sequence for a specific region.

        Args:
            chromosome: Name of the chromosome to read from.
            region: Interval whose sequence is requested.

        Returns:
            The sequence for ``region`` as a string.
        """
|
22
|
+
|
23
|
+
|
24
|
+
# pylint: disable=R0903
|
25
|
+
class ConstantReferenceGenome(ReferenceGenomeProvider):
    """Provider whose sequence is always a run of ``"A"``; ``chromosome`` is ignored."""

    def get_sequence(self, chromosome: str, region: GenomicInterval) -> str:
        """Return ``region.length`` copies of ``"A"``."""
        base_count = region.length
        return base_count * "A"
|
28
|
+
|
29
|
+
|
30
|
+
# pylint: disable=R0903
|
31
|
+
class Hg38ReferenceGenome(ReferenceGenomeProvider):
    """Reference provider backed by per-chromosome ``<name>.fa.gz`` files.

    Files are looked up in ``cache_dir`` and downloaded from ``base_url`` when
    missing. Sequences that were read successfully are memoised per instance,
    so the memory is released together with the instance.
    """

    # Upper bound on chromosome sequences kept in memory per instance.
    _MAX_CACHED_SEQUENCES = 32

    def __init__(
        self,
        base_url: str = "http://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/",
        cache_dir: str = "hg38_cache",
    ):
        """Store the download location and make sure ``cache_dir`` exists.

        Args:
            base_url: URL prefix to which ``<chromosome>.fa.gz`` is appended.
            cache_dir: Directory holding the downloaded FASTA files.
        """
        self.base_url = base_url
        self.cache_dir = cache_dir
        # chromosome name -> full sequence, ordered least recently used first.
        self._sequence_cache = {}
        os.makedirs(self.cache_dir, exist_ok=True)

    def get_sequence(self, chromosome: str, region: GenomicInterval) -> str:
        """Public method to get a sequence slice with boundary checks.

        Args:
            chromosome: Chromosome name; also the stem of the FASTA file name.
            region: Half-open interval to extract.

        Returns:
            The bases ``[region.start, region.end)`` of the chromosome.

        Raises:
            FileNotFoundError: If the chromosome cannot be downloaded or read.
            ValueError: If ``region.end`` exceeds the chromosome length.
        """
        chromosome_sequence = self._get_full_chromosome_sequence(chromosome)
        if chromosome_sequence is None:
            raise FileNotFoundError(
                f"Could not retrieve sequence for chromosome '{chromosome}'. "
                f"Expected file: {chromosome}.fa.gz in {self.cache_dir}"
            )

        chromosome_length = len(chromosome_sequence)
        if region.end > chromosome_length:
            raise ValueError(
                f"Requested interval {region.start}-{region.end} is outside the "
                f"bounds of chromosome '{chromosome}' (length: {chromosome_length})."
            )

        return chromosome_sequence[region.start : region.end]

    def _get_full_chromosome_sequence(self, chromosome: str) -> Optional[str]:
        """
        Downloads, reads, and caches the entire sequence for a given chromosome.

        Returns None when the FASTA file cannot be read or holds no entry named
        ``chromosome``. Failures are not cached, so a later call retries.
        """
        cache = self._sequence_cache
        cached = cache.pop(chromosome, None)
        if cached is not None:
            cache[chromosome] = cached  # re-insert to mark as most recently used
            return cached

        fasta_filename = f"{chromosome}.fa.gz"
        fasta_path = os.path.join(self.cache_dir, fasta_filename)

        if not os.path.exists(fasta_path):
            self._download_fasta(fasta_filename, fasta_path)

        sequence = self._read_chromosome(fasta_path, chromosome)
        if sequence is not None:
            # Evict the least recently used entries to respect the bound.
            while len(cache) >= self._MAX_CACHED_SEQUENCES:
                del cache[next(iter(cache))]
            cache[chromosome] = sequence
        return sequence

    def _download_fasta(self, fasta_filename: str, fasta_path: str) -> None:
        """Fetch ``fasta_filename`` from ``base_url`` into ``fasta_path``."""
        # Download to a side file and rename on success, so an interrupted
        # transfer never leaves a truncated file at the final cache path.
        partial_path = fasta_path + ".part"
        try:
            logging.info(
                "Downloading %s to cache directory '%s'...",
                fasta_filename,
                self.cache_dir,
            )
            urlretrieve(self.base_url + fasta_filename, partial_path)
            os.replace(partial_path, fasta_path)
        except (URLError, HTTPError) as e:
            raise FileNotFoundError(
                f"Failed to download {fasta_filename}: {e.reason}"
            ) from e
        finally:
            # Only present if the transfer did not complete.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    @staticmethod
    def _read_chromosome(fasta_path: str, chromosome: str) -> Optional[str]:
        """Return the sequence of the entry named ``chromosome``, or None."""
        try:
            with FastxFile(fasta_path) as ref:
                for entry in ref:
                    if entry.name == chromosome:
                        logging.info("Caching full sequence for %s", entry.name)
                        return entry.sequence
        except (OSError, SamtoolsError) as e:
            logging.error("Error reading FASTA file '%s': %s", fasta_path, e)

        return None
|
@@ -0,0 +1,221 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from abc import ABC, abstractmethod
|
4
|
+
from dataclasses import dataclass
|
5
|
+
from random import randint
|
6
|
+
from typing import Iterable
|
7
|
+
|
8
|
+
from pymethyl2sam.core.genomics import (
|
9
|
+
MethylationSite,
|
10
|
+
StrandOrientation,
|
11
|
+
GenomicInterval,
|
12
|
+
)
|
13
|
+
|
14
|
+
|
15
|
+
@dataclass(frozen=True)
class ReadQuality:
    """Sequencing and mapping quality scores attached to generated reads."""

    sequencing_score: int = 32
    mapping_score: int = 100

    def __post_init__(self):
        """Check both scores against their inclusive upper bounds (50 and 255)."""
        bounded_scores = (
            ("sequencing_score", self.sequencing_score, 50),
            ("mapping_score", self.mapping_score, 255),
        )
        for field_name, score, upper in bounded_scores:
            if not 0 <= score <= upper:
                raise ValueError(f"{field_name} must be in the range [0, {upper}].")
|
27
|
+
|
28
|
+
|
29
|
+
@dataclass(frozen=True, order=True)
class ReadTemplate(GenomicInterval):
    """A read's interval together with its quality and methylation sites."""

    # Scores applied to the read.
    quality: ReadQuality
    # Sites that fall inside the read.
    # NOTE(review): positions appear to be relative to the read start, as
    # produced by ReadGenerator.generate_reads; confirm for other producers.
    local_methylated_sites: list[MethylationSite]
|
35
|
+
|
36
|
+
|
37
|
+
@dataclass(frozen=True)
class ReadGenerationStrategy(ABC):
    """Abstract base class for read generation strategies."""

    @abstractmethod
    def generate_read_intervals(
        self, region: GenomicInterval, read_length: int
    ) -> Iterable[GenomicInterval]:
        """Produce the intervals of the reads to generate for ``region``.

        Args:
            region: Region the reads are generated for.
            read_length: Length of every produced interval.
        """

    @property
    @abstractmethod
    def total_reads(self) -> int:
        """Number of reads this strategy produces per region."""
|
51
|
+
|
52
|
+
|
53
|
+
@dataclass(frozen=True)
class PatternedReadData:
    """One entry of a read pattern: where the read starts and on which strand."""

    # Added to the region start to obtain the read start.
    offset: int
    # Strand selection for this read.
    orientation: StrandOrientation
|
59
|
+
|
60
|
+
|
61
|
+
@dataclass(frozen=True)
class PatternStrategy(ReadGenerationStrategy):
    """Strategy that places reads at fixed offsets from the region start."""

    data: Iterable[PatternedReadData]

    def __post_init__(self):
        """Snapshot ``data`` so it can be iterated any number of times."""
        # ``data`` may be a one-shot iterator. Without this snapshot, reading
        # ``total_reads`` would consume it and later generation would yield
        # nothing. Lists and tuples are kept as given.
        if not isinstance(self.data, (list, tuple)):
            # The dataclass is frozen, so bypass its __setattr__ guard.
            object.__setattr__(self, "data", tuple(self.data))

    def generate_read_intervals(
        self, region: GenomicInterval, read_length: int
    ) -> Iterable[GenomicInterval]:
        """Yield one interval per pattern entry, in pattern order.

        Args:
            region: Region whose ``start`` anchors the offsets.
            read_length: Length of every yielded interval.
        """
        for read in self.data:
            start = region.start + read.offset
            yield GenomicInterval(
                start=start,
                end=start + read_length,
                is_reverse=read.orientation.to_is_reverse(),
            )

    @property
    def total_reads(self) -> int:
        """Number of pattern entries."""
        return len(self.data)

    @staticmethod
    def from_offset_orientation_pairs(
        offsets: Iterable[int], orientations: Iterable[StrandOrientation | str]
    ) -> PatternStrategy:
        """Create a PatternStrategy from paired offsets and orientations.

        Args:
            offsets: Iterable of offset positions
            orientations: Iterable of strand orientations (can be strings or StrandOrientation)

        Returns:
            PatternStrategy configured with the provided data
        """
        # NOTE: ``zip`` stops at the shorter input, so surplus offsets or
        # orientations are silently ignored.
        data = [
            PatternedReadData(
                offset=offset,
                orientation=(
                    orientation
                    if isinstance(orientation, StrandOrientation)
                    else StrandOrientation(orientation)
                ),
            )
            for offset, orientation in zip(offsets, orientations)
        ]
        return PatternStrategy(data)

    @staticmethod
    def from_offsets(
        offsets: Iterable[int], orientation: StrandOrientation
    ) -> PatternStrategy:
        """Create a PatternStrategy from offsets with a single orientation.

        Args:
            offsets: Iterable of offset positions
            orientation: Single strand orientation for all reads

        Returns:
            PatternStrategy configured with the provided data
        """
        data = [
            PatternedReadData(
                offset=offset,
                orientation=orientation,
            )
            for offset in offsets
        ]
        return PatternStrategy(data)
|
127
|
+
|
128
|
+
|
129
|
+
@dataclass(frozen=True)
class RandomStrategy(ReadGenerationStrategy):
    """Strategy for generating reads at random positions."""

    reads_per_region: int = 200
    orientation: StrandOrientation = StrandOrientation.RANDOM

    def __post_init__(self):
        """Require a strictly positive read count."""
        if self.reads_per_region <= 0:
            raise ValueError("reads_per_region must be a positive integer.")

    def generate_read_intervals(
        self, region: GenomicInterval, read_length: int
    ) -> Iterable[GenomicInterval]:
        """Yield ``reads_per_region`` intervals with uniformly drawn starts.

        Args:
            region: Region around which the reads are placed.
            read_length: Length of every yielded interval.
        """
        # Earliest start still touching the region, clamped at zero.
        lowest_start = max(region.start - read_length + 1, 0)
        # NOTE(review): this upper bound lets a read begin at or after
        # ``region.end``, where it no longer overlaps the region
        # (``region.end - 1`` is the last overlapping start). Confirm intended.
        highest_start = region.end + read_length - 1
        remaining = self.reads_per_region
        while remaining > 0:
            remaining -= 1
            # Draw the start before the strand, as both use the shared generator.
            read_start = randint(lowest_start, highest_start)
            reverse_flag = self.orientation.to_is_reverse()
            yield GenomicInterval(
                start=read_start,
                end=read_start + read_length,
                is_reverse=reverse_flag,
            )

    @property
    def total_reads(self) -> int:
        """Configured number of reads per region."""
        return self.reads_per_region
|
156
|
+
|
157
|
+
|
158
|
+
@dataclass(frozen=True)
class EmptyStrategy(ReadGenerationStrategy):
    """Strategy that generates no reads."""

    def generate_read_intervals(
        self, region: GenomicInterval, read_length: int
    ) -> Iterable[GenomicInterval]:
        """Return an empty list; both arguments are ignored."""
        no_intervals = []
        return no_intervals

    @property
    def total_reads(self) -> int:
        """Always zero."""
        return 0
|
170
|
+
|
171
|
+
|
172
|
+
@dataclass(frozen=True)
class ReadGenerator:
    """Generates reads using a specified strategy."""

    strategy: ReadGenerationStrategy
    read_length: int = 150
    quality: ReadQuality = ReadQuality()

    def __post_init__(self):
        """Require a strictly positive read length."""
        if self.read_length <= 0:
            raise ValueError("read_length must be a positive integer.")

    def generate_reads(
        self,
        region: GenomicInterval,
        cpg_sites: Iterable[MethylationSite],
    ) -> Iterable[ReadTemplate]:
        """Yield a ``ReadTemplate`` for every interval the strategy produces.

        Each template carries the sites whose position lies inside the read,
        with positions shifted to be relative to the read start.

        Args:
            region: Region handed to the strategy.
            cpg_sites: Candidate sites in reference coordinates. Any iterable
                is accepted, including one-shot iterators.
        """
        # The sites are scanned once per read. Snapshot a one-shot iterable up
        # front; otherwise it would be exhausted after the first read and
        # every later read would get no sites.
        if isinstance(cpg_sites, (list, tuple)):
            sites = cpg_sites
        else:
            sites = tuple(cpg_sites)

        for interval in self.strategy.generate_read_intervals(region, self.read_length):
            local_methylated_sites = [
                site.with_position(site.position - interval.start)
                for site in sites
                if interval.contains(site.position)
            ]
            # Pass the interval fields explicitly rather than splatting
            # ``interval.__dict__``, which would forward any extra attributes
            # of an interval subclass as unexpected keyword arguments.
            yield ReadTemplate(
                start=interval.start,
                end=interval.end,
                is_reverse=interval.is_reverse,
                quality=self.quality,
                local_methylated_sites=local_methylated_sites,
            )

    def __repr__(self):
        """Return a string representation of the ReadGenerator."""

        def format_value(val) -> str:
            """Format a value for display in the representation."""
            # A single-element list is shown as its only element.
            if isinstance(val, list) and len(val) == 1:
                val = val[0]
            # Short printable ASCII strings are shown without quotes.
            if (
                isinstance(val, str)
                and val.isascii()
                and 0 < len(val) <= 3
                and val.isprintable()
            ):
                return val
            return repr(val)

        parts = [
            f"{key.replace('_', ' ')}:{format_value(value)}"
            for key, value in self.__dict__.items()
        ]
        return f"{self.__class__.__name__}({', '.join(parts)})"
|