pymethyl2sam 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pymethyl2sam/__init__.py +17 -0
- pymethyl2sam/core/__init__.py +6 -0
- pymethyl2sam/core/errors.py +201 -0
- pymethyl2sam/core/genomics.py +116 -0
- pymethyl2sam/core/reference_genome.py +87 -0
- pymethyl2sam/core/sequencing.py +221 -0
- pymethyl2sam/io/__init__.py +5 -0
- pymethyl2sam/io/genome_loader.py +166 -0
- pymethyl2sam/simulator/__init__.py +15 -0
- pymethyl2sam/simulator/simulator.py +208 -0
- pymethyl2sam/simulator/summary.py +67 -0
- pymethyl2sam/utils/__init__.py +6 -0
- pymethyl2sam/utils/constants.py +55 -0
- pymethyl2sam/utils/logging.py +60 -0
- pymethyl2sam/utils/pysam.py +96 -0
- pymethyl2sam-0.1.2.dist-info/METADATA +267 -0
- pymethyl2sam-0.1.2.dist-info/RECORD +19 -0
- pymethyl2sam-0.1.2.dist-info/WHEEL +5 -0
- pymethyl2sam-0.1.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,166 @@
|
|
1
|
+
"""Genome loader for FASTA files."""
|
2
|
+
|
3
|
+
import os
|
4
|
+
from typing import Any, Dict, Optional, List
|
5
|
+
|
6
|
+
from ..utils.logging import setup_logging
|
7
|
+
|
8
|
+
logger = setup_logging(__name__)
|
9
|
+
|
10
|
+
|
11
|
+
class GenomeLoader:
    """Handles loading genome sequences from FASTA files.

    The loader keeps no state; every method takes the FASTA path explicitly
    and re-reads the file.
    """

    def __init__(self):
        """Initialize genome loader."""

    def load_genome(
        self, genome_file: str, chromosome: Optional[str] = None
    ) -> Dict[str, Any]:
        """Load genome sequences from FASTA file.

        Sequence lines are upper-cased and concatenated per record. Records
        whose header is followed by no sequence lines are not stored, and a
        repeated record name overwrites the earlier record.

        Args:
            genome_file: Path to FASTA file
            chromosome: Specific chromosome to load (optional)

        Returns:
            Dictionary with keys ``"file"`` (the given path), ``"sequences"``
            (record name -> sequence string) and ``"total_length"`` (sum of
            the stored sequence lengths).

        Raises:
            FileNotFoundError: If ``genome_file`` does not exist.
            ValueError: If the requested chromosome is not present, or the
                file yields no sequences at all.
        """
        if not os.path.exists(genome_file):
            raise FileNotFoundError(f"Genome file not found: {genome_file}")

        logger.info(f"Loading genome from: {genome_file}")

        sequences = {}
        current_chr = None
        current_seq = []

        with open(genome_file, "r", encoding="utf-8") as file_handle:
            for line in file_handle:
                line = line.strip()

                if line.startswith(">"):
                    # A new header closes the record collected so far.
                    if current_chr and current_seq:
                        sequences[current_chr] = "".join(current_seq)

                    # Drop the leading '>' before extracting the name.
                    current_chr = self._parse_chromosome_name(line[1:])
                    current_seq = []

                    # When one chromosome was requested, ignore every other
                    # record by clearing the current name.
                    if chromosome and current_chr != chromosome:
                        current_chr = None

                elif current_chr and line:
                    current_seq.append(line.upper())

        # The final record has no following header to flush it.
        if current_chr and current_seq:
            sequences[current_chr] = "".join(current_seq)

        if chromosome and chromosome not in sequences:
            raise ValueError(f"Chromosome {chromosome} not found in genome file")

        if not sequences:
            raise ValueError(f"No sequences found in genome file: {genome_file}")

        genome_data = {
            "file": genome_file,
            "sequences": sequences,
            "total_length": sum(len(seq) for seq in sequences.values()),
        }

        logger.info(
            f"Loaded {len(sequences)} sequences with total length {genome_data['total_length']}"
        )
        return genome_data

    def _parse_chromosome_name(self, header: str) -> str:
        """Parse chromosome name from FASTA header.

        Args:
            header: FASTA header line without the leading '>'

        Returns:
            The first whitespace-delimited word of the header, or
            ``"unknown"`` when the header is empty or only whitespace.
        """
        parts = header.split()
        return parts[0] if parts else "unknown"

    def get_chromosome_list(self, genome_file: str) -> List[str]:
        """Get list of chromosomes in genome file.

        Only header lines are inspected; names are returned in file order and
        duplicates are kept.

        Args:
            genome_file: Path to FASTA file

        Returns:
            List of chromosome names
        """
        with open(genome_file, "r", encoding="utf-8") as file_handle:
            return [
                self._parse_chromosome_name(line[1:].strip())
                for line in file_handle
                if line.startswith(">")
            ]

    def get_sequence_length(self, genome_file: str, chromosome: str) -> int:
        """Get length of a specific chromosome.

        Args:
            genome_file: Path to FASTA file
            chromosome: Chromosome name

        Returns:
            Sequence length, or 0 if the loaded data does not contain the
            chromosome.

        Raises:
            FileNotFoundError: Propagated from ``load_genome``.
            ValueError: Propagated from ``load_genome``, which raises when a
                non-empty ``chromosome`` is not found in the file.
        """
        genome_data = self.load_genome(genome_file, chromosome)
        if chromosome in genome_data["sequences"]:
            return len(genome_data["sequences"][chromosome])
        return 0

    def validate_genome_file(self, genome_file: str) -> Dict[str, Any]:
        """Validate genome file and return statistics.

        Problems are reported through the ``"errors"`` list of the result
        instead of being raised.

        Args:
            genome_file: Path to FASTA file

        Returns:
            Dictionary with keys ``"file_exists"``, ``"total_sequences"``,
            ``"total_length"``, ``"chromosomes"`` and ``"errors"``.
        """
        validation_results = {
            "file_exists": False,
            "total_sequences": 0,
            "total_length": 0,
            "chromosomes": [],
            "errors": [],
        }

        try:
            if not os.path.exists(genome_file):
                validation_results["errors"].append(f"File not found: {genome_file}")
                return validation_results

            validation_results["file_exists"] = True

            genome_data = self.load_genome(genome_file)
            validation_results["total_sequences"] = len(genome_data["sequences"])
            validation_results["total_length"] = genome_data["total_length"]
            validation_results["chromosomes"] = list(genome_data["sequences"].keys())

        # load_genome signals bad content with ValueError (including
        # UnicodeDecodeError) and unreadable paths with OSError subclasses;
        # catching only RuntimeError let all of those escape.
        except (OSError, ValueError, RuntimeError) as ex:
            validation_results["errors"].append(
                f"Error validating genome file: {str(ex)}"
            )

        return validation_results
|
@@ -0,0 +1,15 @@
|
|
1
|
+
"""Orchestration of read and methylation simulation."""
|
2
|
+
|
3
|
+
from ..core.reference_genome import ConstantReferenceGenome
|
4
|
+
from .simulator import (
|
5
|
+
MethylationSimulator,
|
6
|
+
SequencedChromosome,
|
7
|
+
SequencedRegion,
|
8
|
+
)
|
9
|
+
|
10
|
+
__all__ = [
|
11
|
+
"MethylationSimulator",
|
12
|
+
"SequencedChromosome",
|
13
|
+
"SequencedRegion",
|
14
|
+
"ConstantReferenceGenome",
|
15
|
+
]
|
@@ -0,0 +1,208 @@
|
|
1
|
+
"""Simulator class for generating aligned reads from simulated CpG methylation data."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
import logging
|
6
|
+
from dataclasses import dataclass, field
|
7
|
+
from importlib.metadata import version, PackageNotFoundError
|
8
|
+
from random import seed as random_seed, random
|
9
|
+
from typing import Generator, List, Optional, Set
|
10
|
+
|
11
|
+
import pysam
|
12
|
+
from pysam.libcalignedsegment import AlignedSegment
|
13
|
+
from pysam.libcalignmentfile import AlignmentFile, AlignmentHeader
|
14
|
+
|
15
|
+
from pymethyl2sam.core.genomics import GenomicInterval, Chromosome, MethylationSite
|
16
|
+
from pymethyl2sam.core.reference_genome import ReferenceGenomeProvider
|
17
|
+
from pymethyl2sam.core.sequencing import ReadTemplate, ReadGenerator, ReadQuality
|
18
|
+
from pymethyl2sam.utils.pysam import make_sam_dict, generate_mm_tag
|
19
|
+
|
20
|
+
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# NOTE(review): basicConfig runs when this module is imported, so importing the
# simulator configures the root logger at INFO if it has no handlers yet.
# Libraries normally leave root configuration to the application; confirm this
# is intended.
logging.basicConfig(level=logging.INFO)
|
22
|
+
|
23
|
+
|
24
|
+
@dataclass(frozen=True, init=False)
class SequencedRegion:
    """A genomic interval paired with the generator that produces its reads.

    The dataclass is frozen and its generated ``__init__`` is disabled, so the
    hand-written constructor stores the fields via ``object.__setattr__``.
    """

    interval: GenomicInterval
    read_generator: ReadGenerator

    def __init__(self, start: int, end: int, read_generator: ReadGenerator):
        """Build the region from raw coordinates.

        Args:
            start: First positional argument given to ``GenomicInterval``.
            end: Second positional argument given to ``GenomicInterval``.
            read_generator: Generator used to produce reads for this region.
        """
        # The third GenomicInterval argument is always False here
        # (presumably the strand flag, i.e. forward; confirm against
        # GenomicInterval's definition).
        field_values = {
            "interval": GenomicInterval(start, end, False),
            "read_generator": read_generator,
        }
        for attr_name, attr_value in field_values.items():
            object.__setattr__(self, attr_name, attr_value)
|
34
|
+
|
35
|
+
|
36
|
+
@dataclass(frozen=True)
class SequencedChromosome(Chromosome):
    """Chromosome together with the regions to sequence and its methylation sites."""

    regions: List[SequencedRegion]
    cpg_sites: List[MethylationSite] = field(default_factory=list)

    def __post_init__(self):
        """Reject methylation sites whose contexts overlap.

        Sites are ordered by position; each site occupies
        ``len(context)`` bases starting at ``position``.

        Raises:
            ValueError: If a site starts before the previous site ends.
        """
        ordered = sorted(self.cpg_sites, key=lambda site: site.position)
        for idx in range(1, len(ordered)):
            earlier, later = ordered[idx - 1], ordered[idx]
            if later.position >= earlier.position + len(earlier.context):
                continue
            raise ValueError(
                f"Overlapping methylation sites detected:\n"
                f" Site 1: pos={earlier.position}, context='{earlier.context}'\n"
                f" Site 2: pos={later.position}, context='{later.context}'"
            )

    def generate_reads(self) -> Generator[ReadTemplate, None, None]:
        """Yield the read templates of every region, in region order."""
        for region in self.regions:
            logger.debug(f"Generating reads for region: {region}")
            region_templates = region.read_generator.generate_reads(
                region.interval, self.cpg_sites
            )
            yield from region_templates
|
61
|
+
|
62
|
+
|
63
|
+
@dataclass(frozen=True)
class SimulatedRead(GenomicInterval):
    """Represents a simulated read with sequence, methylation sites, and quality."""

    chrom: str
    sequence: str
    methylated_sites: Set[MethylationSite]
    quality: ReadQuality

    @staticmethod
    def from_template(
        template: ReadTemplate, chrom: str, reference_genome: ReferenceGenomeProvider
    ) -> SimulatedRead:
        """Construct a simulated read from a template and reference sequence.

        The reference sequence for the template is fetched and upper-cased,
        then the context of every site in ``template.local_methylated_sites``
        is written into it. Each site is kept as methylated either
        unconditionally (``methylation_prob >= 1.0``) or by a random draw
        against its probability.

        Args:
            template: Read template supplying coordinates, strand, quality and
                its local methylation sites.
            chrom: Chromosome name passed to the reference provider and stored
                on the read.
            reference_genome: Provider queried via ``get_sequence(chrom, template)``.

        Returns:
            The simulated read.
        """
        sequence = bytearray(
            reference_genome.get_sequence(chrom, template).upper(), "ascii"
        )
        sampled_sites: Set[MethylationSite] = set()

        for site in template.local_methylated_sites:
            SimulatedRead._apply_site(sequence, site)
            # No random number is drawn for sites that are always methylated,
            # which keeps the random stream identical for a given seed.
            if site.methylation_prob >= 1.0 or random() < site.methylation_prob:
                sampled_sites.add(site)

        return SimulatedRead(
            chrom=chrom,
            start=template.start,
            end=template.end,
            is_reverse=template.is_reverse,
            sequence=sequence.decode("ascii"),
            quality=template.quality,
            methylated_sites=sampled_sites,
        )

    @staticmethod
    def _apply_site(sequence: bytearray, site: MethylationSite) -> None:
        """Apply a methylation site's context to the sequence in-place.

        Only the part of the context that falls inside the sequence is
        written. A site that starts before index 0 contributes its trailing
        bases, a site that runs past the end contributes its leading bases,
        and a site entirely outside the sequence is ignored. The length of
        ``sequence`` never changes.

        Args:
            sequence: Mutable read sequence, modified in place.
            site: Site providing ``position`` (index into ``sequence``) and
                ``context`` (bases to write from that index).
        """
        context = site.context.encode("ascii")
        seq_start = max(0, site.position)
        seq_end = min(site.position + len(context), len(sequence))
        if seq_start >= seq_end:
            # No overlap. Without this guard an empty or inverted slice
            # assignment could insert bytes and lengthen the read.
            return
        # Skip the context bases that lie before the start of the read.
        ctx_start = seq_start - site.position
        ctx_end = ctx_start + (seq_end - seq_start)
        sequence[seq_start:seq_end] = context[ctx_start:ctx_end]

    def to_aligned_segment(self, header: AlignmentHeader) -> AlignedSegment:
        """Convert this simulated read into a pysam AlignedSegment.

        The segment gets an ``MD`` tag equal to the read length and ``NM`` 0.
        An ``MM`` tag is added only when the read has methylated sites and a
        non-empty sequence.

        Args:
            header: Alignment header the segment is created against.

        Returns:
            The populated aligned segment.
        """
        segment = AlignedSegment.from_dict(
            make_sam_dict(
                chrome=self.chrom,
                start=self.start,
                sequence=self.sequence,
                quality=self.quality,
                is_reverse=self.is_reverse,
            ),
            header=header,
        )
        segment.set_tag("MD", str(self.length), value_type="Z")
        segment.set_tag("NM", 0, value_type="i")

        if self.methylated_sites and self.sequence:
            mm_tag = generate_mm_tag(
                self.sequence,
                self.methylated_sites,
                self.is_reverse,
            )
            segment.set_tag("MM", mm_tag, value_type="Z")

        return segment
|
128
|
+
|
129
|
+
|
130
|
+
@dataclass
class MethylationSimulator:
    """Main simulator class for generating methylation-aware reads."""

    chromosomes: List[SequencedChromosome]
    reference_genome: ReferenceGenomeProvider

    def simulate_reads(
        self,
        output_file: str,
        seed: Optional[int] = None,
        is_sorted: bool = True,
    ) -> None:
        """Simulate reads and write to a BAM file.

        Args:
            output_file: Path to output BAM file
            seed: Random seed for reproducibility
            is_sorted: Whether to sort the BAM file after writing
        """
        if seed is not None:
            random_seed(seed)

        # Reads are written in generation order, so the header only claims
        # coordinate order when the file is sorted afterwards.
        header = self.create_header("coordinate" if is_sorted else "unsorted")
        with AlignmentFile(output_file, "wb", header=header) as out_bam:
            logger.info("Beginning read simulation...")
            for chrom in self.chromosomes:
                logger.debug(f"Simulating reads for chromosome: {chrom.name}")
                for template in chrom.generate_reads():
                    read = SimulatedRead.from_template(
                        template=template,
                        chrom=chrom.name,
                        reference_genome=self.reference_genome,
                    )
                    out_bam.write(read.to_aligned_segment(header))
            logger.info("Read simulation complete.")

        if is_sorted:
            logger.info(f"Sorting output file: {output_file}")
            # Input and output are the same path; an index is requested too.
            pysam.sort(
                "-o", output_file, output_file, "--write-index", catch_stdout=False
            )

    def create_header(self, sort_order: str = "coordinate") -> AlignmentHeader:
        """Create a SAM/BAM header based on the simulated chromosomes.

        Args:
            sort_order: Value for the ``@HD SO`` field. Defaults to
                ``"coordinate"``, the value this method always used before.

        Returns:
            Header with one ``@SQ`` entry per chromosome and a ``@PG`` entry
            for this package.
        """
        return AlignmentHeader.from_dict(self._build_header_dict(sort_order))

    def _build_header_dict(self, sort_order: str) -> dict:
        """Assemble the plain-dict form of the header for ``create_header``."""
        try:
            tool_version = version("pymethyl2sam")
        except PackageNotFoundError:
            # Package metadata is unavailable, e.g. when run from a checkout.
            tool_version = "unknown"

        return {
            "HD": {"VN": "1.6", "SO": sort_order},
            "SQ": [
                {"SN": chrom.name, "LN": chrom.length} for chrom in self.chromosomes
            ],
            "RG": [],
            "PG": [
                {
                    "ID": "pymethyl2sam",
                    "VN": tool_version,
                    "CL": "pymethyl2sam",
                }
            ],
        }

    @property
    def total_reads(self) -> int:
        """Total number of reads across all chromosomes and regions."""
        return sum(
            region.read_generator.strategy.total_reads
            for chrom in self.chromosomes
            for region in chrom.regions
        )

    @property
    def total_methylation_sites(self) -> int:
        """Total number of methylation sites across all chromosomes."""
        return sum(len(chrom.cpg_sites) for chrom in self.chromosomes)
|
@@ -0,0 +1,67 @@
|
|
1
|
+
from collections import defaultdict
|
2
|
+
from typing import Dict, Any, Set, Tuple
|
3
|
+
|
4
|
+
from pysam import SamtoolsError
|
5
|
+
from pysam.libcalignmentfile import AlignmentFile
|
6
|
+
|
7
|
+
from pymethyl2sam.utils import setup_logging
|
8
|
+
from pymethyl2sam.utils.pysam import get_read_to_reference_mapping, ReadPosition
|
9
|
+
|
10
|
+
logger = setup_logging(__name__)
|
11
|
+
|
12
|
+
|
13
|
+
def get_simulation_summary(bam_path: str) -> Dict[str, Any]:
    """Collect read and methylation statistics from a BAM file.

    File, data-format and BAM errors are logged rather than raised; in that
    case the statistics gathered up to the failure are returned, with
    ``total_methylation_sites`` and ``chromosome_count`` left at 0.

    Args:
        bam_path: Path of the BAM file to summarize.

    Returns:
        Dictionary with ``total_reads`` (all reads, mapped or not),
        ``reads_with_methylation``, ``chromosome_count``,
        ``regions_per_chromosome`` (a ``defaultdict`` counting mapped reads
        per reference name) and ``total_methylation_sites`` (distinct
        ``(chromosome, reference position)`` pairs).
    """
    summary: Dict[str, Any] = {
        "total_reads": 0,
        "reads_with_methylation": 0,
        "chromosome_count": 0,
        "regions_per_chromosome": defaultdict(int),
        "total_methylation_sites": 0,
    }

    seen_sites: Set[Tuple[str, int]] = set()

    logger.info(f"Opening BAM file: {bam_path}")

    try:
        with AlignmentFile(bam_path, "rb") as bamfile:
            seen_chromosomes: Set[str] = set()
            logger.info("Processing reads...")

            for read in bamfile.fetch(until_eof=True):
                summary["total_reads"] += 1

                if read.is_unmapped:
                    continue

                chrom = bamfile.get_reference_name(read.reference_id)
                seen_chromosomes.add(chrom)
                summary["regions_per_chromosome"][chrom] += 1

                if not (read.has_tag("MM") and read.modified_bases):
                    continue

                read2ref = get_read_to_reference_mapping(read)
                summary["reads_with_methylation"] += 1

                # Reported positions on reverse-strand reads are shifted back
                # by one base before the reference lookup.
                shift = 1 if read.is_reverse else 0
                shifted_positions: Set[ReadPosition] = set()
                for calls in read.modified_bases.values():
                    for read_pos, _ in calls:
                        shifted_positions.add(read_pos - shift)

                seen_sites.update(
                    (chrom, read2ref[read_pos])
                    for read_pos in shifted_positions
                    if read_pos in read2ref
                )

            summary["total_methylation_sites"] = len(seen_sites)
            summary["chromosome_count"] = len(seen_chromosomes)
            logger.info("Finished processing BAM file.")

    except (FileNotFoundError, PermissionError) as io_err:
        logger.error(f"File error: {bam_path}", exc_info=io_err)
    except (ValueError, TypeError, KeyError, AttributeError) as data_err:
        logger.error("Data format or logic error while parsing BAM.", exc_info=data_err)
    except (SamtoolsError, EOFError) as bam_err:
        logger.error("BAM file appears corrupted or unreadable.", exc_info=bam_err)

    return summary
|
@@ -0,0 +1,55 @@
|
|
1
|
+
"""Constants used throughout the package."""
|
2
|
+
|
3
|
+
from typing import Dict
|
4
|
+
|
5
|
+
# Translation table for str.translate that maps each DNA base to its
# complement, preserving case. Characters outside "ACGTacgt" (for example "N")
# are not in the table and pass through unchanged.
_ORIG_BASES = "ACGTacgt"
_COMP_BASES = "TGCAtgca"
COMPLEMENT_TABLE: Dict[int, int] = str.maketrans(_ORIG_BASES, _COMP_BASES)

# Fixed instrument, run and flowcell identifiers (presumably the constant part
# of simulated read names; confirm against the read-name generator).
INSTRUMENT = "A00000"
RUN_NUMBER = 1
FLOWCELL_ID = "ABCD1234"

# Strand orientations
STRANDS = ["+", "-"]

# SAM/BAM tag names
MM_TAG = "MM"  # Base modifications (used here for methylation calls)
ML_TAG = "ML"  # Base modification probabilities

# Default values
DEFAULT_COVERAGE = 10.0
DEFAULT_READ_LENGTH = 100
DEFAULT_ERROR_RATE = 0.005
DEFAULT_METHYLATION_RATIO = 0.7
DEFAULT_BASE_QUALITY = 30

# File extensions
SUPPORTED_GENOME_FORMATS = [".fasta", ".fa", ".fna"]
SUPPORTED_TEMPLATE_FORMATS = [".yaml", ".yml", ".json"]
SUPPORTED_CONFIG_FORMATS = [".yaml", ".yml", ".json"]

# Coordinate system
COORDINATE_SYSTEM = "zero-based"
INTERVAL_FORMAT = "half-open"  # [start, end)

# Error types
ERROR_TYPES = ["mismatch", "insertion", "deletion"]

# Quality score ranges
# 93 is the highest Phred score that still maps to a printable ASCII character
# under the +33 offset used for SAM quality strings.
MIN_QUALITY_SCORE = 0
MAX_QUALITY_SCORE = 93

# SAM/BAM flags (bit values of the SAM FLAG field)
SAM_FLAG_PAIRED = 1  # template has multiple segments
SAM_FLAG_PROPER_PAIR = 2  # each segment properly aligned
SAM_FLAG_UNMAP = 4  # segment unmapped
SAM_FLAG_MUNMAP = 8  # next segment unmapped
SAM_FLAG_REVERSE = 16  # sequence is reverse complemented
SAM_FLAG_MREVERSE = 32  # next segment is reverse complemented
SAM_FLAG_READ1 = 64  # first segment in the template
SAM_FLAG_READ2 = 128  # last segment in the template
SAM_FLAG_SECONDARY = 256  # secondary alignment
SAM_FLAG_QCFAIL = 512  # failed quality checks
SAM_FLAG_DUP = 1024  # PCR or optical duplicate
SAM_FLAG_SUPPLEMENTARY = 2048  # supplementary alignment
|
@@ -0,0 +1,60 @@
|
|
1
|
+
"""Logging setup and configuration."""
|
2
|
+
|
3
|
+
import logging
|
4
|
+
import sys
|
5
|
+
from typing import Optional
|
6
|
+
|
7
|
+
|
8
|
+
def setup_logging(name: str, level: str = "INFO") -> logging.Logger:
    """Return a logger for ``name`` that writes to standard output.

    A logger that already has handlers is returned untouched, so its level
    and handlers are not changed by later calls.

    Args:
        name: Logger name (usually __name__)
        level: Name of a ``logging`` level, case-insensitive

    Returns:
        Configured logger
    """
    module_logger = logging.getLogger(name)

    if not module_logger.handlers:
        numeric_level = getattr(logging, level.upper())
        module_logger.setLevel(numeric_level)

        console = logging.StreamHandler(sys.stdout)
        console.setLevel(numeric_level)
        console.setFormatter(
            logging.Formatter(
                "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
                datefmt="%Y-%m-%d %H:%M:%S",
            )
        )
        module_logger.addHandler(console)

    return module_logger
|
42
|
+
|
43
|
+
|
44
|
+
def configure_logging(level: str = "INFO", log_file: Optional[str] = None):
    """Configure the root logger via ``logging.basicConfig``.

    The root logger always gets a standard-output handler. The second handler
    is a file handler when ``log_file`` is given, otherwise a ``NullHandler``.

    Args:
        level: Name of a ``logging`` level, case-insensitive
        log_file: Optional log file path
    """
    root_level = getattr(logging, level.upper())

    console_handler = logging.StreamHandler(sys.stdout)
    if log_file:
        extra_handler = logging.FileHandler(log_file)
    else:
        extra_handler = logging.NullHandler()

    logging.basicConfig(
        level=root_level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        handlers=[console_handler, extra_handler],
    )
|
@@ -0,0 +1,96 @@
|
|
1
|
+
from random import randint
|
2
|
+
from typing import List, Dict
|
3
|
+
from typing import Tuple, Set
|
4
|
+
|
5
|
+
from pysam.libcalignedsegment import AlignedSegment
|
6
|
+
from pysam.libcutils import qualities_to_qualitystring
|
7
|
+
|
8
|
+
from . import COMPLEMENT_TABLE, RUN_NUMBER, INSTRUMENT, FLOWCELL_ID
|
9
|
+
from ..core.genomics import MethylationSite
|
10
|
+
from ..core.sequencing import ReadQuality
|
11
|
+
|
12
|
+
ReadPosition = ReferencePosition = int
|
13
|
+
AlignedPairs = List[Tuple[ReadPosition, ReferencePosition]]
|
14
|
+
|
15
|
+
|
16
|
+
def get_read_to_reference_mapping(read: AlignedSegment):
    """Map each read position of an alignment to its reference position.

    Args:
        read: Aligned segment; its ``get_aligned_pairs(matches_only=False)``
            result supplies the ``(read position, reference position)`` pairs.

    Returns:
        Dictionary from read position to reference position. A position whose
        pair has no reference coordinate maps to ``None``. Reference position
        0 is kept as 0; a truthiness test previously turned it into ``None``.
    """
    pairs: AlignedPairs = read.get_aligned_pairs(matches_only=False)
    return {read_pos: ref_pos for read_pos, ref_pos in pairs}
|
19
|
+
|
20
|
+
|
21
|
+
def make_sam_dict(
    chrome: str,
    sequence: str,
    start: int,
    is_reverse: bool,
    quality: ReadQuality,
) -> Dict[str, str]:
    """Build the string-valued record for one simulated alignment.

    Args:
        chrome: Reference name stored under ``"ref_name"``.
        sequence: Read bases; its length also sets the ``<length>M`` CIGAR.
        start: Start coordinate; ``start + 1`` is stored under ``"ref_pos"``.
        is_reverse: Selects flag ``16`` when true, otherwise ``0``.
        quality: Supplies ``sequencing_score`` (applied to every base) and
            ``mapping_score``.

    Returns:
        Dictionary of string fields describing the alignment, with a freshly
        generated read name and no mate information.
    """
    name = _generate_read_name()
    n_bases = len(sequence)
    per_base_scores = [quality.sequencing_score] * n_bases
    quality_string = qualities_to_qualitystring(per_base_scores)

    record = {"name": name}
    record["flag"] = str(16 if is_reverse else 0)
    record["ref_name"] = chrome
    record["ref_pos"] = str(start + 1)
    record["next_ref_name"] = "*"
    record["next_ref_pos"] = "0"
    record["map_quality"] = str(quality.mapping_score)
    record["length"] = "0"
    record["cigar"] = f"{n_bases}M"
    record["seq"] = sequence
    record["qual"] = quality_string
    return record
|
46
|
+
|
47
|
+
|
48
|
+
def _generate_read_name() -> str:
    """Return a colon-separated read name with random lane, tile and x/y fields.

    The instrument, run number (zero-padded to 3 digits) and flowcell come
    from module constants. Lane is drawn from 1-4, tile from 1101-1200 and
    each coordinate from 0-3000; tile and coordinates are zero-padded to 4
    digits.
    """
    # Draw order (lane, tile, x, y) is kept so seeded runs give the same names.
    lane = randint(1, 4)
    tile = randint(1101, 1200)
    x_pos = randint(0, 3000)
    y_pos = randint(0, 3000)

    name_fields = (
        f"{INSTRUMENT}",
        f"{RUN_NUMBER:03d}",
        f"{FLOWCELL_ID}",
        f"{lane}",
        f"{tile:04d}",
        f"{x_pos:04d}",
        f"{y_pos:04d}",
    )
    return ":".join(name_fields)
|
63
|
+
|
64
|
+
|
65
|
+
def generate_mm_tag(
    fw_sequence: str, methylated_sites: Set[MethylationSite], is_reverse_strand: bool
) -> str:
    """
    Generates a SAM MM tag for methylation.

    Cytosines are counted along the forward sequence, or along its reverse
    complement for reverse-strand reads. Each methylated cytosine is encoded
    as the number of unmethylated cytosines skipped since the previous one.

    Args:
        fw_sequence: Read sequence in forward orientation.
        methylated_sites: Sites to encode; each supplies
            ``get_cytosine_position(is_reverse_strand)`` as an index into
            ``fw_sequence``.
        is_reverse_strand: Whether the read is on the reverse strand. Selects
            the ``-`` strand marker and the reverse-complement coordinates.

    Returns:
        A tag such as ``"C+m,0,1;"``. When no site lands on a cytosine the
        result is ``"C+m;"`` (or ``"C-m;"``) with no offset list.
    """
    target_base = "C"
    strand_tag = "-" if is_reverse_strand else "+"

    if is_reverse_strand:
        sequence = fw_sequence.translate(COMPLEMENT_TABLE)[::-1]
    else:
        sequence = fw_sequence

    # Express every cytosine position in the coordinates of `sequence`;
    # reverse complementing mirrors indices around the read centre.
    last_index = len(sequence) - 1
    methylated_positions = set()
    for site in methylated_sites:
        cytosine_pos = site.get_cytosine_position(is_reverse_strand)
        methylated_positions.add(
            last_index - cytosine_pos if is_reverse_strand else cytosine_pos
        )

    offsets = []
    last_methyl_idx = -1
    candidate_idx = 0  # ordinal of the current cytosine among all cytosines
    for pos, base in enumerate(sequence):
        if base != target_base:
            continue
        if pos in methylated_positions:
            offsets.append(candidate_idx - last_methyl_idx - 1)
            last_methyl_idx = candidate_idx
        candidate_idx += 1

    if not offsets:
        # The MM grammar allows an empty offset list but not a dangling comma.
        return f"{target_base}{strand_tag}m;"
    return f"{target_base}{strand_tag}m,{','.join(map(str, offsets))};"
|