smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +43 -13
- smftools/_settings.py +6 -6
- smftools/_version.py +3 -1
- smftools/cli/__init__.py +1 -0
- smftools/cli/archived/cli_flows.py +2 -0
- smftools/cli/helpers.py +9 -1
- smftools/cli/hmm_adata.py +905 -242
- smftools/cli/load_adata.py +432 -280
- smftools/cli/preprocess_adata.py +287 -171
- smftools/cli/spatial_adata.py +141 -53
- smftools/cli_entry.py +119 -178
- smftools/config/__init__.py +3 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +26 -18
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +511 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +4 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2133 -1428
- smftools/hmm/__init__.py +24 -14
- smftools/hmm/archived/apply_hmm_batched.py +2 -0
- smftools/hmm/archived/calculate_distances.py +2 -0
- smftools/hmm/archived/call_hmm_peaks.py +18 -1
- smftools/hmm/archived/train_hmm.py +2 -0
- smftools/hmm/call_hmm_peaks.py +176 -193
- smftools/hmm/display_hmm.py +23 -7
- smftools/hmm/hmm_readwrite.py +20 -6
- smftools/hmm/nucleosome_hmm_refinement.py +104 -14
- smftools/informatics/__init__.py +55 -13
- smftools/informatics/archived/bam_conversion.py +2 -0
- smftools/informatics/archived/bam_direct.py +2 -0
- smftools/informatics/archived/basecall_pod5s.py +2 -0
- smftools/informatics/archived/basecalls_to_adata.py +2 -0
- smftools/informatics/archived/conversion_smf.py +2 -0
- smftools/informatics/archived/deaminase_smf.py +1 -0
- smftools/informatics/archived/direct_smf.py +2 -0
- smftools/informatics/archived/fast5_to_pod5.py +2 -0
- smftools/informatics/archived/helpers/archived/__init__.py +2 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
- smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
- smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
- smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
- smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
- smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
- smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
- smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
- smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
- smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
- smftools/informatics/archived/helpers/archived/informatics.py +2 -0
- smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
- smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
- smftools/informatics/archived/helpers/archived/modQC.py +2 -0
- smftools/informatics/archived/helpers/archived/modcall.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
- smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
- smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
- smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
- smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
- smftools/informatics/archived/print_bam_query_seq.py +9 -1
- smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
- smftools/informatics/archived/subsample_pod5.py +2 -0
- smftools/informatics/bam_functions.py +1059 -269
- smftools/informatics/basecalling.py +53 -9
- smftools/informatics/bed_functions.py +357 -114
- smftools/informatics/binarize_converted_base_identities.py +21 -7
- smftools/informatics/complement_base_list.py +9 -6
- smftools/informatics/converted_BAM_to_adata.py +324 -137
- smftools/informatics/fasta_functions.py +251 -89
- smftools/informatics/h5ad_functions.py +202 -30
- smftools/informatics/modkit_extract_to_adata.py +623 -274
- smftools/informatics/modkit_functions.py +87 -44
- smftools/informatics/ohe.py +46 -21
- smftools/informatics/pod5_functions.py +114 -74
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +23 -12
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +157 -50
- smftools/machine_learning/data/preprocessing.py +4 -1
- smftools/machine_learning/evaluation/__init__.py +3 -1
- smftools/machine_learning/evaluation/eval_utils.py +13 -14
- smftools/machine_learning/evaluation/evaluators.py +52 -34
- smftools/machine_learning/inference/__init__.py +3 -1
- smftools/machine_learning/inference/inference_utils.py +9 -4
- smftools/machine_learning/inference/lightning_inference.py +14 -13
- smftools/machine_learning/inference/sklearn_inference.py +8 -8
- smftools/machine_learning/inference/sliding_window_inference.py +37 -25
- smftools/machine_learning/models/__init__.py +12 -5
- smftools/machine_learning/models/base.py +34 -43
- smftools/machine_learning/models/cnn.py +22 -13
- smftools/machine_learning/models/lightning_base.py +78 -42
- smftools/machine_learning/models/mlp.py +18 -5
- smftools/machine_learning/models/positional.py +10 -4
- smftools/machine_learning/models/rnn.py +8 -3
- smftools/machine_learning/models/sklearn_models.py +46 -24
- smftools/machine_learning/models/transformer.py +75 -55
- smftools/machine_learning/models/wrappers.py +8 -3
- smftools/machine_learning/training/__init__.py +4 -2
- smftools/machine_learning/training/train_lightning_model.py +42 -23
- smftools/machine_learning/training/train_sklearn_model.py +11 -15
- smftools/machine_learning/utils/__init__.py +3 -1
- smftools/machine_learning/utils/device.py +12 -5
- smftools/machine_learning/utils/grl.py +8 -2
- smftools/metadata.py +443 -0
- smftools/optional_imports.py +31 -0
- smftools/plotting/__init__.py +32 -17
- smftools/plotting/autocorrelation_plotting.py +153 -48
- smftools/plotting/classifiers.py +175 -73
- smftools/plotting/general_plotting.py +350 -168
- smftools/plotting/hmm_plotting.py +53 -14
- smftools/plotting/position_stats.py +155 -87
- smftools/plotting/qc_plotting.py +25 -12
- smftools/preprocessing/__init__.py +35 -37
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
- smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
- smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
- smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +18 -11
- smftools/preprocessing/calculate_complexity_II.py +89 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +4 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
- smftools/preprocessing/calculate_position_Youden.py +110 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
- smftools/preprocessing/flag_duplicate_reads.py +708 -303
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +9 -3
- smftools/preprocessing/min_non_diagonal.py +4 -1
- smftools/preprocessing/recipes.py +58 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +25 -18
- smftools/tools/archived/apply_hmm.py +2 -0
- smftools/tools/archived/classifiers.py +165 -0
- smftools/tools/archived/classify_methylated_features.py +2 -0
- smftools/tools/archived/classify_non_methylated_features.py +2 -0
- smftools/tools/archived/subset_adata_v1.py +12 -1
- smftools/tools/archived/subset_adata_v2.py +14 -1
- smftools/tools/calculate_umap.py +56 -15
- smftools/tools/cluster_adata_on_methylation.py +122 -47
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +220 -99
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- smftools-0.3.0.dist-info/METADATA +147 -0
- smftools-0.3.0.dist-info/RECORD +182 -0
- smftools-0.2.4.dist-info/METADATA +0 -141
- smftools-0.2.4.dist-info/RECORD +0 -176
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/fasta_functions.py +251 -89 (rendered hunks; removed lines truncated by the diff renderer are marked with …)

@@ -1,36 +1,127 @@
-from …
+from __future__ import annotations
 
-import …
+import gzip
+import shutil
 import subprocess
+from concurrent.futures import ProcessPoolExecutor
+from importlib.util import find_spec
 from pathlib import Path
-
-from typing import Union, List, Dict, Tuple
+from typing import TYPE_CHECKING, Dict, Iterable, Tuple
 
 import numpy as np
-import gzip
-
 from Bio import SeqIO
-from Bio.SeqRecord import SeqRecord
 from Bio.Seq import Seq
-from …
-import pysam
+from Bio.SeqRecord import SeqRecord
 
-from …
-from …
+from smftools.logging_utils import get_logger
+from smftools.optional_imports import require
+
+from ..readwrite import time_string
+
+logger = get_logger(__name__)
+
+if TYPE_CHECKING:
+    import pysam as pysam_module
+
+
+def _require_pysam() -> "pysam_module":
+    if pysam_types is not None:
+        return pysam_types
+    return require("pysam", extra="pysam", purpose="FASTA access")
+
+
+pysam_types = None
+if find_spec("pysam") is not None:
+    pysam_types = require("pysam", extra="pysam", purpose="FASTA access")
+
+
+def _resolve_fasta_backend() -> str:
+    """Resolve the backend to use for FASTA access."""
+    if pysam_types is not None:
+        return "python"
+    if shutil is not None and shutil.which("samtools"):
+        return "cli"
+    raise RuntimeError("FASTA access requires pysam or samtools in PATH.")
 
-
-
+
+def _ensure_fasta_index(fasta: Path) -> None:
+    fai = fasta.with_suffix(fasta.suffix + ".fai")
+    if fai.exists():
+        return
+    if subprocess is None or shutil is None or not shutil.which("samtools"):
+        pysam_mod = _require_pysam()
+        pysam_mod.faidx(str(fasta))
+        return
+    cp = subprocess.run(
+        ["samtools", "faidx", str(fasta)],
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.PIPE,
+        text=True,
+    )
+    if cp.returncode != 0:
+        raise RuntimeError(f"samtools faidx failed (exit {cp.returncode}):\n{cp.stderr}")
+
+
+def _bed_to_faidx_region(chrom: str, start: int, end: int) -> str:
+    """Convert 0-based half-open BED coords to samtools faidx region."""
+    start1 = start + 1
+    end1 = end
+    if start1 > end1:
+        start1, end1 = end1, start1
+    return f"{chrom}:{start1}-{end1}"
+
+
+def _fetch_sequence_with_samtools(fasta: Path, chrom: str, start: int, end: int) -> str:
+    if subprocess is None or shutil is None:
+        raise RuntimeError("samtools backend is unavailable.")
+    if not shutil.which("samtools"):
+        raise RuntimeError("samtools is required but not available in PATH.")
+    region = _bed_to_faidx_region(chrom, start, end)
+    cp = subprocess.run(
+        ["samtools", "faidx", str(fasta), region],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+    )
+    if cp.returncode != 0:
+        raise RuntimeError(f"samtools faidx failed (exit {cp.returncode}):\n{cp.stderr}")
+    lines = [line.strip() for line in cp.stdout.splitlines() if line and not line.startswith(">")]
+    return "".join(lines)
+
+
+def _convert_FASTA_record(
+    record: SeqRecord,
+    modification_type: str,
+    strand: str,
+    unconverted: str,
+) -> SeqRecord:
+    """Convert a FASTA record based on modification type and strand.
+
+    Args:
+        record: Input FASTA record.
+        modification_type: Modification type (e.g., ``5mC`` or ``6mA``).
+        strand: Strand label (``top`` or ``bottom``).
+        unconverted: Label for the unconverted record type.
+
+    Returns:
+        Bio.SeqRecord.SeqRecord: Converted FASTA record.
+
+    Raises:
+        ValueError: If the modification type/strand combination is invalid.
+    """
     conversion_maps = {
-        (…
-        (…
-        (…
-        (…
+        ("5mC", "top"): ("C", "T"),
+        ("5mC", "bottom"): ("G", "A"),
+        ("6mA", "top"): ("A", "G"),
+        ("6mA", "bottom"): ("T", "C"),
     }
 
     sequence = str(record.seq).upper()
 
     if modification_type == unconverted:
-        return SeqRecord(
+        return SeqRecord(
+            Seq(sequence), id=f"{record.id}_{modification_type}_top", description=record.description
+        )
 
     if (modification_type, strand) not in conversion_maps:
         raise ValueError(f"Invalid combination: {modification_type}, {strand}")
@@ -38,62 +129,80 @@ def _convert_FASTA_record(record, modification_type, strand, unconverted):
     original_base, converted_base = conversion_maps[(modification_type, strand)]
     new_seq = sequence.replace(original_base, converted_base)
 
-    return SeqRecord(
+    return SeqRecord(
+        Seq(new_seq), id=f"{record.id}_{modification_type}_{strand}", description=record.description
+    )
+
+
+def _process_fasta_record(
+    args: tuple[SeqRecord, Iterable[str], Iterable[str], str],
+) -> list[SeqRecord]:
+    """Process a single FASTA record for parallel conversion.
 
-def _process_fasta_record(args):
-    """
-    Processes a single FASTA record for parallel execution.
     Args:
-        args …
+        args: Tuple containing ``(record, modification_types, strands, unconverted)``.
+
     Returns:
-        list …
+        list[Bio.SeqRecord.SeqRecord]: Converted FASTA records.
     """
     record, modification_types, strands, unconverted = args
     modified_records = []
-
+
     for modification_type in modification_types:
         for i, strand in enumerate(strands):
             if i > 0 and modification_type == unconverted:
                 continue  # Ensure unconverted is added only once
 
-            modified_records.append(
+            modified_records.append(
+                _convert_FASTA_record(record, modification_type, strand, unconverted)
+            )
 
     return modified_records
 
-def generate_converted_FASTA(input_fasta, modification_types, strands, output_fasta, num_threads=4, chunk_size=500):
-    """
-    Converts an input FASTA file and writes a new converted FASTA file efficiently.
 
-
-
-
-
-
-
-
+def generate_converted_FASTA(
+    input_fasta: str | Path,
+    modification_types: list[str],
+    strands: list[str],
+    output_fasta: str | Path,
+    num_threads: int = 4,
+    chunk_size: int = 500,
+) -> None:
+    """Convert a FASTA file and write converted records to disk.
 
-
-
+    Args:
+        input_fasta: Path to the unconverted FASTA file.
+        modification_types: List of modification types (``5mC``, ``6mA``, or unconverted).
+        strands: List of strands (``top``, ``bottom``).
+        output_fasta: Path to the converted FASTA output file.
+        num_threads: Number of parallel workers to use.
+        chunk_size: Number of records to process per write batch.
     """
     unconverted = modification_types[0]
     input_fasta = str(input_fasta)
     output_fasta = str(output_fasta)
 
     # Detect if input is gzipped
-    open_func = gzip.open if input_fasta.endswith(…
-    file_mode = …
+    open_func = gzip.open if input_fasta.endswith(".gz") else open
+    file_mode = "rt" if input_fasta.endswith(".gz") else "r"
 
     def _fasta_record_generator():
-        """
+        """Lazily yields FASTA records from file."""
         with open_func(input_fasta, file_mode) as handle:
-            for record in SeqIO.parse(handle, …
+            for record in SeqIO.parse(handle, "fasta"):
                 yield record
 
-    with …
+    with (
+        open(output_fasta, "w") as output_handle,
+        ProcessPoolExecutor(max_workers=num_threads) as executor,
+    ):
         # Process records in parallel using a named function (avoiding lambda)
         results = executor.map(
             _process_fasta_record,
-            (…
+            (
+                (record, modification_types, strands, unconverted)
+                for record in _fasta_record_generator()
+            ),
         )
 
         buffer = []
@@ -102,16 +211,26 @@ def generate_converted_FASTA(input_fasta, modification_types, strands, output_fa
 
             # Write out in chunks to save memory
             if len(buffer) >= chunk_size:
-                SeqIO.write(buffer, output_handle, …
+                SeqIO.write(buffer, output_handle, "fasta")
                 buffer.clear()
 
         # Write any remaining records
         if buffer:
-            SeqIO.write(buffer, output_handle, …
+            SeqIO.write(buffer, output_handle, "fasta")
+
 
 def index_fasta(fasta: str | Path, write_chrom_sizes: bool = True) -> Path:
+    """Index a FASTA file and optionally write chromosome sizes.
+
+    Args:
+        fasta: Path to the FASTA file.
+        write_chrom_sizes: Whether to write a ``.chrom.sizes`` file.
+
+    Returns:
+        Path: Path to the index file or chromosome sizes file.
+    """
     fasta = Path(fasta)
-
+    _require_pysam().faidx(str(fasta))  # creates <fasta>.fai
 
     fai = fasta.with_suffix(fasta.suffix + ".fai")
     if write_chrom_sizes:
@@ -123,9 +242,15 @@ def index_fasta(fasta: str | Path, write_chrom_sizes: bool = True) -> Path:
         return chrom_sizes
     return fai
 
+
 def get_chromosome_lengths(fasta: str | Path) -> Path:
-    """
-
+    """Create or reuse ``<fasta>.chrom.sizes`` derived from the FASTA index.
+
+    Args:
+        fasta: Path to the FASTA file.
+
+    Returns:
+        Path: Path to the chromosome sizes file.
     """
     fasta = Path(fasta)
     fai = fasta.with_suffix(fasta.suffix + ".fai")
@@ -133,7 +258,7 @@ def get_chromosome_lengths(fasta: str | Path) -> Path:
         index_fasta(fasta, write_chrom_sizes=True)  # will also create .chrom.sizes
     chrom_sizes = fasta.with_suffix(".chrom.sizes")
     if chrom_sizes.exists():
-
+        logger.debug(f"Using existing chrom length file: {chrom_sizes}")
         return chrom_sizes
 
     # Build chrom.sizes from .fai
@@ -143,10 +268,15 @@ def get_chromosome_lengths(fasta: str | Path) -> Path:
             out.write(f"{chrom}\t{size}\n")
     return chrom_sizes
 
+
 def get_native_references(fasta_file: str | Path) -> Dict[str, Tuple[int, str]]:
-    """
-
-
+    """Return record lengths and sequences from a FASTA file.
+
+    Args:
+        fasta_file: Path to the FASTA file.
+
+    Returns:
+        dict[str, tuple[int, str]]: Mapping of record ID to ``(length, sequence)``.
     """
    fasta_file = Path(fasta_file)
    print(f"{time_string()}: Opening FASTA file {fasta_file}")
@@ -157,28 +287,35 @@ def get_native_references(fasta_file: str | Path) -> Dict[str, Tuple[int, str]]:
         record_dict[rec.id] = (len(seq), seq)
     return record_dict
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+def find_conversion_sites(
+    fasta_file: str | Path,
+    modification_type: str,
+    conversions: list[str],
+    deaminase_footprinting: bool = False,
+) -> dict[str, list]:
+    """Find genomic coordinates of modified bases in a reference FASTA.
+
+    Args:
+        fasta_file: Path to the converted reference FASTA.
+        modification_type: Modification type (``5mC``, ``6mA``, or ``unconverted``).
+        conversions: List of conversion types (first entry is the unconverted record type).
+        deaminase_footprinting: Whether the footprinting used direct deamination chemistry.
+
+    Returns:
+        dict[str, list]: Mapping of record name to
+        ``[sequence length, top strand coordinates, bottom strand coordinates, sequence, complement]``.
+
+    Raises:
+        ValueError: If the modification type is invalid.
     """
     unconverted = conversions[0]
     record_dict = {}
 
     # Define base mapping based on modification type
     base_mappings = {
-
-
+        "5mC": ("C", "G"),  # Cytosine and Guanine
+        "6mA": ("A", "T"),  # Adenine and Thymine
     }
 
     # Read FASTA file and process records
@@ -200,22 +337,35 @@ def find_conversion_sites(fasta_file, modification_type, conversions, deaminase_
             top_strand_coordinates = np.where(seq_array == top_base)[0].tolist()
             bottom_strand_coordinates = np.where(seq_array == bottom_base)[0].tolist()
 
-            record_dict[record.id] = [
+            record_dict[record.id] = [
+                sequence_length,
+                top_strand_coordinates,
+                bottom_strand_coordinates,
+                sequence,
+                complement,
+            ]
 
         else:
-            raise ValueError(
+            raise ValueError(
+                f"Invalid modification_type: {modification_type}. Choose '5mC', '6mA', or 'unconverted'."
+            )
 
     return record_dict
 
+
 def subsample_fasta_from_bed(
     input_FASTA: str | Path,
     input_bed: str | Path,
     output_directory: str | Path,
-    output_FASTA: str | Path
+    output_FASTA: str | Path,
 ) -> None:
-    """
-
-
+    """Subsample a FASTA using BED coordinates.
+
+    Args:
+        input_FASTA: Genome-wide FASTA path.
+        input_bed: BED file path containing coordinate windows of interest.
+        output_directory: Directory to write the subsampled FASTA.
+        output_FASTA: Output FASTA path.
     """
 
     # Normalize everything to Path
@@ -227,29 +377,41 @@ def subsample_fasta_from_bed(
     # Ensure output directory exists
     output_directory.mkdir(parents=True, exist_ok=True)
 
-
+    backend = _resolve_fasta_backend()
+    _ensure_fasta_index(input_FASTA)
 
-
-
+    fasta_handle = None
+    if backend == "python":
+        pysam_mod = _require_pysam()
+        fasta_handle = pysam_mod.FastaFile(str(input_FASTA))
 
     # Open BED + output FASTA
-    with input_bed.open("r") as bed, …
+    with input_bed.open("r") as bed, output_FASTA.open("w") as out_fasta:
         for line in bed:
             fields = line.strip().split()
             chrom = fields[0]
-            start = int(fields[1])
-            end …
-            desc …
-
-            if …
-
+            start = int(fields[1])  # BED is 0-based
+            end = int(fields[2])  # BED is 0-based and end is exclusive
+            desc = " ".join(fields[3:]) if len(fields) > 3 else ""
+
+            if backend == "python":
+                assert fasta_handle is not None
+                if chrom not in fasta_handle.references:
+                    logger.warning(f"{chrom} not found in FASTA")
+                    continue
+                sequence = fasta_handle.fetch(chrom, start, end)
+            else:
+                sequence = _fetch_sequence_with_samtools(input_FASTA, chrom, start, end)
+
+            if not sequence:
+                logger.warning(f"{chrom} not found in FASTA")
                 continue
 
-            # pyfaidx is 1-based indexing internally, but [start:end] works with BED coords
-            sequence = fasta[chrom][start:end].seq
-
             header = f">{chrom}:{start}-{end}"
             if desc:
                 header += f" {desc}"
 
-            out_fasta.write(f"{header}\n{sequence}\n")
+            out_fasta.write(f"{header}\n{sequence}\n")
+
+    if fasta_handle is not None:
+        fasta_handle.close()
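
For orientation, here is a minimal usage sketch of the reworked FASTA helpers, assembled only from the signatures and docstrings visible in the hunks above. The file paths are hypothetical, the import assumes the functions are used directly from `smftools.informatics.fasta_functions` (whether they are re-exported elsewhere is not visible in this diff), and `index_fasta` needs `pysam` installed because it calls `_require_pysam()`.

```python
from smftools.informatics.fasta_functions import (
    find_conversion_sites,
    generate_converted_FASTA,
    get_chromosome_lengths,
    index_fasta,
    subsample_fasta_from_bed,
)

# The first entry of modification_types / conversions is treated as the
# unconverted record label (see `unconverted = modification_types[0]`).
modification_types = ["unconverted", "5mC", "6mA"]
strands = ["top", "bottom"]

# Write a converted reference with one record per (modification type, strand).
generate_converted_FASTA(
    "reference.fa",             # hypothetical input FASTA
    modification_types,
    strands,
    "reference_converted.fa",   # hypothetical output FASTA
    num_threads=4,
    chunk_size=500,
)

# Index the converted reference (pysam required) and derive <fasta>.chrom.sizes.
index_fasta("reference_converted.fa", write_chrom_sizes=True)
chrom_sizes = get_chromosome_lengths("reference_converted.fa")

# Per-record modified-base coordinates:
# record id -> [length, top-strand coords, bottom-strand coords, sequence, complement]
sites = find_conversion_sites(
    "reference_converted.fa",
    modification_type="5mC",
    conversions=modification_types,
)

# Extract BED-defined windows from a genome-wide FASTA. This entry point
# resolves a backend and can run with either pysam or the samtools CLI.
subsample_fasta_from_bed(
    "genome.fa",
    "regions_of_interest.bed",
    "subsampled",
    "subsampled/regions_of_interest.fa",
)
```

Of the helpers shown above, only `subsample_fasta_from_bed` goes through `_resolve_fasta_backend()` and can fall back to the samtools CLI when `pysam` is absent; `index_fasta` still requires `pysam` via `_require_pysam()`.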