smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. smftools/__init__.py +43 -13
  2. smftools/_settings.py +6 -6
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +9 -1
  7. smftools/cli/hmm_adata.py +905 -242
  8. smftools/cli/load_adata.py +432 -280
  9. smftools/cli/preprocess_adata.py +287 -171
  10. smftools/cli/spatial_adata.py +141 -53
  11. smftools/cli_entry.py +119 -178
  12. smftools/config/__init__.py +3 -1
  13. smftools/config/conversion.yaml +5 -1
  14. smftools/config/deaminase.yaml +1 -1
  15. smftools/config/default.yaml +26 -18
  16. smftools/config/direct.yaml +8 -3
  17. smftools/config/discover_input_files.py +19 -5
  18. smftools/config/experiment_config.py +511 -276
  19. smftools/constants.py +37 -0
  20. smftools/datasets/__init__.py +4 -8
  21. smftools/datasets/datasets.py +32 -18
  22. smftools/hmm/HMM.py +2133 -1428
  23. smftools/hmm/__init__.py +24 -14
  24. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  25. smftools/hmm/archived/calculate_distances.py +2 -0
  26. smftools/hmm/archived/call_hmm_peaks.py +18 -1
  27. smftools/hmm/archived/train_hmm.py +2 -0
  28. smftools/hmm/call_hmm_peaks.py +176 -193
  29. smftools/hmm/display_hmm.py +23 -7
  30. smftools/hmm/hmm_readwrite.py +20 -6
  31. smftools/hmm/nucleosome_hmm_refinement.py +104 -14
  32. smftools/informatics/__init__.py +55 -13
  33. smftools/informatics/archived/bam_conversion.py +2 -0
  34. smftools/informatics/archived/bam_direct.py +2 -0
  35. smftools/informatics/archived/basecall_pod5s.py +2 -0
  36. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  37. smftools/informatics/archived/conversion_smf.py +2 -0
  38. smftools/informatics/archived/deaminase_smf.py +1 -0
  39. smftools/informatics/archived/direct_smf.py +2 -0
  40. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  41. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  42. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
  43. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  44. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  45. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  46. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  47. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  48. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  49. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  50. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  52. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  53. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  54. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  55. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  56. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  57. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  58. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  59. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  60. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  61. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  62. smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
  63. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  64. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  65. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  66. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  67. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  68. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  69. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  70. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
  71. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  72. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  73. smftools/informatics/archived/print_bam_query_seq.py +9 -1
  74. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  75. smftools/informatics/archived/subsample_pod5.py +2 -0
  76. smftools/informatics/bam_functions.py +1059 -269
  77. smftools/informatics/basecalling.py +53 -9
  78. smftools/informatics/bed_functions.py +357 -114
  79. smftools/informatics/binarize_converted_base_identities.py +21 -7
  80. smftools/informatics/complement_base_list.py +9 -6
  81. smftools/informatics/converted_BAM_to_adata.py +324 -137
  82. smftools/informatics/fasta_functions.py +251 -89
  83. smftools/informatics/h5ad_functions.py +202 -30
  84. smftools/informatics/modkit_extract_to_adata.py +623 -274
  85. smftools/informatics/modkit_functions.py +87 -44
  86. smftools/informatics/ohe.py +46 -21
  87. smftools/informatics/pod5_functions.py +114 -74
  88. smftools/informatics/run_multiqc.py +20 -14
  89. smftools/logging_utils.py +51 -0
  90. smftools/machine_learning/__init__.py +23 -12
  91. smftools/machine_learning/data/__init__.py +2 -0
  92. smftools/machine_learning/data/anndata_data_module.py +157 -50
  93. smftools/machine_learning/data/preprocessing.py +4 -1
  94. smftools/machine_learning/evaluation/__init__.py +3 -1
  95. smftools/machine_learning/evaluation/eval_utils.py +13 -14
  96. smftools/machine_learning/evaluation/evaluators.py +52 -34
  97. smftools/machine_learning/inference/__init__.py +3 -1
  98. smftools/machine_learning/inference/inference_utils.py +9 -4
  99. smftools/machine_learning/inference/lightning_inference.py +14 -13
  100. smftools/machine_learning/inference/sklearn_inference.py +8 -8
  101. smftools/machine_learning/inference/sliding_window_inference.py +37 -25
  102. smftools/machine_learning/models/__init__.py +12 -5
  103. smftools/machine_learning/models/base.py +34 -43
  104. smftools/machine_learning/models/cnn.py +22 -13
  105. smftools/machine_learning/models/lightning_base.py +78 -42
  106. smftools/machine_learning/models/mlp.py +18 -5
  107. smftools/machine_learning/models/positional.py +10 -4
  108. smftools/machine_learning/models/rnn.py +8 -3
  109. smftools/machine_learning/models/sklearn_models.py +46 -24
  110. smftools/machine_learning/models/transformer.py +75 -55
  111. smftools/machine_learning/models/wrappers.py +8 -3
  112. smftools/machine_learning/training/__init__.py +4 -2
  113. smftools/machine_learning/training/train_lightning_model.py +42 -23
  114. smftools/machine_learning/training/train_sklearn_model.py +11 -15
  115. smftools/machine_learning/utils/__init__.py +3 -1
  116. smftools/machine_learning/utils/device.py +12 -5
  117. smftools/machine_learning/utils/grl.py +8 -2
  118. smftools/metadata.py +443 -0
  119. smftools/optional_imports.py +31 -0
  120. smftools/plotting/__init__.py +32 -17
  121. smftools/plotting/autocorrelation_plotting.py +153 -48
  122. smftools/plotting/classifiers.py +175 -73
  123. smftools/plotting/general_plotting.py +350 -168
  124. smftools/plotting/hmm_plotting.py +53 -14
  125. smftools/plotting/position_stats.py +155 -87
  126. smftools/plotting/qc_plotting.py +25 -12
  127. smftools/preprocessing/__init__.py +35 -37
  128. smftools/preprocessing/append_base_context.py +105 -79
  129. smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
  130. smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
  131. smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
  132. smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
  133. smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
  134. smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
  135. smftools/preprocessing/binarize.py +21 -4
  136. smftools/preprocessing/binarize_on_Youden.py +127 -31
  137. smftools/preprocessing/binary_layers_to_ohe.py +18 -11
  138. smftools/preprocessing/calculate_complexity_II.py +89 -59
  139. smftools/preprocessing/calculate_consensus.py +28 -19
  140. smftools/preprocessing/calculate_coverage.py +44 -22
  141. smftools/preprocessing/calculate_pairwise_differences.py +4 -1
  142. smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
  143. smftools/preprocessing/calculate_position_Youden.py +110 -55
  144. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  145. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  146. smftools/preprocessing/clean_NaN.py +38 -28
  147. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  148. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
  149. smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
  150. smftools/preprocessing/flag_duplicate_reads.py +708 -303
  151. smftools/preprocessing/invert_adata.py +26 -11
  152. smftools/preprocessing/load_sample_sheet.py +40 -22
  153. smftools/preprocessing/make_dirs.py +9 -3
  154. smftools/preprocessing/min_non_diagonal.py +4 -1
  155. smftools/preprocessing/recipes.py +58 -23
  156. smftools/preprocessing/reindex_references_adata.py +93 -27
  157. smftools/preprocessing/subsample_adata.py +33 -16
  158. smftools/readwrite.py +264 -109
  159. smftools/schema/__init__.py +11 -0
  160. smftools/schema/anndata_schema_v1.yaml +227 -0
  161. smftools/tools/__init__.py +25 -18
  162. smftools/tools/archived/apply_hmm.py +2 -0
  163. smftools/tools/archived/classifiers.py +165 -0
  164. smftools/tools/archived/classify_methylated_features.py +2 -0
  165. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  166. smftools/tools/archived/subset_adata_v1.py +12 -1
  167. smftools/tools/archived/subset_adata_v2.py +14 -1
  168. smftools/tools/calculate_umap.py +56 -15
  169. smftools/tools/cluster_adata_on_methylation.py +122 -47
  170. smftools/tools/general_tools.py +70 -25
  171. smftools/tools/position_stats.py +220 -99
  172. smftools/tools/read_stats.py +50 -29
  173. smftools/tools/spatial_autocorrelation.py +365 -192
  174. smftools/tools/subset_adata.py +23 -21
  175. smftools-0.3.0.dist-info/METADATA +147 -0
  176. smftools-0.3.0.dist-info/RECORD +182 -0
  177. smftools-0.2.4.dist-info/METADATA +0 -141
  178. smftools-0.2.4.dist-info/RECORD +0 -176
  179. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
  180. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
  181. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,36 +1,127 @@
1
- from ..readwrite import make_dirs, time_string
1
+ from __future__ import annotations
2
2
 
3
- import os
3
+ import gzip
4
+ import shutil
4
5
  import subprocess
6
+ from concurrent.futures import ProcessPoolExecutor
7
+ from importlib.util import find_spec
5
8
  from pathlib import Path
6
-
7
- from typing import Union, List, Dict, Tuple
9
+ from typing import TYPE_CHECKING, Dict, Iterable, Tuple
8
10
 
9
11
  import numpy as np
10
- import gzip
11
-
12
12
  from Bio import SeqIO
13
- from Bio.SeqRecord import SeqRecord
14
13
  from Bio.Seq import Seq
15
- from pyfaidx import Fasta
16
- import pysam
14
+ from Bio.SeqRecord import SeqRecord
17
15
 
18
- from concurrent.futures import ProcessPoolExecutor
19
- from itertools import chain
16
+ from smftools.logging_utils import get_logger
17
+ from smftools.optional_imports import require
18
+
19
+ from ..readwrite import time_string
20
+
21
+ logger = get_logger(__name__)
22
+
23
+ if TYPE_CHECKING:
24
+ import pysam as pysam_module
25
+
26
+
27
+ def _require_pysam() -> "pysam_module":
28
+ if pysam_types is not None:
29
+ return pysam_types
30
+ return require("pysam", extra="pysam", purpose="FASTA access")
31
+
32
+
33
+ pysam_types = None
34
+ if find_spec("pysam") is not None:
35
+ pysam_types = require("pysam", extra="pysam", purpose="FASTA access")
36
+
37
+
38
+ def _resolve_fasta_backend() -> str:
39
+ """Resolve the backend to use for FASTA access."""
40
+ if pysam_types is not None:
41
+ return "python"
42
+ if shutil is not None and shutil.which("samtools"):
43
+ return "cli"
44
+ raise RuntimeError("FASTA access requires pysam or samtools in PATH.")
20
45
 
def _ensure_fasta_index(fasta: Path) -> None:
    """Create a ``.fai`` index for ``fasta`` if one does not already exist.

    Uses the ``samtools faidx`` CLI when it is on PATH and falls back to
    pysam otherwise.

    Args:
        fasta: Path to the FASTA file to index.

    Raises:
        RuntimeError: If ``samtools faidx`` exits non-zero.
    """
    fai = fasta.with_suffix(fasta.suffix + ".fai")
    if fai.exists():
        return
    # ``subprocess`` and ``shutil`` are unconditional module imports, so the
    # only real question is whether the samtools executable is on PATH.
    if not shutil.which("samtools"):
        pysam_mod = _require_pysam()
        pysam_mod.faidx(str(fasta))
        return
    cp = subprocess.run(
        ["samtools", "faidx", str(fasta)],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        text=True,
    )
    if cp.returncode != 0:
        raise RuntimeError(f"samtools faidx failed (exit {cp.returncode}):\n{cp.stderr}")
def _bed_to_faidx_region(chrom: str, start: int, end: int) -> str:
    """Convert 0-based half-open BED coords to samtools faidx region."""
    # faidx regions are 1-based and end-inclusive.
    lo, hi = start + 1, end
    if lo > hi:
        # Normalize inverted (or zero-length) intervals so lo <= hi.
        lo, hi = hi, lo
    return f"{chrom}:{lo}-{hi}"
def _fetch_sequence_with_samtools(fasta: Path, chrom: str, start: int, end: int) -> str:
    """Fetch a subsequence from an indexed FASTA via the ``samtools faidx`` CLI.

    Args:
        fasta: Path to the indexed FASTA file.
        chrom: Reference/record name.
        start: 0-based inclusive start (BED convention).
        end: 0-based exclusive end (BED convention).

    Returns:
        The fetched sequence with the FASTA header line and line wrapping removed.

    Raises:
        RuntimeError: If samtools is not on PATH or ``faidx`` exits non-zero.
    """
    # ``subprocess``/``shutil`` are unconditional imports; only the PATH
    # lookup for the samtools executable can fail.
    if not shutil.which("samtools"):
        raise RuntimeError("samtools is required but not available in PATH.")
    region = _bed_to_faidx_region(chrom, start, end)
    cp = subprocess.run(
        ["samtools", "faidx", str(fasta), region],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    if cp.returncode != 0:
        raise RuntimeError(f"samtools faidx failed (exit {cp.returncode}):\n{cp.stderr}")
    # Drop the ">" header line and re-join wrapped sequence lines.
    lines = [line.strip() for line in cp.stdout.splitlines() if line and not line.startswith(">")]
    return "".join(lines)
def _convert_FASTA_record(
    record: SeqRecord,
    modification_type: str,
    strand: str,
    unconverted: str,
) -> SeqRecord:
    """Convert a FASTA record based on modification type and strand.

    Args:
        record: Input FASTA record.
        modification_type: Modification type (e.g., ``5mC`` or ``6mA``).
        strand: Strand label (``top`` or ``bottom``).
        unconverted: Label for the unconverted record type.

    Returns:
        Bio.SeqRecord.SeqRecord: Converted FASTA record.

    Raises:
        ValueError: If the modification type/strand combination is invalid.
    """
    upper_sequence = str(record.seq).upper()

    # The unconverted record keeps its sequence and is always tagged "top".
    if modification_type == unconverted:
        return SeqRecord(
            Seq(upper_sequence),
            id=f"{record.id}_{modification_type}_top",
            description=record.description,
        )

    # (modification, strand) -> (base to replace, replacement base)
    substitutions = {
        ("5mC", "top"): ("C", "T"),
        ("5mC", "bottom"): ("G", "A"),
        ("6mA", "top"): ("A", "G"),
        ("6mA", "bottom"): ("T", "C"),
    }
    key = (modification_type, strand)
    if key not in substitutions:
        raise ValueError(f"Invalid combination: {modification_type}, {strand}")

    source_base, target_base = substitutions[key]
    converted_sequence = upper_sequence.replace(source_base, target_base)

    return SeqRecord(
        Seq(converted_sequence),
        id=f"{record.id}_{modification_type}_{strand}",
        description=record.description,
    )
def _process_fasta_record(
    args: tuple[SeqRecord, Iterable[str], Iterable[str], str],
) -> list[SeqRecord]:
    """Process a single FASTA record for parallel conversion.

    Args:
        args: Tuple containing ``(record, modification_types, strands, unconverted)``.

    Returns:
        list[Bio.SeqRecord.SeqRecord]: Converted FASTA records.
    """
    record, modification_types, strands, unconverted = args

    converted: list[SeqRecord] = []
    for mod_type in modification_types:
        for strand_index, strand in enumerate(strands):
            # The unconverted record is strand-agnostic, so emit it only for
            # the first strand to avoid duplicates.
            if strand_index > 0 and mod_type == unconverted:
                continue
            converted.append(_convert_FASTA_record(record, mod_type, strand, unconverted))

    return converted
63
- def generate_converted_FASTA(input_fasta, modification_types, strands, output_fasta, num_threads=4, chunk_size=500):
64
- """
65
- Converts an input FASTA file and writes a new converted FASTA file efficiently.
66
162
 
67
- Parameters:
68
- input_fasta (str): Path to the unconverted FASTA file.
69
- modification_types (list): List of modification types ('5mC', '6mA', or unconverted).
70
- strands (list): List of strands ('top', 'bottom').
71
- output_fasta (str): Path to the converted FASTA output file.
72
- num_threads (int): Number of parallel threads to use.
73
- chunk_size (int): Number of records to process per write batch.
163
+ def generate_converted_FASTA(
164
+ input_fasta: str | Path,
165
+ modification_types: list[str],
166
+ strands: list[str],
167
+ output_fasta: str | Path,
168
+ num_threads: int = 4,
169
+ chunk_size: int = 500,
170
+ ) -> None:
171
+ """Convert a FASTA file and write converted records to disk.
74
172
 
75
- Returns:
76
- None (Writes the converted FASTA file).
173
+ Args:
174
+ input_fasta: Path to the unconverted FASTA file.
175
+ modification_types: List of modification types (``5mC``, ``6mA``, or unconverted).
176
+ strands: List of strands (``top``, ``bottom``).
177
+ output_fasta: Path to the converted FASTA output file.
178
+ num_threads: Number of parallel workers to use.
179
+ chunk_size: Number of records to process per write batch.
77
180
  """
78
181
  unconverted = modification_types[0]
79
182
  input_fasta = str(input_fasta)
80
183
  output_fasta = str(output_fasta)
81
184
 
82
185
  # Detect if input is gzipped
83
- open_func = gzip.open if input_fasta.endswith('.gz') else open
84
- file_mode = 'rt' if input_fasta.endswith('.gz') else 'r'
186
+ open_func = gzip.open if input_fasta.endswith(".gz") else open
187
+ file_mode = "rt" if input_fasta.endswith(".gz") else "r"
85
188
 
86
189
  def _fasta_record_generator():
87
- """ Lazily yields FASTA records from file. """
190
+ """Lazily yields FASTA records from file."""
88
191
  with open_func(input_fasta, file_mode) as handle:
89
- for record in SeqIO.parse(handle, 'fasta'):
192
+ for record in SeqIO.parse(handle, "fasta"):
90
193
  yield record
91
194
 
92
- with open(output_fasta, 'w') as output_handle, ProcessPoolExecutor(max_workers=num_threads) as executor:
195
+ with (
196
+ open(output_fasta, "w") as output_handle,
197
+ ProcessPoolExecutor(max_workers=num_threads) as executor,
198
+ ):
93
199
  # Process records in parallel using a named function (avoiding lambda)
94
200
  results = executor.map(
95
201
  _process_fasta_record,
96
- ((record, modification_types, strands, unconverted) for record in _fasta_record_generator())
202
+ (
203
+ (record, modification_types, strands, unconverted)
204
+ for record in _fasta_record_generator()
205
+ ),
97
206
  )
98
207
 
99
208
  buffer = []
@@ -102,16 +211,26 @@ def generate_converted_FASTA(input_fasta, modification_types, strands, output_fa
102
211
 
103
212
  # Write out in chunks to save memory
104
213
  if len(buffer) >= chunk_size:
105
- SeqIO.write(buffer, output_handle, 'fasta')
214
+ SeqIO.write(buffer, output_handle, "fasta")
106
215
  buffer.clear()
107
216
 
108
217
  # Write any remaining records
109
218
  if buffer:
110
- SeqIO.write(buffer, output_handle, 'fasta')
219
+ SeqIO.write(buffer, output_handle, "fasta")
220
+
111
221
 
112
222
  def index_fasta(fasta: str | Path, write_chrom_sizes: bool = True) -> Path:
223
+ """Index a FASTA file and optionally write chromosome sizes.
224
+
225
+ Args:
226
+ fasta: Path to the FASTA file.
227
+ write_chrom_sizes: Whether to write a ``.chrom.sizes`` file.
228
+
229
+ Returns:
230
+ Path: Path to the index file or chromosome sizes file.
231
+ """
113
232
  fasta = Path(fasta)
114
- pysam.faidx(str(fasta)) # creates <fasta>.fai
233
+ _require_pysam().faidx(str(fasta)) # creates <fasta>.fai
115
234
 
116
235
  fai = fasta.with_suffix(fasta.suffix + ".fai")
117
236
  if write_chrom_sizes:
@@ -123,9 +242,15 @@ def index_fasta(fasta: str | Path, write_chrom_sizes: bool = True) -> Path:
123
242
  return chrom_sizes
124
243
  return fai
125
244
 
245
+
126
246
  def get_chromosome_lengths(fasta: str | Path) -> Path:
127
- """
128
- Create (or reuse) <fasta>.chrom.sizes, derived from the FASTA index.
247
+ """Create or reuse ``<fasta>.chrom.sizes`` derived from the FASTA index.
248
+
249
+ Args:
250
+ fasta: Path to the FASTA file.
251
+
252
+ Returns:
253
+ Path: Path to the chromosome sizes file.
129
254
  """
130
255
  fasta = Path(fasta)
131
256
  fai = fasta.with_suffix(fasta.suffix + ".fai")
@@ -133,7 +258,7 @@ def get_chromosome_lengths(fasta: str | Path) -> Path:
133
258
  index_fasta(fasta, write_chrom_sizes=True) # will also create .chrom.sizes
134
259
  chrom_sizes = fasta.with_suffix(".chrom.sizes")
135
260
  if chrom_sizes.exists():
136
- print(f"Using existing chrom length file: {chrom_sizes}")
261
+ logger.debug(f"Using existing chrom length file: {chrom_sizes}")
137
262
  return chrom_sizes
138
263
 
139
264
  # Build chrom.sizes from .fai
@@ -143,10 +268,15 @@ def get_chromosome_lengths(fasta: str | Path) -> Path:
143
268
  out.write(f"{chrom}\t{size}\n")
144
269
  return chrom_sizes
145
270
 
271
+
146
272
  def get_native_references(fasta_file: str | Path) -> Dict[str, Tuple[int, str]]:
147
- """
148
- Return {record_id: (length, sequence)} from a FASTA.
149
- Direct methylation specific
273
+ """Return record lengths and sequences from a FASTA file.
274
+
275
+ Args:
276
+ fasta_file: Path to the FASTA file.
277
+
278
+ Returns:
279
+ dict[str, tuple[int, str]]: Mapping of record ID to ``(length, sequence)``.
150
280
  """
151
281
  fasta_file = Path(fasta_file)
152
282
  print(f"{time_string()}: Opening FASTA file {fasta_file}")
@@ -157,28 +287,35 @@ def get_native_references(fasta_file: str | Path) -> Dict[str, Tuple[int, str]]:
157
287
  record_dict[rec.id] = (len(seq), seq)
158
288
  return record_dict
159
289
 
160
- def find_conversion_sites(fasta_file, modification_type, conversions, deaminase_footprinting=False):
161
- """
162
- Finds genomic coordinates of modified bases (5mC or 6mA) in a reference FASTA file.
163
-
164
- Parameters:
165
- fasta_file (str): Path to the converted reference FASTA.
166
- modification_type (str): Modification type ('5mC' or '6mA') or 'unconverted'.
167
- conversions (list): List of conversion types. The first element is the unconverted record type.
168
- deaminase_footprinting (bool): Whether the footprinting was done with a direct deamination chemistry.
169
-
170
- Returns:
171
- dict: Dictionary where keys are **both unconverted & converted record names**.
172
- Values contain:
173
- [sequence length, top strand coordinates, bottom strand coordinates, sequence, complement sequence].
290
+
291
+ def find_conversion_sites(
292
+ fasta_file: str | Path,
293
+ modification_type: str,
294
+ conversions: list[str],
295
+ deaminase_footprinting: bool = False,
296
+ ) -> dict[str, list]:
297
+ """Find genomic coordinates of modified bases in a reference FASTA.
298
+
299
+ Args:
300
+ fasta_file: Path to the converted reference FASTA.
301
+ modification_type: Modification type (``5mC``, ``6mA``, or ``unconverted``).
302
+ conversions: List of conversion types (first entry is the unconverted record type).
303
+ deaminase_footprinting: Whether the footprinting used direct deamination chemistry.
304
+
305
+ Returns:
306
+ dict[str, list]: Mapping of record name to
307
+ ``[sequence length, top strand coordinates, bottom strand coordinates, sequence, complement]``.
308
+
309
+ Raises:
310
+ ValueError: If the modification type is invalid.
174
311
  """
175
312
  unconverted = conversions[0]
176
313
  record_dict = {}
177
314
 
178
315
  # Define base mapping based on modification type
179
316
  base_mappings = {
180
- '5mC': ('C', 'G'), # Cytosine and Guanine
181
- '6mA': ('A', 'T') # Adenine and Thymine
317
+ "5mC": ("C", "G"), # Cytosine and Guanine
318
+ "6mA": ("A", "T"), # Adenine and Thymine
182
319
  }
183
320
 
184
321
  # Read FASTA file and process records
@@ -200,22 +337,35 @@ def find_conversion_sites(fasta_file, modification_type, conversions, deaminase_
200
337
  top_strand_coordinates = np.where(seq_array == top_base)[0].tolist()
201
338
  bottom_strand_coordinates = np.where(seq_array == bottom_base)[0].tolist()
202
339
 
203
- record_dict[record.id] = [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence, complement]
340
+ record_dict[record.id] = [
341
+ sequence_length,
342
+ top_strand_coordinates,
343
+ bottom_strand_coordinates,
344
+ sequence,
345
+ complement,
346
+ ]
204
347
 
205
348
  else:
206
- raise ValueError(f"Invalid modification_type: {modification_type}. Choose '5mC', '6mA', or 'unconverted'.")
349
+ raise ValueError(
350
+ f"Invalid modification_type: {modification_type}. Choose '5mC', '6mA', or 'unconverted'."
351
+ )
207
352
 
208
353
  return record_dict
209
354
 
def subsample_fasta_from_bed(
    input_FASTA: str | Path,
    input_bed: str | Path,
    output_directory: str | Path,
    output_FASTA: str | Path,
) -> None:
    """Subsample a FASTA using BED coordinates.

    Each BED interval is fetched from the input FASTA (via pysam when
    available, otherwise the samtools CLI) and written to the output FASTA
    as a record headed ``>chrom:start-end``.

    Args:
        input_FASTA: Genome-wide FASTA path.
        input_bed: BED file path containing coordinate windows of interest.
        output_directory: Directory to write the subsampled FASTA.
        output_FASTA: Output FASTA name/path, joined onto ``output_directory``.
    """

    # Normalize everything to Path
    input_FASTA = Path(input_FASTA)
    input_bed = Path(input_bed)
    output_directory = Path(output_directory)
    output_FASTA = Path(output_FASTA)

    # Ensure output directory exists
    output_directory.mkdir(parents=True, exist_ok=True)

    # Write inside output_directory, as documented.  Joining leaves an
    # absolute output_FASTA untouched, so both call styles keep working.
    output_FASTA_path = output_directory / output_FASTA

    backend = _resolve_fasta_backend()
    _ensure_fasta_index(input_FASTA)

    fasta_handle = None
    if backend == "python":
        pysam_mod = _require_pysam()
        fasta_handle = pysam_mod.FastaFile(str(input_FASTA))

    try:
        # Open BED + output FASTA
        with input_bed.open("r") as bed, output_FASTA_path.open("w") as out_fasta:
            for line in bed:
                fields = line.strip().split()
                if len(fields) < 3:
                    continue  # Skip blank or malformed BED lines
                chrom = fields[0]
                start = int(fields[1])  # BED is 0-based
                end = int(fields[2])  # BED is 0-based and end is exclusive
                desc = " ".join(fields[3:]) if len(fields) > 3 else ""

                if backend == "python":
                    assert fasta_handle is not None
                    if chrom not in fasta_handle.references:
                        logger.warning(f"{chrom} not found in FASTA")
                        continue
                    sequence = fasta_handle.fetch(chrom, start, end)
                else:
                    sequence = _fetch_sequence_with_samtools(input_FASTA, chrom, start, end)

                if not sequence:
                    logger.warning(f"{chrom} not found in FASTA")
                    continue

                header = f">{chrom}:{start}-{end}"
                if desc:
                    header += f" {desc}"

                out_fasta.write(f"{header}\n{sequence}\n")
    finally:
        # Close the pysam handle even if an error occurs mid-iteration.
        if fasta_handle is not None:
            fasta_handle.close()