smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. smftools/__init__.py +43 -13
  2. smftools/_settings.py +6 -6
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +9 -1
  7. smftools/cli/hmm_adata.py +905 -242
  8. smftools/cli/load_adata.py +432 -280
  9. smftools/cli/preprocess_adata.py +287 -171
  10. smftools/cli/spatial_adata.py +141 -53
  11. smftools/cli_entry.py +119 -178
  12. smftools/config/__init__.py +3 -1
  13. smftools/config/conversion.yaml +5 -1
  14. smftools/config/deaminase.yaml +1 -1
  15. smftools/config/default.yaml +26 -18
  16. smftools/config/direct.yaml +8 -3
  17. smftools/config/discover_input_files.py +19 -5
  18. smftools/config/experiment_config.py +511 -276
  19. smftools/constants.py +37 -0
  20. smftools/datasets/__init__.py +4 -8
  21. smftools/datasets/datasets.py +32 -18
  22. smftools/hmm/HMM.py +2133 -1428
  23. smftools/hmm/__init__.py +24 -14
  24. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  25. smftools/hmm/archived/calculate_distances.py +2 -0
  26. smftools/hmm/archived/call_hmm_peaks.py +18 -1
  27. smftools/hmm/archived/train_hmm.py +2 -0
  28. smftools/hmm/call_hmm_peaks.py +176 -193
  29. smftools/hmm/display_hmm.py +23 -7
  30. smftools/hmm/hmm_readwrite.py +20 -6
  31. smftools/hmm/nucleosome_hmm_refinement.py +104 -14
  32. smftools/informatics/__init__.py +55 -13
  33. smftools/informatics/archived/bam_conversion.py +2 -0
  34. smftools/informatics/archived/bam_direct.py +2 -0
  35. smftools/informatics/archived/basecall_pod5s.py +2 -0
  36. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  37. smftools/informatics/archived/conversion_smf.py +2 -0
  38. smftools/informatics/archived/deaminase_smf.py +1 -0
  39. smftools/informatics/archived/direct_smf.py +2 -0
  40. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  41. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  42. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
  43. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  44. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  45. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  46. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  47. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  48. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  49. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  50. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  52. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  53. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  54. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  55. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  56. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  57. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  58. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  59. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  60. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  61. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  62. smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
  63. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  64. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  65. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  66. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  67. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  68. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  69. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  70. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
  71. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  72. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  73. smftools/informatics/archived/print_bam_query_seq.py +9 -1
  74. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  75. smftools/informatics/archived/subsample_pod5.py +2 -0
  76. smftools/informatics/bam_functions.py +1059 -269
  77. smftools/informatics/basecalling.py +53 -9
  78. smftools/informatics/bed_functions.py +357 -114
  79. smftools/informatics/binarize_converted_base_identities.py +21 -7
  80. smftools/informatics/complement_base_list.py +9 -6
  81. smftools/informatics/converted_BAM_to_adata.py +324 -137
  82. smftools/informatics/fasta_functions.py +251 -89
  83. smftools/informatics/h5ad_functions.py +202 -30
  84. smftools/informatics/modkit_extract_to_adata.py +623 -274
  85. smftools/informatics/modkit_functions.py +87 -44
  86. smftools/informatics/ohe.py +46 -21
  87. smftools/informatics/pod5_functions.py +114 -74
  88. smftools/informatics/run_multiqc.py +20 -14
  89. smftools/logging_utils.py +51 -0
  90. smftools/machine_learning/__init__.py +23 -12
  91. smftools/machine_learning/data/__init__.py +2 -0
  92. smftools/machine_learning/data/anndata_data_module.py +157 -50
  93. smftools/machine_learning/data/preprocessing.py +4 -1
  94. smftools/machine_learning/evaluation/__init__.py +3 -1
  95. smftools/machine_learning/evaluation/eval_utils.py +13 -14
  96. smftools/machine_learning/evaluation/evaluators.py +52 -34
  97. smftools/machine_learning/inference/__init__.py +3 -1
  98. smftools/machine_learning/inference/inference_utils.py +9 -4
  99. smftools/machine_learning/inference/lightning_inference.py +14 -13
  100. smftools/machine_learning/inference/sklearn_inference.py +8 -8
  101. smftools/machine_learning/inference/sliding_window_inference.py +37 -25
  102. smftools/machine_learning/models/__init__.py +12 -5
  103. smftools/machine_learning/models/base.py +34 -43
  104. smftools/machine_learning/models/cnn.py +22 -13
  105. smftools/machine_learning/models/lightning_base.py +78 -42
  106. smftools/machine_learning/models/mlp.py +18 -5
  107. smftools/machine_learning/models/positional.py +10 -4
  108. smftools/machine_learning/models/rnn.py +8 -3
  109. smftools/machine_learning/models/sklearn_models.py +46 -24
  110. smftools/machine_learning/models/transformer.py +75 -55
  111. smftools/machine_learning/models/wrappers.py +8 -3
  112. smftools/machine_learning/training/__init__.py +4 -2
  113. smftools/machine_learning/training/train_lightning_model.py +42 -23
  114. smftools/machine_learning/training/train_sklearn_model.py +11 -15
  115. smftools/machine_learning/utils/__init__.py +3 -1
  116. smftools/machine_learning/utils/device.py +12 -5
  117. smftools/machine_learning/utils/grl.py +8 -2
  118. smftools/metadata.py +443 -0
  119. smftools/optional_imports.py +31 -0
  120. smftools/plotting/__init__.py +32 -17
  121. smftools/plotting/autocorrelation_plotting.py +153 -48
  122. smftools/plotting/classifiers.py +175 -73
  123. smftools/plotting/general_plotting.py +350 -168
  124. smftools/plotting/hmm_plotting.py +53 -14
  125. smftools/plotting/position_stats.py +155 -87
  126. smftools/plotting/qc_plotting.py +25 -12
  127. smftools/preprocessing/__init__.py +35 -37
  128. smftools/preprocessing/append_base_context.py +105 -79
  129. smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
  130. smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
  131. smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
  132. smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
  133. smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
  134. smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
  135. smftools/preprocessing/binarize.py +21 -4
  136. smftools/preprocessing/binarize_on_Youden.py +127 -31
  137. smftools/preprocessing/binary_layers_to_ohe.py +18 -11
  138. smftools/preprocessing/calculate_complexity_II.py +89 -59
  139. smftools/preprocessing/calculate_consensus.py +28 -19
  140. smftools/preprocessing/calculate_coverage.py +44 -22
  141. smftools/preprocessing/calculate_pairwise_differences.py +4 -1
  142. smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
  143. smftools/preprocessing/calculate_position_Youden.py +110 -55
  144. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  145. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  146. smftools/preprocessing/clean_NaN.py +38 -28
  147. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  148. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
  149. smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
  150. smftools/preprocessing/flag_duplicate_reads.py +708 -303
  151. smftools/preprocessing/invert_adata.py +26 -11
  152. smftools/preprocessing/load_sample_sheet.py +40 -22
  153. smftools/preprocessing/make_dirs.py +9 -3
  154. smftools/preprocessing/min_non_diagonal.py +4 -1
  155. smftools/preprocessing/recipes.py +58 -23
  156. smftools/preprocessing/reindex_references_adata.py +93 -27
  157. smftools/preprocessing/subsample_adata.py +33 -16
  158. smftools/readwrite.py +264 -109
  159. smftools/schema/__init__.py +11 -0
  160. smftools/schema/anndata_schema_v1.yaml +227 -0
  161. smftools/tools/__init__.py +25 -18
  162. smftools/tools/archived/apply_hmm.py +2 -0
  163. smftools/tools/archived/classifiers.py +165 -0
  164. smftools/tools/archived/classify_methylated_features.py +2 -0
  165. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  166. smftools/tools/archived/subset_adata_v1.py +12 -1
  167. smftools/tools/archived/subset_adata_v2.py +14 -1
  168. smftools/tools/calculate_umap.py +56 -15
  169. smftools/tools/cluster_adata_on_methylation.py +122 -47
  170. smftools/tools/general_tools.py +70 -25
  171. smftools/tools/position_stats.py +220 -99
  172. smftools/tools/read_stats.py +50 -29
  173. smftools/tools/spatial_autocorrelation.py +365 -192
  174. smftools/tools/subset_adata.py +23 -21
  175. smftools-0.3.0.dist-info/METADATA +147 -0
  176. smftools-0.3.0.dist-info/RECORD +182 -0
  177. smftools-0.2.4.dist-info/METADATA +0 -141
  178. smftools-0.2.4.dist-info/RECORD +0 -176
  179. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
  180. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
  181. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,21 +1,134 @@
-from pathlib import Path
+from __future__ import annotations
+
+import concurrent.futures
 import os
+import shutil
 import subprocess
-from typing import List, Optional, Union
-import pysam
-import pybedtools
-import pyBigWig
+from concurrent.futures import ProcessPoolExecutor
+from pathlib import Path
+from typing import TYPE_CHECKING
 
 import numpy as np
 import pandas as pd
-import concurrent.futures
-from concurrent.futures import ProcessPoolExecutor
 
-import matplotlib.pyplot as plt
+from smftools.logging_utils import get_logger
+from smftools.optional_imports import require
 
 from ..readwrite import make_dirs
 
-def _bed_to_bigwig(fasta: str, bed: str) -> str:
+logger = get_logger(__name__)
+
+if TYPE_CHECKING:
+    import pybedtools as pybedtools_types
+    import pyBigWig as pybigwig_types
+    import pysam as pysam_types
+
+try:
+    import pybedtools
+except Exception:
+    pybedtools = None  # type: ignore
+
+try:
+    import pyBigWig
+except Exception:
+    pyBigWig = None  # type: ignore
+
+try:
+    import pysam
+except Exception:
+    pysam = None  # type: ignore
+
+
+def _require_pybedtools() -> "pybedtools_types":
+    if pybedtools is not None:
+        return pybedtools
+    return require("pybedtools", extra="pybedtools", purpose="bedtools Python backend")
+
+
+def _require_pybigwig() -> "pybigwig_types":
+    if pyBigWig is not None:
+        return pyBigWig
+    return require("pyBigWig", extra="pybigwig", purpose="BigWig Python backend")
+
+
+def _require_pysam() -> "pysam_types":
+    if pysam is not None:
+        return pysam
+    return require("pysam", extra="pysam", purpose="FASTA indexing")
+
+
+def _resolve_backend(
+    backend: str | None, *, tool: str, python_available: bool, cli_name: str
+) -> str:
+    choice = (backend or "auto").strip().lower()
+    if choice not in {"auto", "python", "cli"}:
+        raise ValueError(f"{tool}_backend must be one of: auto, python, cli")
+    if choice == "python":
+        if not python_available:
+            raise RuntimeError(
+                f"{tool}_backend=python requires the Python package to be installed."
+            )
+        return "python"
+    if choice == "cli":
+        if not shutil.which(cli_name):
+            raise RuntimeError(f"{tool}_backend=cli requires {cli_name} in PATH.")
+        return "cli"
+    if shutil.which(cli_name):
+        return "cli"
+    if python_available:
+        return "python"
+    raise RuntimeError(f"Neither Python nor CLI backend is available for {tool}.")
+
+
+def _read_chrom_sizes(chrom_sizes: Path) -> list[tuple[str, int]]:
+    sizes: list[tuple[str, int]] = []
+    with chrom_sizes.open() as f:
+        for line in f:
+            chrom, size = line.split()[:2]
+            sizes.append((chrom, int(size)))
+    return sizes
+
+
+def _ensure_fasta_index(fasta: Path) -> Path:
+    fai = fasta.with_suffix(fasta.suffix + ".fai")
+    if fai.exists():
+        return fai
+    if shutil.which("samtools"):
+        cp = subprocess.run(
+            ["samtools", "faidx", str(fasta)],
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.PIPE,
+            text=True,
+        )
+        if cp.returncode != 0:
+            raise RuntimeError(f"samtools faidx failed (exit {cp.returncode}):\n{cp.stderr}")
+        return fai
+    if pysam is not None:
+        pysam_mod = _require_pysam()
+        pysam_mod.faidx(str(fasta))
+        return fai
+    raise RuntimeError("FASTA indexing requires pysam or samtools in PATH.")
+
+
+def _ensure_chrom_sizes(fasta: Path) -> Path:
+    fai = _ensure_fasta_index(fasta)
+    chrom_sizes = fasta.with_suffix(".chrom.sizes")
+    if chrom_sizes.exists():
+        return chrom_sizes
+    with fai.open() as f_in, chrom_sizes.open("w") as out:
+        for line in f_in:
+            chrom, size = line.split()[:2]
+            out.write(f"{chrom}\t{size}\n")
+    return chrom_sizes
+
+
+def _bed_to_bigwig(
+    fasta: str,
+    bed: str,
+    *,
+    bedtools_backend: str | None = "auto",
+    bigwig_backend: str | None = "auto",
+) -> str:
     """
     BED → bedGraph → bigWig
     Requires:
@@ -26,44 +139,75 @@ def _bed_to_bigwig(fasta: str, bed: str) -> str:
     fa = Path(fasta)  # path to .fa
     parent = bed.parent
     stem = bed.stem
-    fa_stem = fa.stem
-    fai = parent / f"{fa_stem}.fai"
+    chrom_sizes = _ensure_chrom_sizes(fa)
 
     bedgraph = parent / f"{stem}.bedgraph"
     bigwig = parent / f"{stem}.bw"
 
     # 1) Compute coverage → bedGraph
-    print(f"[pybedtools] generating coverage bedgraph from {bed}")
-    bt = pybedtools.BedTool(str(bed))
-    # bedtools genomecov -bg
-    coverage = bt.genome_coverage(bg=True, genome=str(fai))
-    coverage.saveas(str(bedgraph))
+    bedtools_choice = _resolve_backend(
+        bedtools_backend,
+        tool="bedtools",
+        python_available=pybedtools is not None,
+        cli_name="bedtools",
+    )
+    if bedtools_choice == "python":
+        logger.debug(f"[pybedtools] generating coverage bedgraph from {bed}")
+        pybedtools_mod = _require_pybedtools()
+        bt = pybedtools_mod.BedTool(str(bed))
+        # bedtools genomecov -bg
+        coverage = bt.genome_coverage(bg=True, genome=str(chrom_sizes))
+        coverage.saveas(str(bedgraph))
+    else:
+        if not shutil.which("bedtools"):
+            raise RuntimeError("bedtools is required but not available in PATH.")
+        cmd = [
+            "bedtools",
+            "genomecov",
+            "-i",
+            str(bed),
+            "-g",
+            str(chrom_sizes),
+            "-bg",
+        ]
+        logger.debug("[bedtools] generating coverage bedgraph: %s", " ".join(cmd))
+        with bedgraph.open("w") as out:
+            cp = subprocess.run(cmd, stdout=out, stderr=subprocess.PIPE, text=True)
+        if cp.returncode != 0:
+            raise RuntimeError(f"bedtools genomecov failed (exit {cp.returncode}):\n{cp.stderr}")
 
     # 2) Convert bedGraph → BigWig via pyBigWig
-    print(f"[pyBigWig] converting bedgraph → bigwig: {bigwig}")
-
-    # read chrom sizes from the FASTA .fai index
-    chrom_sizes = {}
-    with open(fai) as f:
-        for line in f:
-            fields = line.strip().split("\t")
-            chrom = fields[0]
-            size = int(fields[1])
-            chrom_sizes[chrom] = size
-
-    bw = pyBigWig.open(str(bigwig), "w")
-    bw.addHeader(list(chrom_sizes.items()))
-
-    with open(bedgraph) as f:
-        for line in f:
-            chrom, start, end, coverage = line.strip().split()
-            bw.addEntries(chrom, int(start), ends=int(end), values=float(coverage))
-
-    bw.close()
-
-    print(f"BigWig written: {bigwig}")
+    bigwig_choice = _resolve_backend(
+        bigwig_backend,
+        tool="bigwig",
+        python_available=pyBigWig is not None,
+        cli_name="bedGraphToBigWig",
+    )
+    if bigwig_choice == "python":
+        logger.debug(f"[pyBigWig] converting bedgraph → bigwig: {bigwig}")
+        pybigwig_mod = _require_pybigwig()
+        bw = pybigwig_mod.open(str(bigwig), "w")
+        bw.addHeader(_read_chrom_sizes(chrom_sizes))
+
+        with bedgraph.open() as f:
+            for line in f:
+                chrom, start, end, coverage = line.strip().split()
+                bw.addEntries(chrom, int(start), ends=int(end), values=float(coverage))
+
+        bw.close()
+    else:
+        if not shutil.which("bedGraphToBigWig"):
+            raise RuntimeError("bedGraphToBigWig is required but not available in PATH.")
+        cmd = ["bedGraphToBigWig", str(bedgraph), str(chrom_sizes), str(bigwig)]
+        logger.debug("[bedGraphToBigWig] converting bedgraph → bigwig: %s", " ".join(cmd))
+        cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
+        if cp.returncode != 0:
+            raise RuntimeError(f"bedGraphToBigWig failed (exit {cp.returncode}):\n{cp.stderr}")
+
+    logger.debug(f"BigWig written: {bigwig}")
     return str(bigwig)
 
+
 def _plot_bed_histograms(
     bed_file,
     plotting_directory,
@@ -71,9 +215,9 @@
     *,
     bins=60,
     clip_quantiles=(0.0, 0.995),
-    cov_bin_size=1000,            # coverage bin size in bp
-    rows_per_fig=6,               # paginate if many chromosomes
-    include_mapq_quality=True,    # add MAPQ + avg read quality columns to grid
+    cov_bin_size=1000,  # coverage bin size in bp
+    rows_per_fig=6,  # paginate if many chromosomes
+    include_mapq_quality=True,  # add MAPQ + avg read quality columns to grid
     coordinate_mode="one_based",  # "one_based" (your BED-like) or "zero_based"
 ):
     """
@@ -110,22 +254,35 @@ def _plot_bed_histograms(
     coordinate_mode : {"one_based","zero_based"}
        One-based, inclusive (your file) vs BED-standard zero-based, half-open.
     """
+    plt = require("matplotlib.pyplot", extra="plotting", purpose="plotting BED histograms")
+
     os.makedirs(plotting_directory, exist_ok=True)
 
     bed_basename = os.path.basename(bed_file).rsplit(".bed", 1)[0]
-    print(f"[plot_bed_histograms] Loading: {bed_file}")
+    logger.debug(f"[plot_bed_histograms] Loading: {bed_file}")
 
     # Load BED-like table
-    cols = ['chrom', 'start', 'end', 'read_len', 'qname', 'mapq', 'avg_q']
-    df = pd.read_csv(bed_file, sep="\t", header=None, names=cols, dtype={
-        'chrom': str, 'start': int, 'end': int, 'read_len': int, 'qname': str,
-        'mapq': float, 'avg_q': float
-    })
+    cols = ["chrom", "start", "end", "read_len", "qname", "mapq", "avg_q"]
+    df = pd.read_csv(
+        bed_file,
+        sep="\t",
+        header=None,
+        names=cols,
+        dtype={
+            "chrom": str,
+            "start": int,
+            "end": int,
+            "read_len": int,
+            "qname": str,
+            "mapq": float,
+            "avg_q": float,
+        },
+    )
 
     # Drop unaligned records (chrom == '*') if present
-    df = df[df['chrom'] != '*'].copy()
+    df = df[df["chrom"] != "*"].copy()
     if df.empty:
-        print("[plot_bed_histograms] No aligned reads found; nothing to plot.")
+        logger.debug("[plot_bed_histograms] No aligned reads found; nothing to plot.")
         return
 
     # Ensure coordinate mode consistent; convert to 0-based half-open for bin math internally
@@ -135,15 +292,16 @@
 
     if coordinate_mode == "one_based":
         # convert to 0-based half-open [start0, end0)
-        start0 = df['start'].to_numpy() - 1
-        end0 = df['end'].to_numpy()  # inclusive in input -> +1 already handled by not subtracting
+        start0 = df["start"].to_numpy() - 1
+        end0 = df["end"].to_numpy()  # inclusive in input -> +1 already handled by not subtracting
     else:
         # already 0-based half-open (assumption)
-        start0 = df['start'].to_numpy()
-        end0 = df['end'].to_numpy()
+        start0 = df["start"].to_numpy()
+        end0 = df["end"].to_numpy()
 
     # Clip helper for hist tails
     def _clip_series(s, q=(0.0, 0.995)):
+        """Clip a Series to quantile bounds for plotting."""
         if q is None:
             return s.to_numpy()
         lo = s.quantile(q[0]) if q[0] is not None else s.min()
@@ -152,47 +310,48 @@ def _plot_bed_histograms(
         return np.clip(x, lo, hi)
 
     # Load chromosome order/lengths from FASTA
-    with pysam.FastaFile(fasta) as fa:
+    pysam_mod = _require_pysam()
+    with pysam_mod.FastaFile(fasta) as fa:
         ref_names = list(fa.references)
         ref_lengths = dict(zip(ref_names, fa.lengths))
 
     # Keep only chroms present in FASTA and with at least one read
-    chroms = [c for c in df['chrom'].unique() if c in ref_lengths]
+    chroms = [c for c in df["chrom"].unique() if c in ref_lengths]
     # Order chromosomes by FASTA order
     chrom_order = [c for c in ref_names if c in chroms]
 
     if not chrom_order:
-        print("[plot_bed_histograms] No chromosomes from BED are present in FASTA; aborting.")
+        logger.debug(
+            "[plot_bed_histograms] No chromosomes from BED are present in FASTA; aborting."
+        )
         return
 
     # Pagination
     def _sanitize(name: str) -> str:
+        """Sanitize a string for use in filenames."""
         return "".join(ch if ch.isalnum() or ch in "-._" else "_" for ch in name)
 
     cols_per_fig = 4 if include_mapq_quality else 2
 
     for start_idx in range(0, len(chrom_order), rows_per_fig):
-        chunk = chrom_order[start_idx:start_idx + rows_per_fig]
+        chunk = chrom_order[start_idx : start_idx + rows_per_fig]
         nrows = len(chunk)
         ncols = cols_per_fig
 
         fig, axes = plt.subplots(
-            nrows=nrows, ncols=ncols,
-            figsize=(4.0 * ncols, 2.6 * nrows),
-            dpi=160,
-            squeeze=False
+            nrows=nrows, ncols=ncols, figsize=(4.0 * ncols, 2.6 * nrows), dpi=160, squeeze=False
         )
 
         for r, chrom in enumerate(chunk):
            chrom_len = ref_lengths[chrom]
-            mask = (df['chrom'].to_numpy() == chrom)
+            mask = df["chrom"].to_numpy() == chrom
 
             # Slice per-chrom arrays for speed
             s0 = start0[mask]
             e0 = end0[mask]
-            len_arr = df.loc[mask, 'read_len']
-            mapq_arr = df.loc[mask, 'mapq']
-            q_arr = df.loc[mask, 'avg_q']
+            len_arr = df.loc[mask, "read_len"]
+            mapq_arr = df.loc[mask, "mapq"]
+            q_arr = df.loc[mask, "avg_q"]
 
             # --- Col 1: Read length histogram (clipped) ---
             ax = axes[r, 0]
@@ -222,7 +381,7 @@
 
             # Increment all bins in range; loop but at bin resolution (fast for reasonable cov_bin_size).
             for lo, hi in zip(b_lo, b_hi):
-                cov[lo:hi + 1] += 1
+                cov[lo : hi + 1] += 1
 
             x_mid = (edges[:-1] + edges[1:]) / 2.0
             ax.plot(x_mid, cov)
@@ -237,7 +396,12 @@
             # --- Col 3: MAPQ ---
             ax = axes[r, 2]
             # Clip MAPQ upper tail if needed (usually 60)
-            ax.hist(_clip_series(mapq_arr.fillna(0), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+            ax.hist(
+                _clip_series(mapq_arr.fillna(0), clip_quantiles),
+                bins=bins,
+                edgecolor="black",
+                alpha=0.7,
+            )
             if r == 0:
                 ax.set_title("MAPQ")
             ax.set_xlabel("MAPQ")
@@ -245,7 +409,12 @@
 
             # --- Col 4: Avg base quality ---
             ax = axes[r, 3]
-            ax.hist(_clip_series(q_arr.fillna(np.nan), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+            ax.hist(
+                _clip_series(q_arr.fillna(np.nan), clip_quantiles),
+                bins=bins,
+                edgecolor="black",
+                alpha=0.7,
+            )
             if r == 0:
                 ax.set_title("Avg base qual")
             ax.set_xlabel("Phred")
@@ -254,7 +423,8 @@
         fig.suptitle(
             f"{bed_basename} — per-chromosome QC "
             f"({'len,cov,MAPQ,qual' if include_mapq_quality else 'len,cov'})",
-            y=0.995, fontsize=11
+            y=0.995,
+            fontsize=11,
         )
         fig.tight_layout(rect=[0, 0, 1, 0.98])
 
@@ -263,9 +433,20 @@
         plt.savefig(out_png, bbox_inches="tight")
         plt.close(fig)
 
-    print("[plot_bed_histograms] Done.")
+    logger.debug("[plot_bed_histograms] Done.")
 
-def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
+
+def aligned_BAM_to_bed(
+    aligned_BAM,
+    out_dir,
+    fasta,
+    make_bigwigs,
+    threads=None,
+    *,
+    samtools_backend: str | None = "auto",
+    bedtools_backend: str | None = "auto",
+    bigwig_backend: str | None = "auto",
+):
     """
     Takes an aligned BAM as input and writes a BED file of reads as output.
     Bed columns are: Record name, start position, end position, read length, read name, mapping quality, read quality.
@@ -287,60 +468,121 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
     bed_dir = out_dir / "beds"
     make_dirs([plotting_dir, bed_dir])
 
-    bed_output = bed_dir / str(aligned_BAM.name).replace(".bam", "_bed.bed")
-
-    print(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")
-
-    with pysam.AlignmentFile(aligned_BAM, "rb") as bam, open(bed_output, "w") as out:
-        for read in bam.fetch(until_eof=True):
-            if read.is_unmapped:
-                chrom = "*"
-                start1 = 1
-                rl = read.query_length or 0
-                mapq = 0
-            else:
-                chrom = bam.get_reference_name(read.reference_id)
-                # pysam reference_start is 0-based → +1 for 1-based SAM-like start
-                start1 = int(read.reference_start) + 1
-                rl = read.query_length or 0
-                mapq = int(read.mapping_quality)
-
-            # End position in 1-based inclusive coords
-            end1 = start1 + (rl or 0) - 1
-
-            qname = read.query_name
-            quals = read.query_qualities
-            if quals is None or rl == 0:
-                avg_q = float("nan")
-            else:
-                avg_q = float(np.mean(quals))
-
-            out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
-
-    print(f"BED-like file created: {bed_output}")
+    bed_output = bed_dir / str(aligned_BAM.name).replace(".bam", "_bed.bed")
+
+    logger.debug(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")
+
+    backend_choice = _resolve_backend(
+        samtools_backend,
+        tool="samtools",
+        python_available=pysam is not None,
+        cli_name="samtools",
+    )
+    with open(bed_output, "w") as out:
+        if backend_choice == "python":
+            pysam_mod = _require_pysam()
+            with pysam_mod.AlignmentFile(aligned_BAM, "rb") as bam:
+                for read in bam.fetch(until_eof=True):
+                    if read.is_unmapped:
+                        chrom = "*"
+                        start1 = 1
+                        rl = read.query_length or 0
+                        mapq = 0
+                    else:
+                        chrom = bam.get_reference_name(read.reference_id)
+                        # pysam reference_start is 0-based → +1 for 1-based SAM-like start
+                        start1 = int(read.reference_start) + 1
+                        rl = read.query_length or 0
+                        mapq = int(read.mapping_quality)
+
+                    # End position in 1-based inclusive coords
+                    end1 = start1 + (rl or 0) - 1
+
+                    qname = read.query_name
+                    quals = read.query_qualities
+                    if quals is None or rl == 0:
+                        avg_q = float("nan")
+                    else:
+                        avg_q = float(np.mean(quals))
+
+                    out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
+        else:
+            samtools_view = subprocess.Popen(
+                ["samtools", "view", str(aligned_BAM)],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+            )
+            assert samtools_view.stdout is not None
+            for line in samtools_view.stdout:
+                if not line.strip():
+                    continue
+                fields = line.rstrip("\n").split("\t")
+                if len(fields) < 11:
+                    continue
+                qname = fields[0]
+                flag = int(fields[1])
+                chrom = fields[2]
+                pos = int(fields[3])
+                mapq = int(fields[4])
+                seq = fields[9]
+                qual = fields[10]
+                rl = 0 if seq == "*" else len(seq)
+                is_unmapped = bool(flag & 0x4) or chrom == "*"
+                if is_unmapped:
+                    chrom = "*"
+                    start1 = 1
+                    mapq = 0
+                else:
+                    start1 = pos
+                end1 = start1 + (rl or 0) - 1
+                if qual == "*" or rl == 0:
+                    avg_q = float("nan")
+                else:
+                    avg_q = float(np.mean([ord(ch) - 33 for ch in qual]))
+                out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
+            rc = samtools_view.wait()
+            if rc != 0:
+                stderr = samtools_view.stderr.read() if samtools_view.stderr else ""
+                raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
+
+    logger.debug(f"BED-like file created: {bed_output}")
 
     def split_bed(bed):
         """Splits into aligned and unaligned reads (chrom == '*')."""
         bed = str(bed)
         aligned = bed.replace(".bed", "_aligned.bed")
         unaligned = bed.replace(".bed", "_unaligned.bed")
-        with open(bed, "r") as infile, open(aligned, "w") as aligned_out, open(unaligned, "w") as unaligned_out:
+        with (
+            open(bed, "r") as infile,
+            open(aligned, "w") as aligned_out,
+            open(unaligned, "w") as unaligned_out,
+        ):
            for line in infile:
                 (unaligned_out if line.startswith("*\t") else aligned_out).write(line)
         os.remove(bed)
         return aligned
 
-    print(f"Splitting: {bed_output}")
+    logger.debug(f"Splitting: {bed_output}")
     aligned_bed = split_bed(bed_output)
 
     with ProcessPoolExecutor() as executor:
         futures = []
         futures.append(executor.submit(_plot_bed_histograms, aligned_bed, plotting_dir, fasta))
         if make_bigwigs:
-            futures.append(executor.submit(_bed_to_bigwig, fasta, aligned_bed))
+            futures.append(
+                executor.submit(
+                    _bed_to_bigwig,
+                    fasta,
+                    aligned_bed,
+                    bedtools_backend=bedtools_backend,
+                    bigwig_backend=bigwig_backend,
+                )
+            )
         concurrent.futures.wait(futures)
 
-    print("Processing completed successfully.")
+    logger.debug("Processing completed successfully.")
+
 
 def extract_read_lengths_from_bed(file_path):
     """
@@ -352,15 +594,16 @@ def extract_read_lengths_from_bed(file_path):
        read_dict (dict)
     """
     import pandas as pd
-    columns = ['chrom', 'start', 'end', 'length', 'name']
-    df = pd.read_csv(file_path, sep='\t', header=None, names=columns, comment='#')
+
+    columns = ["chrom", "start", "end", "length", "name"]
+    df = pd.read_csv(file_path, sep="\t", header=None, names=columns, comment="#")
     read_dict = {}
     for _, row in df.iterrows():
-        chrom = row['chrom']
-        start = row['start']
-        end = row['end']
-        name = row['name']
-        length = row['length']
+        chrom = row["chrom"]
+        start = row["start"]
+        end = row["end"]
+        name = row["name"]
+        length = row["length"]
         read_dict[name] = length
 
-    return read_dict
+    return read_dict
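
For orientation, a minimal usage sketch of the new keyword-only backend selectors that 0.3.0 adds to aligned_BAM_to_bed. This is not part of the diff: the import path is assumed from the changed file smftools/informatics/bed_functions.py, the file paths are placeholders, and the "auto" behaviour follows _resolve_backend as shown above (prefer the CLI when it is on PATH, otherwise fall back to the Python package).

# Hypothetical usage sketch; names and paths are illustrative, not from the package docs.
from pathlib import Path

from smftools.informatics.bed_functions import aligned_BAM_to_bed  # assumed module path

aligned_BAM_to_bed(
    Path("sample.aligned.bam"),   # aligned BAM summarised into a BED-like table
    Path("qc_out"),               # output directory; a beds/ and a plotting subdirectory are created inside
    Path("reference.fa"),         # FASTA used for chrom sizes and per-chromosome QC plots
    make_bigwigs=True,            # also run the BED → bedGraph → bigWig step
    samtools_backend="auto",      # "auto": samtools CLI if on PATH, else pysam
    bedtools_backend="auto",      # "auto": bedtools CLI if on PATH, else pybedtools
    bigwig_backend="cli",         # force bedGraphToBigWig; "python" would use pyBigWig instead
)

Passing an explicit "python" or "cli" raises a RuntimeError when that backend is unavailable, so the choice fails fast rather than silently switching tools.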