smftools 0.2.5__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148)
  1. smftools/__init__.py +39 -7
  2. smftools/_settings.py +2 -0
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +2 -0
  7. smftools/cli/hmm_adata.py +7 -2
  8. smftools/cli/load_adata.py +130 -98
  9. smftools/cli/preprocess_adata.py +2 -0
  10. smftools/cli/spatial_adata.py +5 -1
  11. smftools/cli_entry.py +26 -1
  12. smftools/config/__init__.py +2 -0
  13. smftools/config/default.yaml +4 -1
  14. smftools/config/experiment_config.py +6 -0
  15. smftools/datasets/__init__.py +2 -0
  16. smftools/hmm/HMM.py +9 -3
  17. smftools/hmm/__init__.py +24 -13
  18. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  19. smftools/hmm/archived/calculate_distances.py +2 -0
  20. smftools/hmm/archived/call_hmm_peaks.py +2 -0
  21. smftools/hmm/archived/train_hmm.py +2 -0
  22. smftools/hmm/call_hmm_peaks.py +5 -2
  23. smftools/hmm/display_hmm.py +4 -1
  24. smftools/hmm/hmm_readwrite.py +7 -2
  25. smftools/hmm/nucleosome_hmm_refinement.py +2 -0
  26. smftools/informatics/__init__.py +53 -34
  27. smftools/informatics/archived/bam_conversion.py +2 -0
  28. smftools/informatics/archived/bam_direct.py +2 -0
  29. smftools/informatics/archived/basecall_pod5s.py +2 -0
  30. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  31. smftools/informatics/archived/conversion_smf.py +2 -0
  32. smftools/informatics/archived/deaminase_smf.py +1 -0
  33. smftools/informatics/archived/direct_smf.py +2 -0
  34. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  35. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  36. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +2 -0
  37. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  38. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  39. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  40. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  41. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  42. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  43. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  44. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  45. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  46. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  47. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  48. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  49. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  50. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  51. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  52. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  53. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  54. smftools/informatics/archived/helpers/archived/load_adata.py +2 -0
  55. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  56. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  57. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  58. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  59. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  60. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  61. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  62. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +2 -0
  63. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  64. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  65. smftools/informatics/archived/print_bam_query_seq.py +2 -0
  66. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  67. smftools/informatics/archived/subsample_pod5.py +2 -0
  68. smftools/informatics/bam_functions.py +737 -170
  69. smftools/informatics/basecalling.py +2 -0
  70. smftools/informatics/bed_functions.py +271 -61
  71. smftools/informatics/binarize_converted_base_identities.py +3 -0
  72. smftools/informatics/complement_base_list.py +2 -0
  73. smftools/informatics/converted_BAM_to_adata.py +66 -22
  74. smftools/informatics/fasta_functions.py +94 -10
  75. smftools/informatics/h5ad_functions.py +8 -2
  76. smftools/informatics/modkit_extract_to_adata.py +16 -6
  77. smftools/informatics/modkit_functions.py +2 -0
  78. smftools/informatics/ohe.py +2 -0
  79. smftools/informatics/pod5_functions.py +3 -2
  80. smftools/machine_learning/__init__.py +22 -6
  81. smftools/machine_learning/data/__init__.py +2 -0
  82. smftools/machine_learning/data/anndata_data_module.py +18 -4
  83. smftools/machine_learning/data/preprocessing.py +2 -0
  84. smftools/machine_learning/evaluation/__init__.py +2 -0
  85. smftools/machine_learning/evaluation/eval_utils.py +2 -0
  86. smftools/machine_learning/evaluation/evaluators.py +14 -9
  87. smftools/machine_learning/inference/__init__.py +2 -0
  88. smftools/machine_learning/inference/inference_utils.py +2 -0
  89. smftools/machine_learning/inference/lightning_inference.py +6 -1
  90. smftools/machine_learning/inference/sklearn_inference.py +2 -0
  91. smftools/machine_learning/inference/sliding_window_inference.py +2 -0
  92. smftools/machine_learning/models/__init__.py +2 -0
  93. smftools/machine_learning/models/base.py +7 -2
  94. smftools/machine_learning/models/cnn.py +7 -2
  95. smftools/machine_learning/models/lightning_base.py +16 -11
  96. smftools/machine_learning/models/mlp.py +5 -1
  97. smftools/machine_learning/models/positional.py +7 -2
  98. smftools/machine_learning/models/rnn.py +5 -1
  99. smftools/machine_learning/models/sklearn_models.py +14 -9
  100. smftools/machine_learning/models/transformer.py +7 -2
  101. smftools/machine_learning/models/wrappers.py +6 -2
  102. smftools/machine_learning/training/__init__.py +2 -0
  103. smftools/machine_learning/training/train_lightning_model.py +13 -3
  104. smftools/machine_learning/training/train_sklearn_model.py +2 -0
  105. smftools/machine_learning/utils/__init__.py +2 -0
  106. smftools/machine_learning/utils/device.py +5 -1
  107. smftools/machine_learning/utils/grl.py +5 -1
  108. smftools/optional_imports.py +31 -0
  109. smftools/plotting/__init__.py +32 -31
  110. smftools/plotting/autocorrelation_plotting.py +9 -5
  111. smftools/plotting/classifiers.py +16 -4
  112. smftools/plotting/general_plotting.py +6 -3
  113. smftools/plotting/hmm_plotting.py +12 -2
  114. smftools/plotting/position_stats.py +15 -7
  115. smftools/plotting/qc_plotting.py +6 -1
  116. smftools/preprocessing/__init__.py +35 -37
  117. smftools/preprocessing/archived/add_read_length_and_mapping_qc.py +2 -0
  118. smftools/preprocessing/archived/calculate_complexity.py +2 -0
  119. smftools/preprocessing/archived/mark_duplicates.py +2 -0
  120. smftools/preprocessing/archived/preprocessing.py +2 -0
  121. smftools/preprocessing/archived/remove_duplicates.py +2 -0
  122. smftools/preprocessing/binary_layers_to_ohe.py +2 -1
  123. smftools/preprocessing/calculate_complexity_II.py +4 -1
  124. smftools/preprocessing/calculate_pairwise_differences.py +2 -0
  125. smftools/preprocessing/calculate_pairwise_hamming_distances.py +3 -0
  126. smftools/preprocessing/calculate_position_Youden.py +9 -2
  127. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +2 -0
  128. smftools/preprocessing/filter_reads_on_modification_thresholds.py +2 -0
  129. smftools/preprocessing/flag_duplicate_reads.py +42 -54
  130. smftools/preprocessing/make_dirs.py +2 -1
  131. smftools/preprocessing/min_non_diagonal.py +2 -0
  132. smftools/preprocessing/recipes.py +2 -0
  133. smftools/tools/__init__.py +26 -18
  134. smftools/tools/archived/apply_hmm.py +2 -0
  135. smftools/tools/archived/classifiers.py +2 -0
  136. smftools/tools/archived/classify_methylated_features.py +2 -0
  137. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  138. smftools/tools/archived/subset_adata_v1.py +2 -0
  139. smftools/tools/archived/subset_adata_v2.py +2 -0
  140. smftools/tools/calculate_umap.py +3 -1
  141. smftools/tools/cluster_adata_on_methylation.py +7 -1
  142. smftools/tools/position_stats.py +17 -27
  143. {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/METADATA +67 -33
  144. smftools-0.3.0.dist-info/RECORD +182 -0
  145. smftools-0.2.5.dist-info/RECORD +0 -181
  146. {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
  147. {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
  148. {smftools-0.2.5.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/basecalling.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import subprocess
 
 
smftools/informatics/bed_functions.py
@@ -1,23 +1,134 @@
+from __future__ import annotations
+
 import concurrent.futures
 import os
+import shutil
+import subprocess
 from concurrent.futures import ProcessPoolExecutor
 from pathlib import Path
+from typing import TYPE_CHECKING
 
-import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-import pybedtools
-import pyBigWig
-import pysam
 
 from smftools.logging_utils import get_logger
+from smftools.optional_imports import require
 
 from ..readwrite import make_dirs
 
 logger = get_logger(__name__)
 
-
-def _bed_to_bigwig(fasta: str, bed: str) -> str:
+if TYPE_CHECKING:
+    import pybedtools as pybedtools_types
+    import pyBigWig as pybigwig_types
+    import pysam as pysam_types
+
+try:
+    import pybedtools
+except Exception:
+    pybedtools = None  # type: ignore
+
+try:
+    import pyBigWig
+except Exception:
+    pyBigWig = None  # type: ignore
+
+try:
+    import pysam
+except Exception:
+    pysam = None  # type: ignore
+
+
+def _require_pybedtools() -> "pybedtools_types":
+    if pybedtools is not None:
+        return pybedtools
+    return require("pybedtools", extra="pybedtools", purpose="bedtools Python backend")
+
+
+def _require_pybigwig() -> "pybigwig_types":
+    if pyBigWig is not None:
+        return pyBigWig
+    return require("pyBigWig", extra="pybigwig", purpose="BigWig Python backend")
+
+
+def _require_pysam() -> "pysam_types":
+    if pysam is not None:
+        return pysam
+    return require("pysam", extra="pysam", purpose="FASTA indexing")
+
+
+def _resolve_backend(
+    backend: str | None, *, tool: str, python_available: bool, cli_name: str
+) -> str:
+    choice = (backend or "auto").strip().lower()
+    if choice not in {"auto", "python", "cli"}:
+        raise ValueError(f"{tool}_backend must be one of: auto, python, cli")
+    if choice == "python":
+        if not python_available:
+            raise RuntimeError(
+                f"{tool}_backend=python requires the Python package to be installed."
+            )
+        return "python"
+    if choice == "cli":
+        if not shutil.which(cli_name):
+            raise RuntimeError(f"{tool}_backend=cli requires {cli_name} in PATH.")
+        return "cli"
+    if shutil.which(cli_name):
+        return "cli"
+    if python_available:
+        return "python"
+    raise RuntimeError(f"Neither Python nor CLI backend is available for {tool}.")
+
+
+def _read_chrom_sizes(chrom_sizes: Path) -> list[tuple[str, int]]:
+    sizes: list[tuple[str, int]] = []
+    with chrom_sizes.open() as f:
+        for line in f:
+            chrom, size = line.split()[:2]
+            sizes.append((chrom, int(size)))
+    return sizes
+
+
+def _ensure_fasta_index(fasta: Path) -> Path:
+    fai = fasta.with_suffix(fasta.suffix + ".fai")
+    if fai.exists():
+        return fai
+    if shutil.which("samtools"):
+        cp = subprocess.run(
+            ["samtools", "faidx", str(fasta)],
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.PIPE,
+            text=True,
+        )
+        if cp.returncode != 0:
+            raise RuntimeError(f"samtools faidx failed (exit {cp.returncode}):\n{cp.stderr}")
+        return fai
+    if pysam is not None:
+        pysam_mod = _require_pysam()
+        pysam_mod.faidx(str(fasta))
+        return fai
+    raise RuntimeError("FASTA indexing requires pysam or samtools in PATH.")
+
+
+def _ensure_chrom_sizes(fasta: Path) -> Path:
+    fai = _ensure_fasta_index(fasta)
+    chrom_sizes = fasta.with_suffix(".chrom.sizes")
+    if chrom_sizes.exists():
+        return chrom_sizes
+    with fai.open() as f_in, chrom_sizes.open("w") as out:
+        for line in f_in:
+            chrom, size = line.split()[:2]
+            out.write(f"{chrom}\t{size}\n")
+    return chrom_sizes
+
+
+def _bed_to_bigwig(
+    fasta: str,
+    bed: str,
+    *,
+    bedtools_backend: str | None = "auto",
+    bigwig_backend: str | None = "auto",
+) -> str:
     """
     BED → bedGraph → bigWig
     Requires:
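The `_resolve_backend` helper added here centralizes the backend choice: an explicit `python` or `cli` request is validated and honored, while `auto` prefers the CLI tool when its binary is on PATH and falls back to the Python package otherwise. A minimal sketch of the expected behavior, assuming pybedtools imports cleanly but no bedtools binary is installed:

    # Illustrative only; the shutil.which() results below are assumed.
    _resolve_backend("auto", tool="bedtools", python_available=True, cli_name="bedtools")
    # -> "python"   (no CLI on PATH, Python package available)
    _resolve_backend("cli", tool="bedtools", python_available=True, cli_name="bedtools")
    # raises RuntimeError: bedtools_backend=cli requires bedtools in PATH.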
@@ -28,40 +139,70 @@ def _bed_to_bigwig(fasta: str, bed: str) -> str:
     fa = Path(fasta)  # path to .fa
     parent = bed.parent
     stem = bed.stem
-    fa_stem = fa.stem
-    fai = parent / f"{fa_stem}.fai"
+    chrom_sizes = _ensure_chrom_sizes(fa)
 
     bedgraph = parent / f"{stem}.bedgraph"
     bigwig = parent / f"{stem}.bw"
 
     # 1) Compute coverage → bedGraph
-    logger.debug(f"[pybedtools] generating coverage bedgraph from {bed}")
-    bt = pybedtools.BedTool(str(bed))
-    # bedtools genomecov -bg
-    coverage = bt.genome_coverage(bg=True, genome=str(fai))
-    coverage.saveas(str(bedgraph))
+    bedtools_choice = _resolve_backend(
+        bedtools_backend,
+        tool="bedtools",
+        python_available=pybedtools is not None,
+        cli_name="bedtools",
+    )
+    if bedtools_choice == "python":
+        logger.debug(f"[pybedtools] generating coverage bedgraph from {bed}")
+        pybedtools_mod = _require_pybedtools()
+        bt = pybedtools_mod.BedTool(str(bed))
+        # bedtools genomecov -bg
+        coverage = bt.genome_coverage(bg=True, genome=str(chrom_sizes))
+        coverage.saveas(str(bedgraph))
+    else:
+        if not shutil.which("bedtools"):
+            raise RuntimeError("bedtools is required but not available in PATH.")
+        cmd = [
+            "bedtools",
+            "genomecov",
+            "-i",
+            str(bed),
+            "-g",
+            str(chrom_sizes),
+            "-bg",
+        ]
+        logger.debug("[bedtools] generating coverage bedgraph: %s", " ".join(cmd))
+        with bedgraph.open("w") as out:
+            cp = subprocess.run(cmd, stdout=out, stderr=subprocess.PIPE, text=True)
+        if cp.returncode != 0:
+            raise RuntimeError(f"bedtools genomecov failed (exit {cp.returncode}):\n{cp.stderr}")
 
     # 2) Convert bedGraph → BigWig via pyBigWig
-    logger.debug(f"[pyBigWig] converting bedgraph → bigwig: {bigwig}")
-
-    # read chrom sizes from the FASTA .fai index
-    chrom_sizes = {}
-    with open(fai) as f:
-        for line in f:
-            fields = line.strip().split("\t")
-            chrom = fields[0]
-            size = int(fields[1])
-            chrom_sizes[chrom] = size
-
-    bw = pyBigWig.open(str(bigwig), "w")
-    bw.addHeader(list(chrom_sizes.items()))
-
-    with open(bedgraph) as f:
-        for line in f:
-            chrom, start, end, coverage = line.strip().split()
-            bw.addEntries(chrom, int(start), ends=int(end), values=float(coverage))
-
-    bw.close()
+    bigwig_choice = _resolve_backend(
+        bigwig_backend,
+        tool="bigwig",
+        python_available=pyBigWig is not None,
+        cli_name="bedGraphToBigWig",
+    )
+    if bigwig_choice == "python":
+        logger.debug(f"[pyBigWig] converting bedgraph → bigwig: {bigwig}")
+        pybigwig_mod = _require_pybigwig()
+        bw = pybigwig_mod.open(str(bigwig), "w")
+        bw.addHeader(_read_chrom_sizes(chrom_sizes))
+
+        with bedgraph.open() as f:
+            for line in f:
+                chrom, start, end, coverage = line.strip().split()
+                bw.addEntries(chrom, int(start), ends=int(end), values=float(coverage))
+
+        bw.close()
+    else:
+        if not shutil.which("bedGraphToBigWig"):
+            raise RuntimeError("bedGraphToBigWig is required but not available in PATH.")
+        cmd = ["bedGraphToBigWig", str(bedgraph), str(chrom_sizes), str(bigwig)]
+        logger.debug("[bedGraphToBigWig] converting bedgraph → bigwig: %s", " ".join(cmd))
+        cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
+        if cp.returncode != 0:
+            raise RuntimeError(f"bedGraphToBigWig failed (exit {cp.returncode}):\n{cp.stderr}")
 
     logger.debug(f"BigWig written: {bigwig}")
     return str(bigwig)
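When both backends resolve to the CLI, the new `_bed_to_bigwig` path is equivalent to the classic two-step pipeline below (file names are placeholders; `bedtools` and UCSC `bedGraphToBigWig` must be on PATH):

    # Sketch of the CLI branch as a standalone script; not smftools code.
    import subprocess

    # bedtools genomecov -bg computes per-base coverage from the BED.
    with open("reads.bedgraph", "w") as out:
        subprocess.run(
            ["bedtools", "genomecov", "-i", "reads.bed", "-g", "ref.chrom.sizes", "-bg"],
            stdout=out,
            check=True,
        )
    # bedGraphToBigWig compresses the bedGraph into an indexed BigWig.
    subprocess.run(
        ["bedGraphToBigWig", "reads.bedgraph", "ref.chrom.sizes", "reads.bw"],
        check=True,
    )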
@@ -113,6 +254,8 @@ def _plot_bed_histograms(
     coordinate_mode : {"one_based","zero_based"}
         One-based, inclusive (your file) vs BED-standard zero-based, half-open.
     """
+    plt = require("matplotlib.pyplot", extra="plotting", purpose="plotting BED histograms")
+
     os.makedirs(plotting_directory, exist_ok=True)
 
     bed_basename = os.path.basename(bed_file).rsplit(".bed", 1)[0]
@@ -167,7 +310,8 @@ def _plot_bed_histograms(
         return np.clip(x, lo, hi)
 
     # Load chromosome order/lengths from FASTA
-    with pysam.FastaFile(fasta) as fa:
+    pysam_mod = _require_pysam()
+    with pysam_mod.FastaFile(fasta) as fa:
         ref_names = list(fa.references)
         ref_lengths = dict(zip(ref_names, fa.lengths))
 
@@ -292,7 +436,17 @@ def _plot_bed_histograms(
     logger.debug("[plot_bed_histograms] Done.")
 
 
-def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
+def aligned_BAM_to_bed(
+    aligned_BAM,
+    out_dir,
+    fasta,
+    make_bigwigs,
+    threads=None,
+    *,
+    samtools_backend: str | None = "auto",
+    bedtools_backend: str | None = "auto",
+    bigwig_backend: str | None = "auto",
+):
     """
     Takes an aligned BAM as input and writes a BED file of reads as output.
     Bed columns are: Record name, start position, end position, read length, read name, mapping quality, read quality.
@@ -318,31 +472,79 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
 
     logger.debug(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")
 
-    with pysam.AlignmentFile(aligned_BAM, "rb") as bam, open(bed_output, "w") as out:
-        for read in bam.fetch(until_eof=True):
-            if read.is_unmapped:
-                chrom = "*"
-                start1 = 1
-                rl = read.query_length or 0
-                mapq = 0
-            else:
-                chrom = bam.get_reference_name(read.reference_id)
-                # pysam reference_start is 0-based → +1 for 1-based SAM-like start
-                start1 = int(read.reference_start) + 1
-                rl = read.query_length or 0
-                mapq = int(read.mapping_quality)
-
-            # End position in 1-based inclusive coords
-            end1 = start1 + (rl or 0) - 1
-
-            qname = read.query_name
-            quals = read.query_qualities
-            if quals is None or rl == 0:
-                avg_q = float("nan")
-            else:
-                avg_q = float(np.mean(quals))
-
-            out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
+    backend_choice = _resolve_backend(
+        samtools_backend,
+        tool="samtools",
+        python_available=pysam is not None,
+        cli_name="samtools",
+    )
+    with open(bed_output, "w") as out:
+        if backend_choice == "python":
+            pysam_mod = _require_pysam()
+            with pysam_mod.AlignmentFile(aligned_BAM, "rb") as bam:
+                for read in bam.fetch(until_eof=True):
+                    if read.is_unmapped:
+                        chrom = "*"
+                        start1 = 1
+                        rl = read.query_length or 0
+                        mapq = 0
+                    else:
+                        chrom = bam.get_reference_name(read.reference_id)
+                        # pysam reference_start is 0-based → +1 for 1-based SAM-like start
+                        start1 = int(read.reference_start) + 1
+                        rl = read.query_length or 0
+                        mapq = int(read.mapping_quality)
+
+                    # End position in 1-based inclusive coords
+                    end1 = start1 + (rl or 0) - 1
+
+                    qname = read.query_name
+                    quals = read.query_qualities
+                    if quals is None or rl == 0:
+                        avg_q = float("nan")
+                    else:
+                        avg_q = float(np.mean(quals))
+
+                    out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
+        else:
+            samtools_view = subprocess.Popen(
+                ["samtools", "view", str(aligned_BAM)],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+            )
+            assert samtools_view.stdout is not None
+            for line in samtools_view.stdout:
+                if not line.strip():
+                    continue
+                fields = line.rstrip("\n").split("\t")
+                if len(fields) < 11:
+                    continue
+                qname = fields[0]
+                flag = int(fields[1])
+                chrom = fields[2]
+                pos = int(fields[3])
+                mapq = int(fields[4])
+                seq = fields[9]
+                qual = fields[10]
+                rl = 0 if seq == "*" else len(seq)
+                is_unmapped = bool(flag & 0x4) or chrom == "*"
+                if is_unmapped:
+                    chrom = "*"
+                    start1 = 1
+                    mapq = 0
+                else:
+                    start1 = pos
+                end1 = start1 + (rl or 0) - 1
+                if qual == "*" or rl == 0:
+                    avg_q = float("nan")
+                else:
+                    avg_q = float(np.mean([ord(ch) - 33 for ch in qual]))
+                out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
+            rc = samtools_view.wait()
+            if rc != 0:
+                stderr = samtools_view.stderr.read() if samtools_view.stderr else ""
+                raise RuntimeError(f"samtools view failed (exit {rc}):\n{stderr}")
 
     logger.debug(f"BED-like file created: {bed_output}")
 
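The CLI fallback above parses raw `samtools view` output, so base qualities arrive as a QUAL string in which each character encodes Phred score + 33. A worked example of the decoding it applies:

    # Phred+33 decoding, as in the fallback branch; the QUAL string is hypothetical.
    qual = "II5+"
    scores = [ord(ch) - 33 for ch in qual]   # [40, 40, 20, 10]
    avg_q = sum(scores) / len(scores)        # 27.5, written to the BED as 27.500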
@@ -368,7 +570,15 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
         futures = []
         futures.append(executor.submit(_plot_bed_histograms, aligned_bed, plotting_dir, fasta))
         if make_bigwigs:
-            futures.append(executor.submit(_bed_to_bigwig, fasta, aligned_bed))
+            futures.append(
+                executor.submit(
+                    _bed_to_bigwig,
+                    fasta,
+                    aligned_bed,
+                    bedtools_backend=bedtools_backend,
+                    bigwig_backend=bigwig_backend,
+                )
+            )
         concurrent.futures.wait(futures)
 
     logger.debug("Processing completed successfully.")
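With the keyword-only parameters threaded through above, callers can now pin each tool explicitly. A hypothetical invocation forcing the CLI tools end to end (paths are placeholders):

    aligned_BAM_to_bed(
        "sample.aligned.bam",
        "out_dir",
        "ref.fa",
        make_bigwigs=True,
        samtools_backend="cli",
        bedtools_backend="cli",
        bigwig_backend="cli",
    )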
smftools/informatics/binarize_converted_base_identities.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 def binarize_converted_base_identities(
     base_identities,
     strand,
smftools/informatics/complement_base_list.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 # complement_base_list
 
 
smftools/informatics/converted_BAM_to_adata.py
@@ -1,18 +1,20 @@
+from __future__ import annotations
+
 import gc
-import multiprocessing
+import logging
 import shutil
 import time
 import traceback
 from multiprocessing import Manager, Pool, current_process
 from pathlib import Path
-from typing import Iterable, Optional, Union
+from typing import TYPE_CHECKING, Iterable, Optional, Union
 
 import anndata as ad
 import numpy as np
 import pandas as pd
-import torch
 
-from smftools.logging_utils import get_logger
+from smftools.logging_utils import get_logger, setup_logging
+from smftools.optional_imports import require
 
 from ..readwrite import make_dirs
 from .bam_functions import count_aligned_reads, extract_base_identities
@@ -22,8 +24,10 @@ from .ohe import ohe_batching
 
 logger = get_logger(__name__)
 
-if __name__ == "__main__":
-    multiprocessing.set_start_method("forkserver", force=True)
+if TYPE_CHECKING:
+    import torch
+
+torch = require("torch", extra="torch", purpose="converted BAM processing")
 
 
 def converted_BAM_to_adata(
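The module now pulls in torch through `smftools.optional_imports.require` instead of a hard `import torch`, with a `TYPE_CHECKING` import keeping annotations resolvable. A generic sketch of this optional-import shape; the toy `require` below is a stand-in and may differ from smftools' real helper:

    import importlib

    def require(module: str, *, extra: str, purpose: str):
        # Import lazily; on failure, raise with a message naming the optional extra.
        try:
            return importlib.import_module(module)
        except ImportError as exc:
            raise ImportError(f"{module} is required for {purpose} (extra: {extra}).") from exc

    np = require("numpy", extra="numpy", purpose="array math")  # succeeds if installed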
@@ -40,6 +44,7 @@ def converted_BAM_to_adata(
     deaminase_footprinting: bool = False,
     delete_intermediates: bool = True,
     double_barcoded_path: Path | None = None,
+    samtools_backend: str | None = "auto",
 ) -> tuple[ad.AnnData | None, Path]:
     """Convert BAM files into an AnnData object by binarizing modified base identities.
 
@@ -89,7 +94,9 @@ def converted_BAM_to_adata(
     )
 
     bam_path_list = bam_files
-    logger.info(f"Found {len(bam_files)} BAM files: {bam_files}")
+
+    bam_names = [bam.name for bam in bam_files]
+    logger.info(f"Found {len(bam_files)} BAM files within {split_dir}: {bam_names}")
 
     ## Process Conversion Sites
     max_reference_length, record_FASTA_dict, chromosome_FASTA_dict = process_conversion_sites(
@@ -98,7 +105,7 @@ def converted_BAM_to_adata(
     )
 
     ## Filter BAM Files by Mapping Threshold
     records_to_analyze = filter_bams_by_mapping_threshold(
-        bam_path_list, bam_files, mapping_threshold
+        bam_path_list, bam_files, mapping_threshold, samtools_backend
     )
     ## Process BAMs in Parallel
@@ -113,6 +120,7 @@ def converted_BAM_to_adata(
         max_reference_length,
         device,
         deaminase_footprinting,
+        samtools_backend,
     )
 
     final_adata.uns["References"] = {}
@@ -240,14 +248,14 @@ def converted_BAM_to_adata(
     return max_reference_length, record_FASTA_dict, chromosome_FASTA_dict
 
 
-def filter_bams_by_mapping_threshold(bam_path_list, bam_files, mapping_threshold):
+def filter_bams_by_mapping_threshold(bam_path_list, bam_files, mapping_threshold, samtools_backend):
     """Filters BAM files based on mapping threshold."""
     records_to_analyze = set()
 
     for i, bam in enumerate(bam_path_list):
-        aligned_reads, unaligned_reads, record_counts = count_aligned_reads(bam)
+        aligned_reads, unaligned_reads, record_counts = count_aligned_reads(bam, samtools_backend)
         aligned_percent = aligned_reads * 100 / (aligned_reads + unaligned_reads)
-        print(f"{aligned_percent:.2f}% of reads in {bam_files[i]} aligned successfully.")
+        logger.info(f"{aligned_percent:.2f}% of reads in {bam_files[i].name} aligned successfully.")
 
         for record, (count, percent) in record_counts.items():
             if percent >= mapping_threshold:
@@ -267,6 +275,7 @@ def process_single_bam(
     max_reference_length,
     device,
     deaminase_footprinting,
+    samtools_backend,
 ):
     """Worker function to process a single BAM file (must be at top-level for multiprocessing)."""
     adata_list = []
@@ -281,7 +290,7 @@ def process_single_bam(
     # Extract Base Identities
     fwd_bases, rev_bases, mismatch_counts_per_read, mismatch_trend_per_read = (
         extract_base_identities(
-            bam, record, range(current_length), max_reference_length, sequence
+            bam, record, range(current_length), max_reference_length, sequence, samtools_backend
         )
     )
     mismatch_trend_series = pd.Series(mismatch_trend_per_read)
@@ -433,9 +442,13 @@ def worker_function(
     max_reference_length,
     device,
     deaminase_footprinting,
+    samtools_backend,
     progress_queue,
+    log_level,
+    log_file,
 ):
     """Worker function that processes a single BAM and writes the output to an H5AD file."""
+    _ensure_worker_logging(log_level, log_file)
     worker_id = current_process().pid  # Get worker process ID
     sample = bam.stem
 
@@ -471,6 +484,7 @@ def worker_function(
         max_reference_length,
         device,
         deaminase_footprinting,
+        samtools_backend,
     )
 
     if adata is not None:
@@ -501,19 +515,13 @@ def process_bams_parallel(
     max_reference_length,
     device,
     deaminase_footprinting,
+    samtools_backend,
 ):
     """Processes BAM files in parallel, writes each H5AD to disk, and concatenates them at the end."""
     make_dirs(h5_dir)  # Ensure h5_dir exists
 
     logger.info(f"Starting parallel BAM processing with {num_threads} threads...")
-
-    # Ensure macOS uses forkserver to avoid spawning issues
-    try:
-        import multiprocessing
-
-        multiprocessing.set_start_method("forkserver", force=True)
-    except RuntimeError:
-        logger.warning(f"Multiprocessing context already set. Skipping set_start_method.")
+    log_level, log_file = _get_logger_config()
 
     with Manager() as manager:
         progress_queue = manager.Queue()
@@ -534,13 +542,16 @@ def process_bams_parallel(
                     max_reference_length,
                     device,
                     deaminase_footprinting,
+                    samtools_backend,
                     progress_queue,
+                    log_level,
+                    log_file,
                 ),
             )
             for i, bam in enumerate(bam_path_list)
         ]
 
-        logger.info(f"Submitted {len(bam_path_list)} BAMs for processing.")
+        logger.info(f"Submitting {len(results)} BAMs for processing.")
 
         # Track completed BAMs
         completed_bams = set()
@@ -550,15 +561,18 @@ def process_bams_parallel(
                 completed_bams.add(processed_bam)
             except Exception as e:
                 logger.error(f"Timeout waiting for worker process. Possible crash? {e}")
+                _log_async_result_errors(results, bam_path_list)
 
         pool.close()
         pool.join()  # Ensure all workers finish
 
+        _log_async_result_errors(results, bam_path_list)
+
        # Final Concatenation Step
        h5ad_files = [f for f in h5_dir.iterdir() if f.suffix == ".h5ad"]
 
        if not h5ad_files:
-            logger.debug(f"No valid H5AD files generated. Exiting.")
+            logger.warning(f"No valid H5AD files generated. Exiting.")
            return None
 
        logger.info(f"Concatenating {len(h5ad_files)} H5AD files into final output...")
@@ -568,6 +582,36 @@ def process_bams_parallel(
         return final_adata
 
 
+def _log_async_result_errors(results, bam_path_list):
+    """Log worker failures captured by multiprocessing AsyncResult objects."""
+    for bam, result in zip(bam_path_list, results):
+        if not result.ready():
+            continue
+        try:
+            result.get()
+        except Exception as exc:
+            logger.error("Worker process failed for %s: %s", bam, exc)
+
+
+def _get_logger_config() -> tuple[int, Path | None]:
+    smftools_logger = logging.getLogger("smftools")
+    level = smftools_logger.level
+    if level == logging.NOTSET:
+        level = logging.INFO
+    log_file: Path | None = None
+    for handler in smftools_logger.handlers:
+        if isinstance(handler, logging.FileHandler):
+            log_file = Path(handler.baseFilename)
+            break
+    return level, log_file
+
+
+def _ensure_worker_logging(log_level: int, log_file: Path | None) -> None:
+    smftools_logger = logging.getLogger("smftools")
+    if not smftools_logger.handlers:
+        setup_logging(level=log_level, log_file=log_file)
+
+
 def delete_intermediate_h5ads_and_tmpdir(
     h5_dir: Union[str, Path, Iterable[str], None],
     tmp_dir: Optional[Union[str, Path]] = None,
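The three helpers appended above address logging in worker processes: children started under spawn or forkserver inherit no handlers, so `process_bams_parallel` captures the parent's level and log file and each worker re-initializes logging before doing work, while `_log_async_result_errors` surfaces exceptions held by the pool's `AsyncResult` objects. A self-contained sketch of the worker-logging pattern, independent of smftools:

    import logging
    import multiprocessing as mp

    def _worker(log_level: int) -> None:
        log = logging.getLogger("demo")
        if not log.handlers:                      # spawned children start with no handlers
            logging.basicConfig(level=log_level)  # re-initialize logging in the child
        log.log(log_level, "worker %s is logging", mp.current_process().name)

    if __name__ == "__main__":
        logging.basicConfig(level=logging.INFO)
        ctx = mp.get_context("spawn")             # handlers are not inherited under spawn
        with ctx.Pool(2) as pool:
            pool.map(_worker, [logging.INFO, logging.INFO])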