smftools 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133) hide show
  1. smftools/__init__.py +6 -8
  2. smftools/_settings.py +4 -6
  3. smftools/_version.py +1 -1
  4. smftools/cli/helpers.py +7 -1
  5. smftools/cli/hmm_adata.py +902 -244
  6. smftools/cli/load_adata.py +318 -198
  7. smftools/cli/preprocess_adata.py +285 -171
  8. smftools/cli/spatial_adata.py +137 -53
  9. smftools/cli_entry.py +94 -178
  10. smftools/config/__init__.py +1 -1
  11. smftools/config/conversion.yaml +5 -1
  12. smftools/config/deaminase.yaml +1 -1
  13. smftools/config/default.yaml +22 -17
  14. smftools/config/direct.yaml +8 -3
  15. smftools/config/discover_input_files.py +19 -5
  16. smftools/config/experiment_config.py +505 -276
  17. smftools/constants.py +37 -0
  18. smftools/datasets/__init__.py +2 -8
  19. smftools/datasets/datasets.py +32 -18
  20. smftools/hmm/HMM.py +2125 -1426
  21. smftools/hmm/__init__.py +2 -3
  22. smftools/hmm/archived/call_hmm_peaks.py +16 -1
  23. smftools/hmm/call_hmm_peaks.py +173 -193
  24. smftools/hmm/display_hmm.py +19 -6
  25. smftools/hmm/hmm_readwrite.py +13 -4
  26. smftools/hmm/nucleosome_hmm_refinement.py +102 -14
  27. smftools/informatics/__init__.py +30 -7
  28. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  30. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  31. smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
  32. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
  33. smftools/informatics/archived/print_bam_query_seq.py +7 -1
  34. smftools/informatics/bam_functions.py +379 -156
  35. smftools/informatics/basecalling.py +51 -9
  36. smftools/informatics/bed_functions.py +90 -57
  37. smftools/informatics/binarize_converted_base_identities.py +18 -7
  38. smftools/informatics/complement_base_list.py +7 -6
  39. smftools/informatics/converted_BAM_to_adata.py +265 -122
  40. smftools/informatics/fasta_functions.py +161 -83
  41. smftools/informatics/h5ad_functions.py +195 -29
  42. smftools/informatics/modkit_extract_to_adata.py +609 -270
  43. smftools/informatics/modkit_functions.py +85 -44
  44. smftools/informatics/ohe.py +44 -21
  45. smftools/informatics/pod5_functions.py +112 -73
  46. smftools/informatics/run_multiqc.py +20 -14
  47. smftools/logging_utils.py +51 -0
  48. smftools/machine_learning/__init__.py +2 -7
  49. smftools/machine_learning/data/anndata_data_module.py +143 -50
  50. smftools/machine_learning/data/preprocessing.py +2 -1
  51. smftools/machine_learning/evaluation/__init__.py +1 -1
  52. smftools/machine_learning/evaluation/eval_utils.py +11 -14
  53. smftools/machine_learning/evaluation/evaluators.py +46 -33
  54. smftools/machine_learning/inference/__init__.py +1 -1
  55. smftools/machine_learning/inference/inference_utils.py +7 -4
  56. smftools/machine_learning/inference/lightning_inference.py +9 -13
  57. smftools/machine_learning/inference/sklearn_inference.py +6 -8
  58. smftools/machine_learning/inference/sliding_window_inference.py +35 -25
  59. smftools/machine_learning/models/__init__.py +10 -5
  60. smftools/machine_learning/models/base.py +28 -42
  61. smftools/machine_learning/models/cnn.py +15 -11
  62. smftools/machine_learning/models/lightning_base.py +71 -40
  63. smftools/machine_learning/models/mlp.py +13 -4
  64. smftools/machine_learning/models/positional.py +3 -2
  65. smftools/machine_learning/models/rnn.py +3 -2
  66. smftools/machine_learning/models/sklearn_models.py +39 -22
  67. smftools/machine_learning/models/transformer.py +68 -53
  68. smftools/machine_learning/models/wrappers.py +2 -1
  69. smftools/machine_learning/training/__init__.py +2 -2
  70. smftools/machine_learning/training/train_lightning_model.py +29 -20
  71. smftools/machine_learning/training/train_sklearn_model.py +9 -15
  72. smftools/machine_learning/utils/__init__.py +1 -1
  73. smftools/machine_learning/utils/device.py +7 -4
  74. smftools/machine_learning/utils/grl.py +3 -1
  75. smftools/metadata.py +443 -0
  76. smftools/plotting/__init__.py +19 -5
  77. smftools/plotting/autocorrelation_plotting.py +145 -44
  78. smftools/plotting/classifiers.py +162 -72
  79. smftools/plotting/general_plotting.py +347 -168
  80. smftools/plotting/hmm_plotting.py +42 -13
  81. smftools/plotting/position_stats.py +145 -85
  82. smftools/plotting/qc_plotting.py +20 -12
  83. smftools/preprocessing/__init__.py +8 -8
  84. smftools/preprocessing/append_base_context.py +105 -79
  85. smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
  86. smftools/preprocessing/{archives → archived}/calculate_complexity.py +3 -1
  87. smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
  88. smftools/preprocessing/binarize.py +21 -4
  89. smftools/preprocessing/binarize_on_Youden.py +127 -31
  90. smftools/preprocessing/binary_layers_to_ohe.py +17 -11
  91. smftools/preprocessing/calculate_complexity_II.py +86 -59
  92. smftools/preprocessing/calculate_consensus.py +28 -19
  93. smftools/preprocessing/calculate_coverage.py +44 -22
  94. smftools/preprocessing/calculate_pairwise_differences.py +2 -1
  95. smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
  96. smftools/preprocessing/calculate_position_Youden.py +103 -55
  97. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  98. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  99. smftools/preprocessing/clean_NaN.py +38 -28
  100. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  101. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +70 -37
  102. smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
  103. smftools/preprocessing/flag_duplicate_reads.py +688 -271
  104. smftools/preprocessing/invert_adata.py +26 -11
  105. smftools/preprocessing/load_sample_sheet.py +40 -22
  106. smftools/preprocessing/make_dirs.py +8 -3
  107. smftools/preprocessing/min_non_diagonal.py +2 -1
  108. smftools/preprocessing/recipes.py +56 -23
  109. smftools/preprocessing/reindex_references_adata.py +93 -27
  110. smftools/preprocessing/subsample_adata.py +33 -16
  111. smftools/readwrite.py +264 -109
  112. smftools/schema/__init__.py +11 -0
  113. smftools/schema/anndata_schema_v1.yaml +227 -0
  114. smftools/tools/__init__.py +3 -4
  115. smftools/tools/archived/classifiers.py +163 -0
  116. smftools/tools/archived/subset_adata_v1.py +10 -1
  117. smftools/tools/archived/subset_adata_v2.py +12 -1
  118. smftools/tools/calculate_umap.py +54 -15
  119. smftools/tools/cluster_adata_on_methylation.py +115 -46
  120. smftools/tools/general_tools.py +70 -25
  121. smftools/tools/position_stats.py +229 -98
  122. smftools/tools/read_stats.py +50 -29
  123. smftools/tools/spatial_autocorrelation.py +365 -192
  124. smftools/tools/subset_adata.py +23 -21
  125. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/METADATA +15 -43
  126. smftools-0.2.5.dist-info/RECORD +181 -0
  127. smftools-0.2.4.dist-info/RECORD +0 -176
  128. /smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +0 -0
  129. /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
  130. /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
  131. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
  132. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
  133. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,24 +1,55 @@
1
1
  from __future__ import annotations
2
2
 
3
- from pathlib import Path
3
+ import glob
4
4
  import os
5
+ import re
5
6
  import subprocess
6
- import glob
7
7
  import time
8
- from typing import Dict, List, Any, Tuple, Union, Optional, Iterable
9
- import re
8
+ from collections import Counter, defaultdict, deque
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
10
  from itertools import zip_longest
11
- import pysam
11
+ from pathlib import Path
12
+ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
12
13
 
13
14
  import numpy as np
14
- import concurrent.futures
15
- from concurrent.futures import ThreadPoolExecutor, as_completed
16
- from concurrent.futures import ProcessPoolExecutor
17
-
15
+ import pysam
18
16
  from tqdm import tqdm
19
- from collections import defaultdict, Counter
20
17
 
21
- from ..readwrite import make_dirs, time_string, date_string
18
+ from smftools.logging_utils import get_logger
19
+
20
+ from ..readwrite import date_string, time_string
21
+
22
+ logger = get_logger(__name__)
23
+
24
+ _PROGRESS_RE = re.compile(r"Output records written:\s*(\d+)")
25
+ _EMPTY_RE = re.compile(r"^\s*$")
26
+
27
+
28
+ def _stream_dorado_logs(stderr_iter) -> None:
29
+ """Stream dorado stderr and emit structured log messages.
30
+
31
+ Args:
32
+ stderr_iter: Iterable of stderr lines.
33
+ """
34
+ last_n: int | None = None
35
+
36
+ for raw in stderr_iter:
37
+ line = raw.rstrip("\n")
38
+ if _EMPTY_RE.match(line):
39
+ continue
40
+
41
+ m = _PROGRESS_RE.search(line)
42
+ if m:
43
+ n = int(m.group(1))
44
+ logger.debug("[dorado] Output records written: %d", n)
45
+ last_n = n
46
+ continue
47
+
48
+ logger.info("[dorado] %s", line)
49
+
50
+ if last_n is not None:
51
+ logger.info("[dorado] Final output records written: %d", last_n)
52
+
22
53
 
23
54
  def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str, Path]) -> None:
24
55
  """
@@ -26,7 +57,13 @@ def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str,
26
57
  """
27
58
  bam_path = str(bam_path)
28
59
  fastq_path = str(fastq_path)
29
- with pysam.AlignmentFile(bam_path, "rb", check_sq=False) as bam, open(fastq_path, "w", encoding="utf-8") as fq:
60
+
61
+ logger.debug(f"Converting BAM to FASTQ using _bam_to_fastq_with_pysam")
62
+
63
+ with (
64
+ pysam.AlignmentFile(bam_path, "rb", check_sq=False) as bam,
65
+ open(fastq_path, "w", encoding="utf-8") as fq,
66
+ ):
30
67
  for r in bam.fetch(until_eof=True):
31
68
  # Optionally skip secondary/supplementary:
32
69
  # if r.is_secondary or r.is_supplementary:
@@ -45,14 +82,22 @@ def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str,
45
82
  # q is an array/list of ints (Phred scores).
46
83
  # Convert to FASTQ string with Phred+33 encoding,
47
84
  # clamping to sane range [0, 93] to stay in printable ASCII.
48
- qual_str = "".join(
49
- chr(min(max(int(qv), 0), 93) + 33)
50
- for qv in q
51
- )
85
+ qual_str = "".join(chr(min(max(int(qv), 0), 93) + 33) for qv in q)
52
86
 
53
87
  fq.write(f"@{name}\n{seq}\n+\n{qual_str}\n")
54
88
 
55
- def _sort_bam_with_pysam(in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None) -> None:
89
+
90
+ def _sort_bam_with_pysam(
91
+ in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None
92
+ ) -> None:
93
+ """Sort a BAM file using pysam.
94
+
95
+ Args:
96
+ in_bam: Input BAM path.
97
+ out_bam: Output BAM path.
98
+ threads: Optional thread count.
99
+ """
100
+ logger.debug(f"Sorting BAM using _sort_bam_with_pysam")
56
101
  in_bam, out_bam = str(in_bam), str(out_bam)
57
102
  args = []
58
103
  if threads:
@@ -60,21 +105,31 @@ def _sort_bam_with_pysam(in_bam: Union[str, Path], out_bam: Union[str, Path], th
60
105
  args += ["-o", out_bam, in_bam]
61
106
  pysam.sort(*args)
62
107
 
108
+
63
109
  def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
110
+ """Index a BAM file using pysam.
111
+
112
+ Args:
113
+ bam_path: BAM path to index.
114
+ threads: Optional thread count.
115
+ """
64
116
  bam_path = str(bam_path)
117
+ logger.debug(f"Indexing BAM using _index_bam_with_pysam")
65
118
  # pysam.index supports samtools-style args
66
119
  if threads:
67
120
  pysam.index("-@", str(threads), bam_path)
68
121
  else:
69
122
  pysam.index(bam_path)
70
123
 
71
- def align_and_sort_BAM(fasta,
72
- input,
73
- cfg,
124
+
125
+ def align_and_sort_BAM(
126
+ fasta,
127
+ input,
128
+ cfg,
74
129
  ):
75
130
  """
76
131
  A wrapper for running dorado aligner and samtools functions
77
-
132
+
78
133
  Parameters:
79
134
  fasta (str): File path to the reference genome to align to.
80
135
  input (str): File path to the basecalled file to align. Works for .bam and .fastq files
@@ -84,61 +139,95 @@ def align_and_sort_BAM(fasta,
84
139
  None
85
140
  The function writes out files for: 1) An aligned BAM, 2) and aligned_sorted BAM, 3) an index file for the aligned_sorted BAM, 4) A bed file for the aligned_sorted BAM, 5) A text file containing read names in the aligned_sorted BAM
86
141
  """
142
+ logger.debug("Aligning and sorting BAM using align_and_sort_BAM")
87
143
  input_basename = input.name
88
144
  input_suffix = input.suffix
89
- input_as_fastq = input.with_name(input.stem + '.fastq')
145
+ input_as_fastq = input.with_name(input.stem + ".fastq")
90
146
 
91
147
  output_path_minus_suffix = cfg.output_directory / input.stem
92
-
148
+
93
149
  aligned_BAM = output_path_minus_suffix.with_name(output_path_minus_suffix.stem + "_aligned")
94
150
  aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
95
- aligned_sorted_BAM =aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
151
+ aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
96
152
  aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
97
153
 
98
154
  if cfg.threads:
99
155
  threads = str(cfg.threads)
100
156
  else:
101
157
  threads = None
102
-
103
- if cfg.aligner == 'minimap2':
158
+
159
+ if cfg.aligner == "minimap2":
104
160
  if not cfg.align_from_bam:
105
- print(f"Converting BAM to FASTQ: {input}")
161
+ logger.debug(f"Converting BAM to FASTQ: {input}")
106
162
  _bam_to_fastq_with_pysam(input, input_as_fastq)
107
- print(f"Aligning FASTQ to Reference: {input_as_fastq}")
163
+ logger.debug(f"Aligning FASTQ to Reference: {input_as_fastq}")
108
164
  mm_input = input_as_fastq
109
- else:
110
- print(f"Aligning BAM to Reference: {input}")
165
+ else:
166
+ logger.debug(f"Aligning BAM to Reference: {input}")
111
167
  mm_input = input
112
168
 
113
169
  if threads:
114
- minimap_command = ['minimap2'] + cfg.aligner_args + ['-t', threads, str(fasta), str(mm_input)]
170
+ minimap_command = (
171
+ ["minimap2"] + cfg.aligner_args + ["-t", threads, str(fasta), str(mm_input)]
172
+ )
115
173
  else:
116
- minimap_command = ['minimap2'] + cfg.aligner_args + [str(fasta), str(mm_input)]
117
- subprocess.run(minimap_command, stdout=open(aligned_output, "wb"))
174
+ minimap_command = ["minimap2"] + cfg.aligner_args + [str(fasta), str(mm_input)]
175
+
176
+ with open(aligned_output, "wb") as out:
177
+ proc = subprocess.Popen(
178
+ minimap_command,
179
+ stdout=out,
180
+ stderr=subprocess.PIPE,
181
+ text=True,
182
+ )
183
+
184
+ assert proc.stderr is not None
185
+ for line in proc.stderr:
186
+ logger.info("[minimap2] %s", line.rstrip())
187
+
188
+ ret = proc.wait()
189
+ if ret != 0:
190
+ raise RuntimeError(f"minimap2 failed with exit code {ret}")
118
191
 
119
192
  if not cfg.align_from_bam:
120
193
  os.remove(input_as_fastq)
121
194
 
122
- elif cfg.aligner == 'dorado':
195
+ elif cfg.aligner == "dorado":
123
196
  # Run dorado aligner
124
197
  print(f"Aligning BAM to Reference: {input}")
125
198
  if threads:
126
- alignment_command = ["dorado", "aligner", "-t", threads] + cfg.aligner_args + [str(fasta), str(input)]
199
+ alignment_command = (
200
+ ["dorado", "aligner", "-t", threads] + cfg.aligner_args + [str(fasta), str(input)]
201
+ )
127
202
  else:
128
203
  alignment_command = ["dorado", "aligner"] + cfg.aligner_args + [str(fasta), str(input)]
129
- subprocess.run(alignment_command, stdout=open(aligned_output, "wb"))
130
204
 
205
+ with open(aligned_output, "wb") as out:
206
+ proc = subprocess.Popen(
207
+ alignment_command,
208
+ stdout=out,
209
+ stderr=subprocess.PIPE,
210
+ text=True,
211
+ )
212
+
213
+ assert proc.stderr is not None
214
+ _stream_dorado_logs(proc.stderr)
215
+ ret = proc.wait()
216
+
217
+ if ret != 0:
218
+ raise RuntimeError(f"dorado failed with exit code {ret}")
131
219
  else:
132
- print(f'Aligner not recognized: {cfg.aligner}. Choose from minimap2 and dorado')
220
+ logger.error(f"Aligner not recognized: {cfg.aligner}. Choose from minimap2 and dorado")
133
221
  return
134
-
222
+
135
223
  # --- Sort & Index with pysam ---
136
- print(f"[pysam] Sorting: {aligned_output} -> {aligned_sorted_output}")
224
+ logger.debug(f"Sorting: {aligned_output} -> {aligned_sorted_output}")
137
225
  _sort_bam_with_pysam(aligned_output, aligned_sorted_output, threads=threads)
138
226
 
139
- print(f"[pysam] Indexing: {aligned_sorted_output}")
227
+ logger.debug(f"Indexing: {aligned_sorted_output}")
140
228
  _index_bam_with_pysam(aligned_sorted_output, threads=threads)
141
229
 
230
+
142
231
  def bam_qc(
143
232
  bam_files: Iterable[str | Path],
144
233
  bam_qc_dir: str | Path,
@@ -153,133 +242,154 @@ def bam_qc(
153
242
  Prefers pysam; falls back to `samtools` if needed.
154
243
  Runs BAMs in parallel (up to `threads`, default serial).
155
244
  """
156
- import subprocess
157
245
  import shutil
246
+ import subprocess
247
+
248
+ logger.debug("Performing BAM QC using bam_qc")
158
249
 
159
250
  # Try to import pysam once
160
251
  try:
161
- import pysam
162
- HAVE_PYSAM = True
252
+ import pysam # type: ignore
253
+
254
+ have_pysam = True
163
255
  except Exception:
164
- HAVE_PYSAM = False
256
+ pysam = None # type: ignore
257
+ have_pysam = False
165
258
 
166
259
  bam_qc_dir = Path(bam_qc_dir)
167
260
  bam_qc_dir.mkdir(parents=True, exist_ok=True)
168
261
 
169
- bam_files = [Path(b) for b in bam_files]
262
+ bam_paths = [Path(b) for b in bam_files]
170
263
 
171
264
  def _has_index(p: Path) -> bool:
172
- if p.suffix.lower() == ".bam":
173
- bai = p.with_suffix(p.suffix + ".bai")
174
- bai_alt = Path(str(p) + ".bai")
175
- return bai.exists() or bai_alt.exists()
176
- if p.suffix.lower() == ".cram":
177
- crai = Path(str(p) + ".crai")
178
- return crai.exists()
265
+ """Return True if a BAM/CRAM index exists for the path."""
266
+ suf = p.suffix.lower()
267
+ if suf == ".bam":
268
+ return p.with_suffix(p.suffix + ".bai").exists() or Path(str(p) + ".bai").exists()
269
+ if suf == ".cram":
270
+ return Path(str(p) + ".crai").exists()
179
271
  return False
180
272
 
181
273
  def _ensure_index(p: Path) -> None:
274
+ """Ensure a BAM/CRAM index exists, creating one if needed."""
182
275
  if _has_index(p):
183
276
  return
184
- if HAVE_PYSAM:
185
- # pysam.index supports both BAM & CRAM
186
- pysam.index(str(p))
277
+ if have_pysam:
278
+ assert pysam is not None
279
+ pysam.index(str(p)) # supports BAM & CRAM
187
280
  else:
281
+ if not shutil.which("samtools"):
282
+ raise RuntimeError("Neither pysam nor samtools is available in PATH.")
188
283
  cmd = ["samtools", "index", str(p)]
189
- subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
284
+ # capture text so errors are readable; raise on failure
285
+ cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
286
+ if cp.returncode != 0:
287
+ raise RuntimeError(f"samtools index failed (exit {cp.returncode}):\n{cp.stderr}")
288
+
289
+ def _run_samtools_to_file(cmd: list[str], out_path: Path, bam: Path, tag: str) -> int:
290
+ """
291
+ Stream stderr to logger; write stdout to out_path; return rc; raise with stderr tail on failure.
292
+ """
293
+ last_err = deque(maxlen=80)
294
+ out_path.parent.mkdir(parents=True, exist_ok=True)
295
+
296
+ with open(out_path, "w") as fh:
297
+ proc = subprocess.Popen(cmd, stdout=fh, stderr=subprocess.PIPE, text=True)
298
+ assert proc.stderr is not None
299
+ for line in proc.stderr:
300
+ line = line.rstrip()
301
+ if line:
302
+ last_err.append(line)
303
+ logger.info("[%s][%s] %s", tag, bam.name, line)
304
+ rc = proc.wait()
305
+
306
+ if rc != 0:
307
+ tail = "\n".join(last_err)
308
+ raise RuntimeError(f"{tag} failed for {bam} (exit {rc}). Stderr tail:\n{tail}")
309
+ return rc
310
+
311
+ def _run_one(bam: Path) -> tuple[Path, list[tuple[str, int]]]:
312
+ """Run stats/flagstat/idxstats for a single BAM.
313
+
314
+ Args:
315
+ bam: Path to the BAM file.
316
+
317
+ Returns:
318
+ Tuple of (bam_path, list of (stage, return_code)).
319
+ """
320
+ import subprocess
190
321
 
191
- def _run_one(bam: Path) -> Tuple[Path, List[Tuple[str, int]]]:
192
- # outputs + return (file, [(task_name, returncode)])
193
- results: List[Tuple[str, int]] = []
194
- base = bam.stem # filename without .bam
322
+ results: list[tuple[str, int]] = []
323
+ base = bam.stem # e.g. sample.bam -> sample
195
324
  out_stats = bam_qc_dir / f"{base}_stats.txt"
196
325
  out_flag = bam_qc_dir / f"{base}_flagstat.txt"
197
- out_idx = bam_qc_dir / f"{base}_idxstats.txt"
326
+ out_idx = bam_qc_dir / f"{base}_idxstats.txt"
198
327
 
199
- # Make sure index exists (samtools stats/flagstat dont require, idxstats does)
328
+ # Make sure index exists (idxstats requires; stats/flagstat usually don't, but indexing is cheap/useful)
200
329
  try:
201
330
  _ensure_index(bam)
202
331
  except Exception as e:
203
- # Still attempt stats/flagstat if requested
204
- print(f"[warn] Indexing failed for {bam}: {e}")
205
-
206
- # Choose runner per task
207
- def run_stats():
208
- if not stats:
209
- return
210
- if HAVE_PYSAM and hasattr(pysam, "stats"):
332
+ # Still attempt stats/flagstat if requested; idxstats may fail later if index is required.
333
+ logger.warning("Indexing failed for %s: %s", bam, e)
334
+
335
+ if not have_pysam:
336
+ import shutil
337
+
338
+ if not shutil.which("samtools"):
339
+ raise RuntimeError("Neither pysam nor samtools is available in PATH.")
340
+
341
+ # --- stats ---
342
+ if stats:
343
+ if have_pysam and pysam is not None and hasattr(pysam, "stats"):
211
344
  txt = pysam.stats(str(bam))
212
345
  out_stats.write_text(txt)
213
346
  results.append(("stats(pysam)", 0))
214
347
  else:
215
348
  cmd = ["samtools", "stats", str(bam)]
216
- with open(out_stats, "w") as fh:
217
- cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
218
- results.append(("stats(samtools)", cp.returncode))
219
- if cp.returncode != 0:
220
- raise RuntimeError(cp.stderr.decode(errors="replace"))
221
-
222
- def run_flagstat():
223
- if not flagstats:
224
- return
225
- if HAVE_PYSAM and hasattr(pysam, "flagstat"):
349
+ rc = _run_samtools_to_file(cmd, out_stats, bam, "samtools stats")
350
+ results.append(("stats(samtools)", rc))
351
+
352
+ # --- flagstat ---
353
+ if flagstats:
354
+ if have_pysam and pysam is not None and hasattr(pysam, "flagstat"):
226
355
  txt = pysam.flagstat(str(bam))
227
356
  out_flag.write_text(txt)
228
357
  results.append(("flagstat(pysam)", 0))
229
358
  else:
230
359
  cmd = ["samtools", "flagstat", str(bam)]
231
- with open(out_flag, "w") as fh:
232
- cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
233
- results.append(("flagstat(samtools)", cp.returncode))
234
- if cp.returncode != 0:
235
- raise RuntimeError(cp.stderr.decode(errors="replace"))
236
-
237
- def run_idxstats():
238
- if not idxstats:
239
- return
240
- if HAVE_PYSAM and hasattr(pysam, "idxstats"):
360
+ rc = _run_samtools_to_file(cmd, out_flag, bam, "samtools flagstat")
361
+ results.append(("flagstat(samtools)", rc))
362
+
363
+ # --- idxstats ---
364
+ if idxstats:
365
+ if have_pysam and pysam is not None and hasattr(pysam, "idxstats"):
241
366
  txt = pysam.idxstats(str(bam))
242
367
  out_idx.write_text(txt)
243
368
  results.append(("idxstats(pysam)", 0))
244
369
  else:
245
370
  cmd = ["samtools", "idxstats", str(bam)]
246
- with open(out_idx, "w") as fh:
247
- cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
248
- results.append(("idxstats(samtools)", cp.returncode))
249
- if cp.returncode != 0:
250
- raise RuntimeError(cp.stderr.decode(errors="replace"))
251
-
252
- # Sanity: ensure samtools exists if pysam missing
253
- if not HAVE_PYSAM:
254
- if not shutil.which("samtools"):
255
- raise RuntimeError("Neither pysam nor samtools is available in PATH.")
371
+ rc = _run_samtools_to_file(cmd, out_idx, bam, "samtools idxstats")
372
+ results.append(("idxstats(samtools)", rc))
256
373
 
257
- # Execute tasks (serial per file; parallelized across files)
258
- run_stats()
259
- run_flagstat()
260
- run_idxstats()
261
374
  return bam, results
262
375
 
263
- # Parallel across BAMs
264
376
  max_workers = int(threads) if threads and int(threads) > 0 else 1
265
- futures = []
266
- with ThreadPoolExecutor(max_workers=max_workers) as ex:
267
- for b in bam_files:
268
- futures.append(ex.submit(_run_one, b))
269
377
 
270
- for fut in as_completed(futures):
378
+ with ThreadPoolExecutor(max_workers=max_workers) as ex:
379
+ futs = [ex.submit(_run_one, b) for b in bam_paths]
380
+ for fut in as_completed(futs):
271
381
  try:
272
382
  bam, res = fut.result()
273
383
  summary = ", ".join(f"{name}:{rc}" for name, rc in res) or "no-op"
274
- print(f"[qc] {bam.name}: {summary}")
384
+ logger.info("[qc] %s: %s", bam.name, summary)
275
385
  except Exception as e:
276
- print(f"[error] QC failed: {e}")
386
+ logger.exception("QC failed: %s", e)
387
+
388
+ if modality not in {"conversion", "direct", "deaminase"}:
389
+ logger.warning("Unknown modality '%s', continuing.", modality)
277
390
 
278
- # Placeholders to keep your signature stable
279
- if modality not in {"conversion", "direct"}:
280
- print(f"[warn] Unknown modality '{modality}', continuing.")
391
+ logger.info("QC processing completed.")
281
392
 
282
- print("QC processing completed.")
283
393
 
284
394
  def concatenate_fastqs_to_bam(
285
395
  fastq_files: List[Union[str, Tuple[str, str], Path, Tuple[Path, Path]]],
@@ -326,12 +436,29 @@ def concatenate_fastqs_to_bam(
326
436
  """
327
437
  name = p.name
328
438
  lowers = name.lower()
329
- for ext in (".fastq.gz", ".fq.gz", ".fastq.bz2", ".fq.bz2", ".fastq.xz", ".fq.xz", ".fastq", ".fq"):
439
+ for ext in (
440
+ ".fastq.gz",
441
+ ".fq.gz",
442
+ ".fastq.bz2",
443
+ ".fq.bz2",
444
+ ".fastq.xz",
445
+ ".fq.xz",
446
+ ".fastq",
447
+ ".fq",
448
+ ):
330
449
  if lowers.endswith(ext):
331
450
  return name[: -len(ext)]
332
451
  return p.stem # fallback: remove last suffix only
333
452
 
334
453
  def _extract_barcode_from_filename(p: Path) -> str:
454
+ """Extract a barcode token from a FASTQ filename.
455
+
456
+ Args:
457
+ p: FASTQ path.
458
+
459
+ Returns:
460
+ Barcode token string.
461
+ """
335
462
  stem = _strip_fastq_ext(p)
336
463
  if "_" in stem:
337
464
  token = stem.split("_")[-1]
@@ -340,10 +467,18 @@ def concatenate_fastqs_to_bam(
340
467
  return stem
341
468
 
342
469
  def _classify_read_token(stem: str) -> Tuple[Optional[str], Optional[int]]:
470
+ """Classify a FASTQ filename stem into (prefix, read_number).
471
+
472
+ Args:
473
+ stem: Filename stem.
474
+
475
+ Returns:
476
+ Tuple of (prefix, read_number) or (None, None) if not matched.
477
+ """
343
478
  # return (prefix, readnum) if matches; else (None, None)
344
479
  patterns = [
345
- r"(?i)(.*?)[._-]r?([12])$", # prefix_R1 / prefix.r2 / prefix-1
346
- r"(?i)(.*?)[._-]read[_-]?([12])$", # prefix_read1
480
+ r"(?i)(.*?)[._-]r?([12])$", # prefix_R1 / prefix.r2 / prefix-1
481
+ r"(?i)(.*?)[._-]read[_-]?([12])$", # prefix_read1
347
482
  ]
348
483
  for pat in patterns:
349
484
  m = re.match(pat, stem)
@@ -352,6 +487,14 @@ def concatenate_fastqs_to_bam(
352
487
  return None, None
353
488
 
354
489
  def _pair_by_filename(paths: List[Path]) -> Tuple[List[Tuple[Path, Path]], List[Path]]:
490
+ """Pair FASTQ files based on filename conventions.
491
+
492
+ Args:
493
+ paths: FASTQ paths to pair.
494
+
495
+ Returns:
496
+ Tuple of (paired list, leftover list).
497
+ """
355
498
  pref_map: Dict[str, Dict[int, Path]] = {}
356
499
  unpaired: List[Path] = []
357
500
  for pth in paths:
@@ -373,6 +516,14 @@ def concatenate_fastqs_to_bam(
373
516
  return pairs, leftovers
374
517
 
375
518
  def _fastq_iter(p: Path):
519
+ """Yield FASTQ records using pysam.FastxFile.
520
+
521
+ Args:
522
+ p: FASTQ path.
523
+
524
+ Yields:
525
+ Pysam Fastx records.
526
+ """
376
527
  # pysam.FastxFile handles compressed extensions transparently
377
528
  with pysam.FastxFile(str(p)) as fx:
378
529
  for rec in fx:
@@ -386,6 +537,19 @@ def concatenate_fastqs_to_bam(
386
537
  read1: bool,
387
538
  read2: bool,
388
539
  ) -> pysam.AlignedSegment:
540
+ """Construct an unaligned pysam.AlignedSegment.
541
+
542
+ Args:
543
+ name: Read name.
544
+ seq: Read sequence.
545
+ qual: FASTQ quality string.
546
+ bc: Barcode string.
547
+ read1: Whether this is read 1.
548
+ read2: Whether this is read 2.
549
+
550
+ Returns:
551
+ Unaligned pysam.AlignedSegment.
552
+ """
389
553
  a = pysam.AlignedSegment()
390
554
  a.query_name = name
391
555
  a.query_sequence = seq
@@ -408,6 +572,7 @@ def concatenate_fastqs_to_bam(
408
572
 
409
573
  # ---------- normalize inputs to Path ----------
410
574
  def _to_path_pair(x) -> Tuple[Path, Path]:
575
+ """Convert a tuple of path-like objects to Path instances."""
411
576
  a, b = x
412
577
  return Path(a), Path(b)
413
578
 
@@ -450,7 +615,10 @@ def concatenate_fastqs_to_bam(
450
615
  # ---------- BAM header ----------
451
616
  header = {"HD": {"VN": "1.6", "SO": "unknown"}, "SQ": []}
452
617
  if add_read_group:
453
- header["RG"] = [{"ID": bc, **({"SM": rg_sample_field} if rg_sample_field else {})} for bc in barcodes_in_order]
618
+ header["RG"] = [
619
+ {"ID": bc, **({"SM": rg_sample_field} if rg_sample_field else {})}
620
+ for bc in barcodes_in_order
621
+ ]
454
622
  header.setdefault("PG", []).append(
455
623
  {"ID": "concat-fastq", "PN": "concatenate_fastqs_to_bam", "VN": "1"}
456
624
  )
@@ -476,7 +644,9 @@ def concatenate_fastqs_to_bam(
476
644
  it2 = _fastq_iter(r2_path)
477
645
 
478
646
  for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
647
+
479
648
  def _clean(n: Optional[str]) -> Optional[str]:
649
+ """Normalize FASTQ read names by trimming read suffixes."""
480
650
  if n is None:
481
651
  return None
482
652
  return re.sub(r"(?:/1$|/2$|\s[12]$)", "", n)
@@ -489,12 +659,16 @@ def concatenate_fastqs_to_bam(
489
659
  )
490
660
 
491
661
  if rec1 is not None:
492
- a1 = _make_unaligned_segment(name, rec1.sequence, rec1.quality, bc, read1=True, read2=False)
662
+ a1 = _make_unaligned_segment(
663
+ name, rec1.sequence, rec1.quality, bc, read1=True, read2=False
664
+ )
493
665
  bam_out.write(a1)
494
666
  per_file_counts[r1_path] = per_file_counts.get(r1_path, 0) + 1
495
667
  total_written += 1
496
668
  if rec2 is not None:
497
- a2 = _make_unaligned_segment(name, rec2.sequence, rec2.quality, bc, read1=False, read2=True)
669
+ a2 = _make_unaligned_segment(
670
+ name, rec2.sequence, rec2.quality, bc, read1=False, read2=True
671
+ )
498
672
  bam_out.write(a2)
499
673
  per_file_counts[r2_path] = per_file_counts.get(r2_path, 0) + 1
500
674
  total_written += 1
@@ -516,7 +690,9 @@ def concatenate_fastqs_to_bam(
516
690
  raise FileNotFoundError(pth)
517
691
  bc = per_path_barcode.get(pth, "barcode")
518
692
  for rec in _fastq_iter(pth):
519
- a = _make_unaligned_segment(rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False)
693
+ a = _make_unaligned_segment(
694
+ rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False
695
+ )
520
696
  bam_out.write(a)
521
697
  per_file_counts[pth] = per_file_counts.get(pth, 0) + 1
522
698
  total_written += 1
@@ -530,20 +706,21 @@ def concatenate_fastqs_to_bam(
530
706
  "barcodes": barcodes_in_order,
531
707
  }
532
708
 
709
+
533
710
  def count_aligned_reads(bam_file):
534
711
  """
535
712
  Counts the number of aligned reads in a bam file that map to each reference record.
536
-
713
+
537
714
  Parameters:
538
715
  bam_file (str): A string representing the path to an aligned BAM file.
539
-
716
+
540
717
  Returns:
541
718
  aligned_reads_count (int): The total number of reads aligned in the BAM.
542
719
  unaligned_reads_count (int): The total number of reads not aligned in the BAM.
543
720
  record_counts (dict): A dictionary keyed by reference record instance that points to a tuple containing the total reads mapped to the record and the fraction of mapped reads which map to the record.
544
721
 
545
722
  """
546
- print('{0}: Counting aligned reads in BAM > {1}'.format(time_string(), bam_file))
723
+ print("{0}: Counting aligned reads in BAM > {1}".format(time_string(), bam_file))
547
724
  aligned_reads_count = 0
548
725
  unaligned_reads_count = 0
549
726
  # Make a dictionary, keyed by the reference_name of reference chromosome that points to an integer number of read counts mapped to the chromosome, as well as the proportion of mapped reads in that chromosome
@@ -552,12 +729,14 @@ def count_aligned_reads(bam_file):
552
729
  with pysam.AlignmentFile(str(bam_file), "rb") as bam:
553
730
  total_reads = bam.mapped + bam.unmapped
554
731
  # Iterate over reads to get the total mapped read counts and the reads that map to each reference
555
- for read in tqdm(bam, desc='Counting aligned reads in BAM', total=total_reads):
732
+ for read in tqdm(bam, desc="Counting aligned reads in BAM", total=total_reads):
556
733
  if read.is_unmapped:
557
734
  unaligned_reads_count += 1
558
735
  else:
559
736
  aligned_reads_count += 1
560
- record_counts[read.reference_name] += 1 # Automatically increments if key exists, adds if not
737
+ record_counts[read.reference_name] += (
738
+ 1 # Automatically increments if key exists, adds if not
739
+ )
561
740
 
562
741
  # reformat the dictionary to contain read counts mapped to the reference, as well as the proportion of mapped reads in reference
563
742
  for reference in record_counts:
@@ -566,7 +745,10 @@ def count_aligned_reads(bam_file):
566
745
 
567
746
  return aligned_reads_count, unaligned_reads_count, dict(record_counts)
568
747
 
569
- def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, threads):
748
+
749
+ def demux_and_index_BAM(
750
+ aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, threads
751
+ ):
570
752
  """
571
753
  A wrapper function for splitting BAMS and indexing them.
572
754
  Parameters:
@@ -577,11 +759,12 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
577
759
  barcode_both_ends (bool): Whether to require both ends to be barcoded.
578
760
  trim (bool): Whether to trim off barcodes after demultiplexing.
579
761
  threads (int): Number of threads to use.
580
-
762
+
581
763
  Returns:
582
764
  bam_files (list): List of split BAM file path strings
583
765
  Splits an input BAM file on barcode value and makes a BAM index file.
584
766
  """
767
+
585
768
  input_bam = aligned_sorted_BAM.with_suffix(bam_suffix)
586
769
  command = ["dorado", "demux", "--kit-name", barcode_kit]
587
770
  if barcode_both_ends:
@@ -594,25 +777,37 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
594
777
  pass
595
778
  command += ["--emit-summary", "--sort-bam", "--output-dir", str(split_dir)]
596
779
  command.append(str(input_bam))
597
- command_string = ' '.join(command)
598
- print(f"Running: {command_string}")
599
- subprocess.run(command)
780
+ command_string = " ".join(command)
781
+ logger.info("Running dorado demux: %s", " ".join(command))
782
+
783
+ proc = subprocess.Popen(
784
+ command,
785
+ stdout=subprocess.PIPE,
786
+ stderr=subprocess.PIPE,
787
+ text=True,
788
+ )
789
+
790
+ assert proc.stderr is not None
791
+ _stream_dorado_logs(proc.stderr)
792
+ rc = proc.wait()
793
+
794
+ if rc != 0:
795
+ raise RuntimeError(f"dorado demux failed with exit code {rc}")
600
796
 
601
797
  bam_files = sorted(
602
- p for p in split_dir.glob(f"*{bam_suffix}")
603
- if p.is_file() and p.suffix == bam_suffix
798
+ p for p in split_dir.glob(f"*{bam_suffix}") if p.is_file() and p.suffix == bam_suffix
604
799
  )
605
800
 
606
801
  if not bam_files:
607
802
  raise FileNotFoundError(f"No BAM files found in {split_dir} with suffix {bam_suffix}")
608
-
803
+
609
804
  # ---- Optional renaming with prefix ----
610
805
  renamed_bams = []
611
806
  prefix = "de" if barcode_both_ends else "se"
612
807
 
613
808
  for bam in bam_files:
614
809
  bam = Path(bam)
615
- bai = bam.with_suffix(bam_suffix + ".bai") # dorado’s sorting produces .bam.bai
810
+ bai = bam.with_suffix(bam_suffix + ".bai") # dorado’s sorting produces .bam.bai
616
811
 
617
812
  if prefix:
618
813
  new_name = f"{prefix}_{bam.name}"
@@ -628,9 +823,10 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
628
823
  bai.rename(new_bai)
629
824
 
630
825
  renamed_bams.append(new_bam)
631
-
826
+
632
827
  return renamed_bams
633
828
 
829
+
634
830
  def extract_base_identities(bam_file, chromosome, positions, max_reference_length, sequence):
635
831
  """
636
832
  Efficiently extracts base identities from mapped reads with reference coordinates.
@@ -646,14 +842,15 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
646
842
  dict: Base identities from forward mapped reads.
647
843
  dict: Base identities from reverse mapped reads.
648
844
  """
845
+ logger.debug("Extracting nucleotide identities for each read using extract_base_identities")
649
846
  timestamp = time.strftime("[%Y-%m-%d %H:%M:%S]")
650
847
 
651
848
  positions = set(positions)
652
- fwd_base_identities = defaultdict(lambda: np.full(max_reference_length, 'N', dtype='<U1'))
653
- rev_base_identities = defaultdict(lambda: np.full(max_reference_length, 'N', dtype='<U1'))
849
+ fwd_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
850
+ rev_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
654
851
  mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))
655
852
 
656
- #print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
853
+ # print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
657
854
  with pysam.AlignmentFile(str(bam_file), "rb") as bam:
658
855
  total_reads = bam.mapped
659
856
  ref_seq = sequence.upper()
@@ -676,7 +873,7 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
676
873
  base_dict[read_name][reference_position] = read_base
677
874
 
678
875
  # Track mismatches (excluding Ns)
679
- if read_base != ref_base and read_base != 'N' and ref_base != 'N':
876
+ if read_base != ref_base and read_base != "N" and ref_base != "N":
680
877
  mismatch_counts_per_read[read_name][ref_base][read_base] += 1
681
878
 
682
879
  # Determine C→T vs G→A dominance per read
@@ -694,7 +891,13 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
694
891
  else:
695
892
  mismatch_trend_per_read[read_name] = "none"
696
893
 
697
- return dict(fwd_base_identities), dict(rev_base_identities), dict(mismatch_counts_per_read), mismatch_trend_per_read
894
+ return (
895
+ dict(fwd_base_identities),
896
+ dict(rev_base_identities),
897
+ dict(mismatch_counts_per_read),
898
+ mismatch_trend_per_read,
899
+ )
900
+
698
901
 
699
902
  def extract_read_features_from_bam(bam_file_path):
700
903
  """
@@ -705,7 +908,9 @@ def extract_read_features_from_bam(bam_file_path):
705
908
  read_metrics (dict)
706
909
  """
707
910
  # Open the BAM file
708
- print(f'Extracting read features from BAM: {bam_file_path}')
911
+ logger.debug(
912
+ f"Extracting read metrics from BAM using extract_read_features_from_bam: {bam_file_path}"
913
+ )
709
914
  with pysam.AlignmentFile(bam_file_path, "rb") as bam_file:
710
915
  read_metrics = {}
711
916
  reference_lengths = bam_file.lengths # List of lengths for each reference (chromosome)
@@ -722,10 +927,17 @@ def extract_read_features_from_bam(bam_file_path):
722
927
  reference_length = reference_lengths[reference_index]
723
928
  mapped_length = sum(end - start for start, end in read.get_blocks())
724
929
  mapping_quality = read.mapping_quality # Phred-scaled MAPQ
725
- read_metrics[read.query_name] = [read.query_length, median_read_quality, reference_length, mapped_length, mapping_quality]
930
+ read_metrics[read.query_name] = [
931
+ read.query_length,
932
+ median_read_quality,
933
+ reference_length,
934
+ mapped_length,
935
+ mapping_quality,
936
+ ]
726
937
 
727
938
  return read_metrics
728
939
 
940
+
729
941
  def extract_readnames_from_bam(aligned_BAM):
730
942
  """
731
943
  Takes a BAM and writes out a txt file containing read names from the BAM
@@ -738,15 +950,19 @@ def extract_readnames_from_bam(aligned_BAM):
738
950
 
739
951
  """
740
952
  import subprocess
953
+
741
954
  # Make a text file of reads for the BAM
742
- txt_output = aligned_BAM.split('.bam')[0] + '_read_names.txt'
955
+ txt_output = aligned_BAM.split(".bam")[0] + "_read_names.txt"
743
956
  samtools_view = subprocess.Popen(["samtools", "view", aligned_BAM], stdout=subprocess.PIPE)
744
957
  with open(txt_output, "w") as output_file:
745
- cut_process = subprocess.Popen(["cut", "-f1"], stdin=samtools_view.stdout, stdout=output_file)
958
+ cut_process = subprocess.Popen(
959
+ ["cut", "-f1"], stdin=samtools_view.stdout, stdout=output_file
960
+ )
746
961
  samtools_view.stdout.close()
747
962
  cut_process.wait()
748
963
  samtools_view.wait()
749
964
 
965
+
750
966
  def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
751
967
  """
752
968
  Separates an input BAM file on the BC SAM tag values.
@@ -756,11 +972,12 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
756
972
  output_prefix (str): A prefix to append to the output BAM.
757
973
  bam_suffix (str): A suffix to add to the bam file.
758
974
  split_dir (str): String indicating path to directory to split BAMs into
759
-
975
+
760
976
  Returns:
761
977
  None
762
978
  Writes out split BAM files.
763
979
  """
980
+ logger.debug("Demultiplexing BAM based on the BC tag")
764
981
  bam_base = input_bam.name
765
982
  bam_base_minus_suffix = input_bam.stem
766
983
 
@@ -773,19 +990,24 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
773
990
  try:
774
991
  # Get the barcode tag value
775
992
  bc_tag = read.get_tag("BC", with_value_type=True)[0]
776
- #bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
993
+ # bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
777
994
  # Open the output BAM file corresponding to the barcode
778
995
  if bc_tag not in output_files:
779
- output_path = split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
780
- output_files[bc_tag] = pysam.AlignmentFile(str(output_path), "wb", header=bam.header)
996
+ output_path = (
997
+ split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
998
+ )
999
+ output_files[bc_tag] = pysam.AlignmentFile(
1000
+ str(output_path), "wb", header=bam.header
1001
+ )
781
1002
  # Write the read to the corresponding output BAM file
782
1003
  output_files[bc_tag].write(read)
783
1004
  except KeyError:
784
- print(f"BC tag not present for read: {read.query_name}")
1005
+ logger.warning(f"BC tag not present for read: {read.query_name}")
785
1006
  # Close all output BAM files
786
1007
  for output_file in output_files.values():
787
1008
  output_file.close()
788
1009
 
1010
+
789
1011
  def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
790
1012
  """
791
1013
  A wrapper function for splitting BAMS and indexing them.
@@ -793,19 +1015,20 @@ def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
793
1015
  aligned_sorted_BAM (str): A string representing the file path of the aligned_sorted BAM file.
794
1016
  split_dir (str): A string representing the file path to the directory to split the BAMs into.
795
1017
  bam_suffix (str): A suffix to add to the bam file.
796
-
1018
+
797
1019
  Returns:
798
1020
  None
799
1021
  Splits an input BAM file on barcode value and makes a BAM index file.
800
1022
  """
1023
+ logger.debug("Demultiplexing and indexing BAMS based on BC tag using split_and_index_BAM")
801
1024
  aligned_sorted_output = aligned_sorted_BAM + bam_suffix
802
1025
  file_prefix = date_string()
803
1026
  separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix, split_dir)
804
1027
  # Make a BAM index file for the BAMs in that directory
805
- bam_pattern = '*' + bam_suffix
1028
+ bam_pattern = "*" + bam_suffix
806
1029
  bam_files = glob.glob(split_dir / bam_pattern)
807
- bam_files = [str(bam) for bam in bam_files if '.bai' not in str(bam)]
1030
+ bam_files = [str(bam) for bam in bam_files if ".bai" not in str(bam)]
808
1031
  for input_file in bam_files:
809
1032
  pysam.index(input_file)
810
1033
 
811
- return bam_files
1034
+ return bam_files