smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. smftools/__init__.py +6 -8
  2. smftools/_settings.py +4 -6
  3. smftools/_version.py +1 -1
  4. smftools/cli/helpers.py +54 -0
  5. smftools/cli/hmm_adata.py +937 -256
  6. smftools/cli/load_adata.py +448 -268
  7. smftools/cli/preprocess_adata.py +469 -263
  8. smftools/cli/spatial_adata.py +536 -319
  9. smftools/cli_entry.py +97 -182
  10. smftools/config/__init__.py +1 -1
  11. smftools/config/conversion.yaml +17 -6
  12. smftools/config/deaminase.yaml +12 -10
  13. smftools/config/default.yaml +142 -33
  14. smftools/config/direct.yaml +11 -3
  15. smftools/config/discover_input_files.py +19 -5
  16. smftools/config/experiment_config.py +594 -264
  17. smftools/constants.py +37 -0
  18. smftools/datasets/__init__.py +2 -8
  19. smftools/datasets/datasets.py +32 -18
  20. smftools/hmm/HMM.py +2128 -1418
  21. smftools/hmm/__init__.py +2 -9
  22. smftools/hmm/archived/call_hmm_peaks.py +121 -0
  23. smftools/hmm/call_hmm_peaks.py +299 -91
  24. smftools/hmm/display_hmm.py +19 -6
  25. smftools/hmm/hmm_readwrite.py +13 -4
  26. smftools/hmm/nucleosome_hmm_refinement.py +102 -14
  27. smftools/informatics/__init__.py +30 -7
  28. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  30. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  31. smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
  32. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
  33. smftools/informatics/archived/print_bam_query_seq.py +7 -1
  34. smftools/informatics/bam_functions.py +397 -175
  35. smftools/informatics/basecalling.py +51 -9
  36. smftools/informatics/bed_functions.py +90 -57
  37. smftools/informatics/binarize_converted_base_identities.py +18 -7
  38. smftools/informatics/complement_base_list.py +7 -6
  39. smftools/informatics/converted_BAM_to_adata.py +265 -122
  40. smftools/informatics/fasta_functions.py +161 -83
  41. smftools/informatics/h5ad_functions.py +196 -30
  42. smftools/informatics/modkit_extract_to_adata.py +609 -270
  43. smftools/informatics/modkit_functions.py +85 -44
  44. smftools/informatics/ohe.py +44 -21
  45. smftools/informatics/pod5_functions.py +112 -73
  46. smftools/informatics/run_multiqc.py +20 -14
  47. smftools/logging_utils.py +51 -0
  48. smftools/machine_learning/__init__.py +2 -7
  49. smftools/machine_learning/data/anndata_data_module.py +143 -50
  50. smftools/machine_learning/data/preprocessing.py +2 -1
  51. smftools/machine_learning/evaluation/__init__.py +1 -1
  52. smftools/machine_learning/evaluation/eval_utils.py +11 -14
  53. smftools/machine_learning/evaluation/evaluators.py +46 -33
  54. smftools/machine_learning/inference/__init__.py +1 -1
  55. smftools/machine_learning/inference/inference_utils.py +7 -4
  56. smftools/machine_learning/inference/lightning_inference.py +9 -13
  57. smftools/machine_learning/inference/sklearn_inference.py +6 -8
  58. smftools/machine_learning/inference/sliding_window_inference.py +35 -25
  59. smftools/machine_learning/models/__init__.py +10 -5
  60. smftools/machine_learning/models/base.py +28 -42
  61. smftools/machine_learning/models/cnn.py +15 -11
  62. smftools/machine_learning/models/lightning_base.py +71 -40
  63. smftools/machine_learning/models/mlp.py +13 -4
  64. smftools/machine_learning/models/positional.py +3 -2
  65. smftools/machine_learning/models/rnn.py +3 -2
  66. smftools/machine_learning/models/sklearn_models.py +39 -22
  67. smftools/machine_learning/models/transformer.py +68 -53
  68. smftools/machine_learning/models/wrappers.py +2 -1
  69. smftools/machine_learning/training/__init__.py +2 -2
  70. smftools/machine_learning/training/train_lightning_model.py +29 -20
  71. smftools/machine_learning/training/train_sklearn_model.py +9 -15
  72. smftools/machine_learning/utils/__init__.py +1 -1
  73. smftools/machine_learning/utils/device.py +7 -4
  74. smftools/machine_learning/utils/grl.py +3 -1
  75. smftools/metadata.py +443 -0
  76. smftools/plotting/__init__.py +19 -5
  77. smftools/plotting/autocorrelation_plotting.py +145 -44
  78. smftools/plotting/classifiers.py +162 -72
  79. smftools/plotting/general_plotting.py +422 -197
  80. smftools/plotting/hmm_plotting.py +42 -13
  81. smftools/plotting/position_stats.py +147 -87
  82. smftools/plotting/qc_plotting.py +20 -12
  83. smftools/preprocessing/__init__.py +10 -12
  84. smftools/preprocessing/append_base_context.py +115 -80
  85. smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
  86. smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
  87. smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
  88. smftools/preprocessing/binarize.py +21 -4
  89. smftools/preprocessing/binarize_on_Youden.py +129 -31
  90. smftools/preprocessing/binary_layers_to_ohe.py +17 -11
  91. smftools/preprocessing/calculate_complexity_II.py +86 -59
  92. smftools/preprocessing/calculate_consensus.py +28 -19
  93. smftools/preprocessing/calculate_coverage.py +50 -25
  94. smftools/preprocessing/calculate_pairwise_differences.py +2 -1
  95. smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
  96. smftools/preprocessing/calculate_position_Youden.py +118 -54
  97. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  98. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  99. smftools/preprocessing/clean_NaN.py +38 -28
  100. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  101. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
  102. smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
  103. smftools/preprocessing/flag_duplicate_reads.py +689 -272
  104. smftools/preprocessing/invert_adata.py +26 -11
  105. smftools/preprocessing/load_sample_sheet.py +40 -22
  106. smftools/preprocessing/make_dirs.py +8 -3
  107. smftools/preprocessing/min_non_diagonal.py +2 -1
  108. smftools/preprocessing/recipes.py +56 -23
  109. smftools/preprocessing/reindex_references_adata.py +103 -0
  110. smftools/preprocessing/subsample_adata.py +33 -16
  111. smftools/readwrite.py +331 -82
  112. smftools/schema/__init__.py +11 -0
  113. smftools/schema/anndata_schema_v1.yaml +227 -0
  114. smftools/tools/__init__.py +3 -4
  115. smftools/tools/archived/classifiers.py +163 -0
  116. smftools/tools/archived/subset_adata_v1.py +10 -1
  117. smftools/tools/archived/subset_adata_v2.py +12 -1
  118. smftools/tools/calculate_umap.py +54 -15
  119. smftools/tools/cluster_adata_on_methylation.py +115 -46
  120. smftools/tools/general_tools.py +70 -25
  121. smftools/tools/position_stats.py +229 -98
  122. smftools/tools/read_stats.py +50 -29
  123. smftools/tools/spatial_autocorrelation.py +365 -192
  124. smftools/tools/subset_adata.py +23 -21
  125. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
  126. smftools-0.2.5.dist-info/RECORD +181 -0
  127. smftools-0.2.3.dist-info/RECORD +0 -173
  128. /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
  129. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  130. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  131. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  132. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
  133. /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
  134. /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
  135. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
  136. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
  137. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,24 +1,55 @@
1
1
  from __future__ import annotations
2
2
 
3
- from pathlib import Path
3
+ import glob
4
4
  import os
5
+ import re
5
6
  import subprocess
6
- import glob
7
7
  import time
8
- from typing import Dict, List, Any, Tuple, Union, Optional, Iterable
9
- import re
8
+ from collections import Counter, defaultdict, deque
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
10
  from itertools import zip_longest
11
- import pysam
11
+ from pathlib import Path
12
+ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
12
13
 
13
14
  import numpy as np
14
- import concurrent.futures
15
- from concurrent.futures import ThreadPoolExecutor, as_completed
16
- from concurrent.futures import ProcessPoolExecutor
17
-
15
+ import pysam
18
16
  from tqdm import tqdm
19
- from collections import defaultdict, Counter
20
17
 
21
- from ..readwrite import make_dirs, time_string, date_string
18
+ from smftools.logging_utils import get_logger
19
+
20
+ from ..readwrite import date_string, time_string
21
+
22
+ logger = get_logger(__name__)
23
+
24
+ _PROGRESS_RE = re.compile(r"Output records written:\s*(\d+)")
25
+ _EMPTY_RE = re.compile(r"^\s*$")
26
+
27
+
28
+ def _stream_dorado_logs(stderr_iter) -> None:
29
+ """Stream dorado stderr and emit structured log messages.
30
+
31
+ Args:
32
+ stderr_iter: Iterable of stderr lines.
33
+ """
34
+ last_n: int | None = None
35
+
36
+ for raw in stderr_iter:
37
+ line = raw.rstrip("\n")
38
+ if _EMPTY_RE.match(line):
39
+ continue
40
+
41
+ m = _PROGRESS_RE.search(line)
42
+ if m:
43
+ n = int(m.group(1))
44
+ logger.debug("[dorado] Output records written: %d", n)
45
+ last_n = n
46
+ continue
47
+
48
+ logger.info("[dorado] %s", line)
49
+
50
+ if last_n is not None:
51
+ logger.info("[dorado] Final output records written: %d", last_n)
52
+
22
53
 
23
54
  def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str, Path]) -> None:
24
55
  """
@@ -26,7 +57,13 @@ def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str,
26
57
  """
27
58
  bam_path = str(bam_path)
28
59
  fastq_path = str(fastq_path)
29
- with pysam.AlignmentFile(bam_path, "rb", check_sq=False) as bam, open(fastq_path, "w", encoding="utf-8") as fq:
60
+
61
+ logger.debug(f"Converting BAM to FASTQ using _bam_to_fastq_with_pysam")
62
+
63
+ with (
64
+ pysam.AlignmentFile(bam_path, "rb", check_sq=False) as bam,
65
+ open(fastq_path, "w", encoding="utf-8") as fq,
66
+ ):
30
67
  for r in bam.fetch(until_eof=True):
31
68
  # Optionally skip secondary/supplementary:
32
69
  # if r.is_secondary or r.is_supplementary:
@@ -45,14 +82,22 @@ def _bam_to_fastq_with_pysam(bam_path: Union[str, Path], fastq_path: Union[str,
45
82
  # q is an array/list of ints (Phred scores).
46
83
  # Convert to FASTQ string with Phred+33 encoding,
47
84
  # clamping to sane range [0, 93] to stay in printable ASCII.
48
- qual_str = "".join(
49
- chr(min(max(int(qv), 0), 93) + 33)
50
- for qv in q
51
- )
85
+ qual_str = "".join(chr(min(max(int(qv), 0), 93) + 33) for qv in q)
52
86
 
53
87
  fq.write(f"@{name}\n{seq}\n+\n{qual_str}\n")
54
88
 
55
- def _sort_bam_with_pysam(in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None) -> None:
89
+
90
+ def _sort_bam_with_pysam(
91
+ in_bam: Union[str, Path], out_bam: Union[str, Path], threads: Optional[int] = None
92
+ ) -> None:
93
+ """Sort a BAM file using pysam.
94
+
95
+ Args:
96
+ in_bam: Input BAM path.
97
+ out_bam: Output BAM path.
98
+ threads: Optional thread count.
99
+ """
100
+ logger.debug(f"Sorting BAM using _sort_bam_with_pysam")
56
101
  in_bam, out_bam = str(in_bam), str(out_bam)
57
102
  args = []
58
103
  if threads:
@@ -60,86 +105,129 @@ def _sort_bam_with_pysam(in_bam: Union[str, Path], out_bam: Union[str, Path], th
60
105
  args += ["-o", out_bam, in_bam]
61
106
  pysam.sort(*args)
62
107
 
108
+
63
109
  def _index_bam_with_pysam(bam_path: Union[str, Path], threads: Optional[int] = None) -> None:
110
+ """Index a BAM file using pysam.
111
+
112
+ Args:
113
+ bam_path: BAM path to index.
114
+ threads: Optional thread count.
115
+ """
64
116
  bam_path = str(bam_path)
117
+ logger.debug(f"Indexing BAM using _index_bam_with_pysam")
65
118
  # pysam.index supports samtools-style args
66
119
  if threads:
67
120
  pysam.index("-@", str(threads), bam_path)
68
121
  else:
69
122
  pysam.index(bam_path)
70
123
 
71
- def align_and_sort_BAM(fasta,
72
- input,
73
- bam_suffix='.bam',
74
- output_directory='aligned_outputs',
75
- make_bigwigs=False,
76
- threads=None,
77
- aligner='minimap2',
78
- aligner_args=['-a', '-x', 'map-ont', '--MD', '-Y', '-y', '-N', '5', '--secondary=no']):
124
+
125
+ def align_and_sort_BAM(
126
+ fasta,
127
+ input,
128
+ cfg,
129
+ ):
79
130
  """
80
131
  A wrapper for running dorado aligner and samtools functions
81
-
132
+
82
133
  Parameters:
83
134
  fasta (str): File path to the reference genome to align to.
84
135
  input (str): File path to the basecalled file to align. Works for .bam and .fastq files
85
- bam_suffix (str): The suffix to use for the BAM file.
86
- output_directory (str): A file path to the directory to output all the analyses.
87
- make_bigwigs (bool): Whether to make bigwigs
88
- threads (int): Number of additional threads to use
89
- aligner (str): Aligner to use. minimap2 and dorado options
90
- aligner_args (list): list of optional parameters to use for the alignment
136
+ cfg: The configuration object
91
137
 
92
138
  Returns:
93
139
  None
94
140
  The function writes out files for: 1) An aligned BAM, 2) and aligned_sorted BAM, 3) an index file for the aligned_sorted BAM, 4) A bed file for the aligned_sorted BAM, 5) A text file containing read names in the aligned_sorted BAM
95
141
  """
142
+ logger.debug("Aligning and sorting BAM using align_and_sort_BAM")
96
143
  input_basename = input.name
97
144
  input_suffix = input.suffix
98
- input_as_fastq = input.with_name(input.stem + '.fastq')
145
+ input_as_fastq = input.with_name(input.stem + ".fastq")
146
+
147
+ output_path_minus_suffix = cfg.output_directory / input.stem
99
148
 
100
- output_path_minus_suffix = output_directory / input.stem
101
-
102
149
  aligned_BAM = output_path_minus_suffix.with_name(output_path_minus_suffix.stem + "_aligned")
103
- aligned_output = aligned_BAM.with_suffix(bam_suffix)
104
- aligned_sorted_BAM =aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
105
- aligned_sorted_output = aligned_sorted_BAM.with_suffix(bam_suffix)
150
+ aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
151
+ aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
152
+ aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
106
153
 
107
- if threads:
108
- threads = str(threads)
154
+ if cfg.threads:
155
+ threads = str(cfg.threads)
109
156
  else:
110
- pass
111
-
112
- if aligner == 'minimap2':
113
- print(f"Converting BAM to FASTQ: {input}")
114
- _bam_to_fastq_with_pysam(input, input_as_fastq)
115
- print(f"Aligning FASTQ to Reference: {input_as_fastq}")
157
+ threads = None
158
+
159
+ if cfg.aligner == "minimap2":
160
+ if not cfg.align_from_bam:
161
+ logger.debug(f"Converting BAM to FASTQ: {input}")
162
+ _bam_to_fastq_with_pysam(input, input_as_fastq)
163
+ logger.debug(f"Aligning FASTQ to Reference: {input_as_fastq}")
164
+ mm_input = input_as_fastq
165
+ else:
166
+ logger.debug(f"Aligning BAM to Reference: {input}")
167
+ mm_input = input
168
+
116
169
  if threads:
117
- minimap_command = ['minimap2'] + aligner_args + ['-t', threads, str(fasta), str(input_as_fastq)]
170
+ minimap_command = (
171
+ ["minimap2"] + cfg.aligner_args + ["-t", threads, str(fasta), str(mm_input)]
172
+ )
118
173
  else:
119
- minimap_command = ['minimap2'] + aligner_args + [str(fasta), str(input_as_fastq)]
120
- subprocess.run(minimap_command, stdout=open(aligned_output, "wb"))
121
- os.remove(input_as_fastq)
174
+ minimap_command = ["minimap2"] + cfg.aligner_args + [str(fasta), str(mm_input)]
175
+
176
+ with open(aligned_output, "wb") as out:
177
+ proc = subprocess.Popen(
178
+ minimap_command,
179
+ stdout=out,
180
+ stderr=subprocess.PIPE,
181
+ text=True,
182
+ )
122
183
 
123
- elif aligner == 'dorado':
184
+ assert proc.stderr is not None
185
+ for line in proc.stderr:
186
+ logger.info("[minimap2] %s", line.rstrip())
187
+
188
+ ret = proc.wait()
189
+ if ret != 0:
190
+ raise RuntimeError(f"minimap2 failed with exit code {ret}")
191
+
192
+ if not cfg.align_from_bam:
193
+ os.remove(input_as_fastq)
194
+
195
+ elif cfg.aligner == "dorado":
124
196
  # Run dorado aligner
125
197
  print(f"Aligning BAM to Reference: {input}")
126
198
  if threads:
127
- alignment_command = ["dorado", "aligner", "-t", threads] + aligner_args + [str(fasta), str(input)]
199
+ alignment_command = (
200
+ ["dorado", "aligner", "-t", threads] + cfg.aligner_args + [str(fasta), str(input)]
201
+ )
128
202
  else:
129
- alignment_command = ["dorado", "aligner"] + aligner_args + [str(fasta), str(input)]
130
- subprocess.run(alignment_command, stdout=open(aligned_output, "wb"))
131
-
203
+ alignment_command = ["dorado", "aligner"] + cfg.aligner_args + [str(fasta), str(input)]
204
+
205
+ with open(aligned_output, "wb") as out:
206
+ proc = subprocess.Popen(
207
+ alignment_command,
208
+ stdout=out,
209
+ stderr=subprocess.PIPE,
210
+ text=True,
211
+ )
212
+
213
+ assert proc.stderr is not None
214
+ _stream_dorado_logs(proc.stderr)
215
+ ret = proc.wait()
216
+
217
+ if ret != 0:
218
+ raise RuntimeError(f"dorado failed with exit code {ret}")
132
219
  else:
133
- print(f'Aligner not recognized: {aligner}. Choose from minimap2 and dorado')
220
+ logger.error(f"Aligner not recognized: {cfg.aligner}. Choose from minimap2 and dorado")
134
221
  return
135
-
222
+
136
223
  # --- Sort & Index with pysam ---
137
- print(f"[pysam] Sorting: {aligned_output} -> {aligned_sorted_output}")
224
+ logger.debug(f"Sorting: {aligned_output} -> {aligned_sorted_output}")
138
225
  _sort_bam_with_pysam(aligned_output, aligned_sorted_output, threads=threads)
139
226
 
140
- print(f"[pysam] Indexing: {aligned_sorted_output}")
227
+ logger.debug(f"Indexing: {aligned_sorted_output}")
141
228
  _index_bam_with_pysam(aligned_sorted_output, threads=threads)
142
229
 
230
+
143
231
  def bam_qc(
144
232
  bam_files: Iterable[str | Path],
145
233
  bam_qc_dir: str | Path,
@@ -154,133 +242,154 @@ def bam_qc(
154
242
  Prefers pysam; falls back to `samtools` if needed.
155
243
  Runs BAMs in parallel (up to `threads`, default serial).
156
244
  """
157
- import subprocess
158
245
  import shutil
246
+ import subprocess
247
+
248
+ logger.debug("Performing BAM QC using bam_qc")
159
249
 
160
250
  # Try to import pysam once
161
251
  try:
162
- import pysam
163
- HAVE_PYSAM = True
252
+ import pysam # type: ignore
253
+
254
+ have_pysam = True
164
255
  except Exception:
165
- HAVE_PYSAM = False
256
+ pysam = None # type: ignore
257
+ have_pysam = False
166
258
 
167
259
  bam_qc_dir = Path(bam_qc_dir)
168
260
  bam_qc_dir.mkdir(parents=True, exist_ok=True)
169
261
 
170
- bam_files = [Path(b) for b in bam_files]
262
+ bam_paths = [Path(b) for b in bam_files]
171
263
 
172
264
  def _has_index(p: Path) -> bool:
173
- if p.suffix.lower() == ".bam":
174
- bai = p.with_suffix(p.suffix + ".bai")
175
- bai_alt = Path(str(p) + ".bai")
176
- return bai.exists() or bai_alt.exists()
177
- if p.suffix.lower() == ".cram":
178
- crai = Path(str(p) + ".crai")
179
- return crai.exists()
265
+ """Return True if a BAM/CRAM index exists for the path."""
266
+ suf = p.suffix.lower()
267
+ if suf == ".bam":
268
+ return p.with_suffix(p.suffix + ".bai").exists() or Path(str(p) + ".bai").exists()
269
+ if suf == ".cram":
270
+ return Path(str(p) + ".crai").exists()
180
271
  return False
181
272
 
182
273
  def _ensure_index(p: Path) -> None:
274
+ """Ensure a BAM/CRAM index exists, creating one if needed."""
183
275
  if _has_index(p):
184
276
  return
185
- if HAVE_PYSAM:
186
- # pysam.index supports both BAM & CRAM
187
- pysam.index(str(p))
277
+ if have_pysam:
278
+ assert pysam is not None
279
+ pysam.index(str(p)) # supports BAM & CRAM
188
280
  else:
281
+ if not shutil.which("samtools"):
282
+ raise RuntimeError("Neither pysam nor samtools is available in PATH.")
189
283
  cmd = ["samtools", "index", str(p)]
190
- subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
284
+ # capture text so errors are readable; raise on failure
285
+ cp = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
286
+ if cp.returncode != 0:
287
+ raise RuntimeError(f"samtools index failed (exit {cp.returncode}):\n{cp.stderr}")
191
288
 
192
- def _run_one(bam: Path) -> Tuple[Path, List[Tuple[str, int]]]:
193
- # outputs + return (file, [(task_name, returncode)])
194
- results: List[Tuple[str, int]] = []
195
- base = bam.stem # filename without .bam
289
+ def _run_samtools_to_file(cmd: list[str], out_path: Path, bam: Path, tag: str) -> int:
290
+ """
291
+ Stream stderr to logger; write stdout to out_path; return rc; raise with stderr tail on failure.
292
+ """
293
+ last_err = deque(maxlen=80)
294
+ out_path.parent.mkdir(parents=True, exist_ok=True)
295
+
296
+ with open(out_path, "w") as fh:
297
+ proc = subprocess.Popen(cmd, stdout=fh, stderr=subprocess.PIPE, text=True)
298
+ assert proc.stderr is not None
299
+ for line in proc.stderr:
300
+ line = line.rstrip()
301
+ if line:
302
+ last_err.append(line)
303
+ logger.info("[%s][%s] %s", tag, bam.name, line)
304
+ rc = proc.wait()
305
+
306
+ if rc != 0:
307
+ tail = "\n".join(last_err)
308
+ raise RuntimeError(f"{tag} failed for {bam} (exit {rc}). Stderr tail:\n{tail}")
309
+ return rc
310
+
311
+ def _run_one(bam: Path) -> tuple[Path, list[tuple[str, int]]]:
312
+ """Run stats/flagstat/idxstats for a single BAM.
313
+
314
+ Args:
315
+ bam: Path to the BAM file.
316
+
317
+ Returns:
318
+ Tuple of (bam_path, list of (stage, return_code)).
319
+ """
320
+ import subprocess
321
+
322
+ results: list[tuple[str, int]] = []
323
+ base = bam.stem # e.g. sample.bam -> sample
196
324
  out_stats = bam_qc_dir / f"{base}_stats.txt"
197
325
  out_flag = bam_qc_dir / f"{base}_flagstat.txt"
198
- out_idx = bam_qc_dir / f"{base}_idxstats.txt"
326
+ out_idx = bam_qc_dir / f"{base}_idxstats.txt"
199
327
 
200
- # Make sure index exists (samtools stats/flagstat dont require, idxstats does)
328
+ # Make sure index exists (idxstats requires; stats/flagstat usually don't, but indexing is cheap/useful)
201
329
  try:
202
330
  _ensure_index(bam)
203
331
  except Exception as e:
204
- # Still attempt stats/flagstat if requested
205
- print(f"[warn] Indexing failed for {bam}: {e}")
206
-
207
- # Choose runner per task
208
- def run_stats():
209
- if not stats:
210
- return
211
- if HAVE_PYSAM and hasattr(pysam, "stats"):
332
+ # Still attempt stats/flagstat if requested; idxstats may fail later if index is required.
333
+ logger.warning("Indexing failed for %s: %s", bam, e)
334
+
335
+ if not have_pysam:
336
+ import shutil
337
+
338
+ if not shutil.which("samtools"):
339
+ raise RuntimeError("Neither pysam nor samtools is available in PATH.")
340
+
341
+ # --- stats ---
342
+ if stats:
343
+ if have_pysam and pysam is not None and hasattr(pysam, "stats"):
212
344
  txt = pysam.stats(str(bam))
213
345
  out_stats.write_text(txt)
214
346
  results.append(("stats(pysam)", 0))
215
347
  else:
216
348
  cmd = ["samtools", "stats", str(bam)]
217
- with open(out_stats, "w") as fh:
218
- cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
219
- results.append(("stats(samtools)", cp.returncode))
220
- if cp.returncode != 0:
221
- raise RuntimeError(cp.stderr.decode(errors="replace"))
222
-
223
- def run_flagstat():
224
- if not flagstats:
225
- return
226
- if HAVE_PYSAM and hasattr(pysam, "flagstat"):
349
+ rc = _run_samtools_to_file(cmd, out_stats, bam, "samtools stats")
350
+ results.append(("stats(samtools)", rc))
351
+
352
+ # --- flagstat ---
353
+ if flagstats:
354
+ if have_pysam and pysam is not None and hasattr(pysam, "flagstat"):
227
355
  txt = pysam.flagstat(str(bam))
228
356
  out_flag.write_text(txt)
229
357
  results.append(("flagstat(pysam)", 0))
230
358
  else:
231
359
  cmd = ["samtools", "flagstat", str(bam)]
232
- with open(out_flag, "w") as fh:
233
- cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
234
- results.append(("flagstat(samtools)", cp.returncode))
235
- if cp.returncode != 0:
236
- raise RuntimeError(cp.stderr.decode(errors="replace"))
237
-
238
- def run_idxstats():
239
- if not idxstats:
240
- return
241
- if HAVE_PYSAM and hasattr(pysam, "idxstats"):
360
+ rc = _run_samtools_to_file(cmd, out_flag, bam, "samtools flagstat")
361
+ results.append(("flagstat(samtools)", rc))
362
+
363
+ # --- idxstats ---
364
+ if idxstats:
365
+ if have_pysam and pysam is not None and hasattr(pysam, "idxstats"):
242
366
  txt = pysam.idxstats(str(bam))
243
367
  out_idx.write_text(txt)
244
368
  results.append(("idxstats(pysam)", 0))
245
369
  else:
246
370
  cmd = ["samtools", "idxstats", str(bam)]
247
- with open(out_idx, "w") as fh:
248
- cp = subprocess.run(cmd, stdout=fh, stderr=subprocess.PIPE)
249
- results.append(("idxstats(samtools)", cp.returncode))
250
- if cp.returncode != 0:
251
- raise RuntimeError(cp.stderr.decode(errors="replace"))
252
-
253
- # Sanity: ensure samtools exists if pysam missing
254
- if not HAVE_PYSAM:
255
- if not shutil.which("samtools"):
256
- raise RuntimeError("Neither pysam nor samtools is available in PATH.")
371
+ rc = _run_samtools_to_file(cmd, out_idx, bam, "samtools idxstats")
372
+ results.append(("idxstats(samtools)", rc))
257
373
 
258
- # Execute tasks (serial per file; parallelized across files)
259
- run_stats()
260
- run_flagstat()
261
- run_idxstats()
262
374
  return bam, results
263
375
 
264
- # Parallel across BAMs
265
376
  max_workers = int(threads) if threads and int(threads) > 0 else 1
266
- futures = []
267
- with ThreadPoolExecutor(max_workers=max_workers) as ex:
268
- for b in bam_files:
269
- futures.append(ex.submit(_run_one, b))
270
377
 
271
- for fut in as_completed(futures):
378
+ with ThreadPoolExecutor(max_workers=max_workers) as ex:
379
+ futs = [ex.submit(_run_one, b) for b in bam_paths]
380
+ for fut in as_completed(futs):
272
381
  try:
273
382
  bam, res = fut.result()
274
383
  summary = ", ".join(f"{name}:{rc}" for name, rc in res) or "no-op"
275
- print(f"[qc] {bam.name}: {summary}")
384
+ logger.info("[qc] %s: %s", bam.name, summary)
276
385
  except Exception as e:
277
- print(f"[error] QC failed: {e}")
386
+ logger.exception("QC failed: %s", e)
278
387
 
279
- # Placeholders to keep your signature stable
280
- if modality not in {"conversion", "direct"}:
281
- print(f"[warn] Unknown modality '{modality}', continuing.")
388
+ if modality not in {"conversion", "direct", "deaminase"}:
389
+ logger.warning("Unknown modality '%s', continuing.", modality)
390
+
391
+ logger.info("QC processing completed.")
282
392
 
283
- print("QC processing completed.")
284
393
 
285
394
  def concatenate_fastqs_to_bam(
286
395
  fastq_files: List[Union[str, Tuple[str, str], Path, Tuple[Path, Path]]],
@@ -327,12 +436,29 @@ def concatenate_fastqs_to_bam(
327
436
  """
328
437
  name = p.name
329
438
  lowers = name.lower()
330
- for ext in (".fastq.gz", ".fq.gz", ".fastq.bz2", ".fq.bz2", ".fastq.xz", ".fq.xz", ".fastq", ".fq"):
439
+ for ext in (
440
+ ".fastq.gz",
441
+ ".fq.gz",
442
+ ".fastq.bz2",
443
+ ".fq.bz2",
444
+ ".fastq.xz",
445
+ ".fq.xz",
446
+ ".fastq",
447
+ ".fq",
448
+ ):
331
449
  if lowers.endswith(ext):
332
450
  return name[: -len(ext)]
333
451
  return p.stem # fallback: remove last suffix only
334
452
 
335
453
  def _extract_barcode_from_filename(p: Path) -> str:
454
+ """Extract a barcode token from a FASTQ filename.
455
+
456
+ Args:
457
+ p: FASTQ path.
458
+
459
+ Returns:
460
+ Barcode token string.
461
+ """
336
462
  stem = _strip_fastq_ext(p)
337
463
  if "_" in stem:
338
464
  token = stem.split("_")[-1]
@@ -341,10 +467,18 @@ def concatenate_fastqs_to_bam(
341
467
  return stem
342
468
 
343
469
  def _classify_read_token(stem: str) -> Tuple[Optional[str], Optional[int]]:
470
+ """Classify a FASTQ filename stem into (prefix, read_number).
471
+
472
+ Args:
473
+ stem: Filename stem.
474
+
475
+ Returns:
476
+ Tuple of (prefix, read_number) or (None, None) if not matched.
477
+ """
344
478
  # return (prefix, readnum) if matches; else (None, None)
345
479
  patterns = [
346
- r"(?i)(.*?)[._-]r?([12])$", # prefix_R1 / prefix.r2 / prefix-1
347
- r"(?i)(.*?)[._-]read[_-]?([12])$", # prefix_read1
480
+ r"(?i)(.*?)[._-]r?([12])$", # prefix_R1 / prefix.r2 / prefix-1
481
+ r"(?i)(.*?)[._-]read[_-]?([12])$", # prefix_read1
348
482
  ]
349
483
  for pat in patterns:
350
484
  m = re.match(pat, stem)
@@ -353,6 +487,14 @@ def concatenate_fastqs_to_bam(
353
487
  return None, None
354
488
 
355
489
  def _pair_by_filename(paths: List[Path]) -> Tuple[List[Tuple[Path, Path]], List[Path]]:
490
+ """Pair FASTQ files based on filename conventions.
491
+
492
+ Args:
493
+ paths: FASTQ paths to pair.
494
+
495
+ Returns:
496
+ Tuple of (paired list, leftover list).
497
+ """
356
498
  pref_map: Dict[str, Dict[int, Path]] = {}
357
499
  unpaired: List[Path] = []
358
500
  for pth in paths:
@@ -374,6 +516,14 @@ def concatenate_fastqs_to_bam(
374
516
  return pairs, leftovers
375
517
 
376
518
  def _fastq_iter(p: Path):
519
+ """Yield FASTQ records using pysam.FastxFile.
520
+
521
+ Args:
522
+ p: FASTQ path.
523
+
524
+ Yields:
525
+ Pysam Fastx records.
526
+ """
377
527
  # pysam.FastxFile handles compressed extensions transparently
378
528
  with pysam.FastxFile(str(p)) as fx:
379
529
  for rec in fx:
@@ -387,6 +537,19 @@ def concatenate_fastqs_to_bam(
387
537
  read1: bool,
388
538
  read2: bool,
389
539
  ) -> pysam.AlignedSegment:
540
+ """Construct an unaligned pysam.AlignedSegment.
541
+
542
+ Args:
543
+ name: Read name.
544
+ seq: Read sequence.
545
+ qual: FASTQ quality string.
546
+ bc: Barcode string.
547
+ read1: Whether this is read 1.
548
+ read2: Whether this is read 2.
549
+
550
+ Returns:
551
+ Unaligned pysam.AlignedSegment.
552
+ """
390
553
  a = pysam.AlignedSegment()
391
554
  a.query_name = name
392
555
  a.query_sequence = seq
@@ -409,6 +572,7 @@ def concatenate_fastqs_to_bam(
409
572
 
410
573
  # ---------- normalize inputs to Path ----------
411
574
  def _to_path_pair(x) -> Tuple[Path, Path]:
575
+ """Convert a tuple of path-like objects to Path instances."""
412
576
  a, b = x
413
577
  return Path(a), Path(b)
414
578
 
@@ -451,7 +615,10 @@ def concatenate_fastqs_to_bam(
451
615
  # ---------- BAM header ----------
452
616
  header = {"HD": {"VN": "1.6", "SO": "unknown"}, "SQ": []}
453
617
  if add_read_group:
454
- header["RG"] = [{"ID": bc, **({"SM": rg_sample_field} if rg_sample_field else {})} for bc in barcodes_in_order]
618
+ header["RG"] = [
619
+ {"ID": bc, **({"SM": rg_sample_field} if rg_sample_field else {})}
620
+ for bc in barcodes_in_order
621
+ ]
455
622
  header.setdefault("PG", []).append(
456
623
  {"ID": "concat-fastq", "PN": "concatenate_fastqs_to_bam", "VN": "1"}
457
624
  )
@@ -477,7 +644,9 @@ def concatenate_fastqs_to_bam(
477
644
  it2 = _fastq_iter(r2_path)
478
645
 
479
646
  for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
647
+
480
648
  def _clean(n: Optional[str]) -> Optional[str]:
649
+ """Normalize FASTQ read names by trimming read suffixes."""
481
650
  if n is None:
482
651
  return None
483
652
  return re.sub(r"(?:/1$|/2$|\s[12]$)", "", n)
@@ -490,12 +659,16 @@ def concatenate_fastqs_to_bam(
490
659
  )
491
660
 
492
661
  if rec1 is not None:
493
- a1 = _make_unaligned_segment(name, rec1.sequence, rec1.quality, bc, read1=True, read2=False)
662
+ a1 = _make_unaligned_segment(
663
+ name, rec1.sequence, rec1.quality, bc, read1=True, read2=False
664
+ )
494
665
  bam_out.write(a1)
495
666
  per_file_counts[r1_path] = per_file_counts.get(r1_path, 0) + 1
496
667
  total_written += 1
497
668
  if rec2 is not None:
498
- a2 = _make_unaligned_segment(name, rec2.sequence, rec2.quality, bc, read1=False, read2=True)
669
+ a2 = _make_unaligned_segment(
670
+ name, rec2.sequence, rec2.quality, bc, read1=False, read2=True
671
+ )
499
672
  bam_out.write(a2)
500
673
  per_file_counts[r2_path] = per_file_counts.get(r2_path, 0) + 1
501
674
  total_written += 1
@@ -517,7 +690,9 @@ def concatenate_fastqs_to_bam(
517
690
  raise FileNotFoundError(pth)
518
691
  bc = per_path_barcode.get(pth, "barcode")
519
692
  for rec in _fastq_iter(pth):
520
- a = _make_unaligned_segment(rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False)
693
+ a = _make_unaligned_segment(
694
+ rec.name, rec.sequence, rec.quality, bc, read1=False, read2=False
695
+ )
521
696
  bam_out.write(a)
522
697
  per_file_counts[pth] = per_file_counts.get(pth, 0) + 1
523
698
  total_written += 1
@@ -531,20 +706,21 @@ def concatenate_fastqs_to_bam(
531
706
  "barcodes": barcodes_in_order,
532
707
  }
533
708
 
709
+
534
710
  def count_aligned_reads(bam_file):
535
711
  """
536
712
  Counts the number of aligned reads in a bam file that map to each reference record.
537
-
713
+
538
714
  Parameters:
539
715
  bam_file (str): A string representing the path to an aligned BAM file.
540
-
716
+
541
717
  Returns:
542
718
  aligned_reads_count (int): The total number of reads aligned in the BAM.
543
719
  unaligned_reads_count (int): The total number of reads not aligned in the BAM.
544
720
  record_counts (dict): A dictionary keyed by reference record instance that points to a tuple containing the total reads mapped to the record and the fraction of mapped reads which map to the record.
545
721
 
546
722
  """
547
- print('{0}: Counting aligned reads in BAM > {1}'.format(time_string(), bam_file))
723
+ print("{0}: Counting aligned reads in BAM > {1}".format(time_string(), bam_file))
548
724
  aligned_reads_count = 0
549
725
  unaligned_reads_count = 0
550
726
  # Make a dictionary, keyed by the reference_name of reference chromosome that points to an integer number of read counts mapped to the chromosome, as well as the proportion of mapped reads in that chromosome
@@ -553,12 +729,14 @@ def count_aligned_reads(bam_file):
553
729
  with pysam.AlignmentFile(str(bam_file), "rb") as bam:
554
730
  total_reads = bam.mapped + bam.unmapped
555
731
  # Iterate over reads to get the total mapped read counts and the reads that map to each reference
556
- for read in tqdm(bam, desc='Counting aligned reads in BAM', total=total_reads):
732
+ for read in tqdm(bam, desc="Counting aligned reads in BAM", total=total_reads):
557
733
  if read.is_unmapped:
558
734
  unaligned_reads_count += 1
559
735
  else:
560
736
  aligned_reads_count += 1
561
- record_counts[read.reference_name] += 1 # Automatically increments if key exists, adds if not
737
+ record_counts[read.reference_name] += (
738
+ 1 # Automatically increments if key exists, adds if not
739
+ )
562
740
 
563
741
  # reformat the dictionary to contain read counts mapped to the reference, as well as the proportion of mapped reads in reference
564
742
  for reference in record_counts:
@@ -567,7 +745,10 @@ def count_aligned_reads(bam_file):
567
745
 
568
746
  return aligned_reads_count, unaligned_reads_count, dict(record_counts)
569
747
 
570
- def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, threads):
748
+
749
+ def demux_and_index_BAM(
750
+ aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit, barcode_both_ends, trim, threads
751
+ ):
571
752
  """
572
753
  A wrapper function for splitting BAMS and indexing them.
573
754
  Parameters:
@@ -578,11 +759,12 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
578
759
  barcode_both_ends (bool): Whether to require both ends to be barcoded.
579
760
  trim (bool): Whether to trim off barcodes after demultiplexing.
580
761
  threads (int): Number of threads to use.
581
-
762
+
582
763
  Returns:
583
764
  bam_files (list): List of split BAM file path strings
584
765
  Splits an input BAM file on barcode value and makes a BAM index file.
585
766
  """
767
+
586
768
  input_bam = aligned_sorted_BAM.with_suffix(bam_suffix)
587
769
  command = ["dorado", "demux", "--kit-name", barcode_kit]
588
770
  if barcode_both_ends:
@@ -595,25 +777,37 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
595
777
  pass
596
778
  command += ["--emit-summary", "--sort-bam", "--output-dir", str(split_dir)]
597
779
  command.append(str(input_bam))
598
- command_string = ' '.join(command)
599
- print(f"Running: {command_string}")
600
- subprocess.run(command)
780
+ command_string = " ".join(command)
781
+ logger.info("Running dorado demux: %s", " ".join(command))
782
+
783
+ proc = subprocess.Popen(
784
+ command,
785
+ stdout=subprocess.PIPE,
786
+ stderr=subprocess.PIPE,
787
+ text=True,
788
+ )
789
+
790
+ assert proc.stderr is not None
791
+ _stream_dorado_logs(proc.stderr)
792
+ rc = proc.wait()
793
+
794
+ if rc != 0:
795
+ raise RuntimeError(f"dorado demux failed with exit code {rc}")
601
796
 
602
797
  bam_files = sorted(
603
- p for p in split_dir.glob(f"*{bam_suffix}")
604
- if p.is_file() and p.suffix == bam_suffix
798
+ p for p in split_dir.glob(f"*{bam_suffix}") if p.is_file() and p.suffix == bam_suffix
605
799
  )
606
800
 
607
801
  if not bam_files:
608
802
  raise FileNotFoundError(f"No BAM files found in {split_dir} with suffix {bam_suffix}")
609
-
803
+
610
804
  # ---- Optional renaming with prefix ----
611
805
  renamed_bams = []
612
806
  prefix = "de" if barcode_both_ends else "se"
613
807
 
614
808
  for bam in bam_files:
615
809
  bam = Path(bam)
616
- bai = bam.with_suffix(bam_suffix + ".bai") # dorado’s sorting produces .bam.bai
810
+ bai = bam.with_suffix(bam_suffix + ".bai") # dorado’s sorting produces .bam.bai
617
811
 
618
812
  if prefix:
619
813
  new_name = f"{prefix}_{bam.name}"
@@ -629,9 +823,10 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
629
823
  bai.rename(new_bai)
630
824
 
631
825
  renamed_bams.append(new_bam)
632
-
826
+
633
827
  return renamed_bams
634
828
 
829
+
635
830
  def extract_base_identities(bam_file, chromosome, positions, max_reference_length, sequence):
636
831
  """
637
832
  Efficiently extracts base identities from mapped reads with reference coordinates.
@@ -647,14 +842,15 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
647
842
  dict: Base identities from forward mapped reads.
648
843
  dict: Base identities from reverse mapped reads.
649
844
  """
845
+ logger.debug("Extracting nucleotide identities for each read using extract_base_identities")
650
846
  timestamp = time.strftime("[%Y-%m-%d %H:%M:%S]")
651
847
 
652
848
  positions = set(positions)
653
- fwd_base_identities = defaultdict(lambda: np.full(max_reference_length, 'N', dtype='<U1'))
654
- rev_base_identities = defaultdict(lambda: np.full(max_reference_length, 'N', dtype='<U1'))
849
+ fwd_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
850
+ rev_base_identities = defaultdict(lambda: np.full(max_reference_length, "N", dtype="<U1"))
655
851
  mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))
656
852
 
657
- #print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
853
+ # print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
658
854
  with pysam.AlignmentFile(str(bam_file), "rb") as bam:
659
855
  total_reads = bam.mapped
660
856
  ref_seq = sequence.upper()
@@ -677,7 +873,7 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
677
873
  base_dict[read_name][reference_position] = read_base
678
874
 
679
875
  # Track mismatches (excluding Ns)
680
- if read_base != ref_base and read_base != 'N' and ref_base != 'N':
876
+ if read_base != ref_base and read_base != "N" and ref_base != "N":
681
877
  mismatch_counts_per_read[read_name][ref_base][read_base] += 1
682
878
 
683
879
  # Determine C→T vs G→A dominance per read
@@ -695,7 +891,13 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
695
891
  else:
696
892
  mismatch_trend_per_read[read_name] = "none"
697
893
 
698
- return dict(fwd_base_identities), dict(rev_base_identities), dict(mismatch_counts_per_read), mismatch_trend_per_read
894
+ return (
895
+ dict(fwd_base_identities),
896
+ dict(rev_base_identities),
897
+ dict(mismatch_counts_per_read),
898
+ mismatch_trend_per_read,
899
+ )
900
+
699
901
 
700
902
  def extract_read_features_from_bam(bam_file_path):
701
903
  """
@@ -706,7 +908,9 @@ def extract_read_features_from_bam(bam_file_path):
706
908
  read_metrics (dict)
707
909
  """
708
910
  # Open the BAM file
709
- print(f'Extracting read features from BAM: {bam_file_path}')
911
+ logger.debug(
912
+ f"Extracting read metrics from BAM using extract_read_features_from_bam: {bam_file_path}"
913
+ )
710
914
  with pysam.AlignmentFile(bam_file_path, "rb") as bam_file:
711
915
  read_metrics = {}
712
916
  reference_lengths = bam_file.lengths # List of lengths for each reference (chromosome)
@@ -723,10 +927,17 @@ def extract_read_features_from_bam(bam_file_path):
723
927
  reference_length = reference_lengths[reference_index]
724
928
  mapped_length = sum(end - start for start, end in read.get_blocks())
725
929
  mapping_quality = read.mapping_quality # Phred-scaled MAPQ
726
- read_metrics[read.query_name] = [read.query_length, median_read_quality, reference_length, mapped_length, mapping_quality]
930
+ read_metrics[read.query_name] = [
931
+ read.query_length,
932
+ median_read_quality,
933
+ reference_length,
934
+ mapped_length,
935
+ mapping_quality,
936
+ ]
727
937
 
728
938
  return read_metrics
729
939
 
940
+
730
941
  def extract_readnames_from_bam(aligned_BAM):
731
942
  """
732
943
  Takes a BAM and writes out a txt file containing read names from the BAM
@@ -739,15 +950,19 @@ def extract_readnames_from_bam(aligned_BAM):
739
950
 
740
951
  """
741
952
  import subprocess
953
+
742
954
  # Make a text file of reads for the BAM
743
- txt_output = aligned_BAM.split('.bam')[0] + '_read_names.txt'
955
+ txt_output = aligned_BAM.split(".bam")[0] + "_read_names.txt"
744
956
  samtools_view = subprocess.Popen(["samtools", "view", aligned_BAM], stdout=subprocess.PIPE)
745
957
  with open(txt_output, "w") as output_file:
746
- cut_process = subprocess.Popen(["cut", "-f1"], stdin=samtools_view.stdout, stdout=output_file)
958
+ cut_process = subprocess.Popen(
959
+ ["cut", "-f1"], stdin=samtools_view.stdout, stdout=output_file
960
+ )
747
961
  samtools_view.stdout.close()
748
962
  cut_process.wait()
749
963
  samtools_view.wait()
750
964
 
965
+
751
966
  def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
752
967
  """
753
968
  Separates an input BAM file on the BC SAM tag values.
@@ -757,11 +972,12 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
757
972
  output_prefix (str): A prefix to append to the output BAM.
758
973
  bam_suffix (str): A suffix to add to the bam file.
759
974
  split_dir (str): String indicating path to directory to split BAMs into
760
-
975
+
761
976
  Returns:
762
977
  None
763
978
  Writes out split BAM files.
764
979
  """
980
+ logger.debug("Demultiplexing BAM based on the BC tag")
765
981
  bam_base = input_bam.name
766
982
  bam_base_minus_suffix = input_bam.stem
767
983
 
@@ -774,19 +990,24 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
774
990
  try:
775
991
  # Get the barcode tag value
776
992
  bc_tag = read.get_tag("BC", with_value_type=True)[0]
777
- #bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
993
+ # bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
778
994
  # Open the output BAM file corresponding to the barcode
779
995
  if bc_tag not in output_files:
780
- output_path = split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
781
- output_files[bc_tag] = pysam.AlignmentFile(str(output_path), "wb", header=bam.header)
996
+ output_path = (
997
+ split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
998
+ )
999
+ output_files[bc_tag] = pysam.AlignmentFile(
1000
+ str(output_path), "wb", header=bam.header
1001
+ )
782
1002
  # Write the read to the corresponding output BAM file
783
1003
  output_files[bc_tag].write(read)
784
1004
  except KeyError:
785
- print(f"BC tag not present for read: {read.query_name}")
1005
+ logger.warning(f"BC tag not present for read: {read.query_name}")
786
1006
  # Close all output BAM files
787
1007
  for output_file in output_files.values():
788
1008
  output_file.close()
789
1009
 
1010
+
790
1011
  def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
791
1012
  """
792
1013
  A wrapper function for splitting BAMS and indexing them.
@@ -794,19 +1015,20 @@ def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
794
1015
  aligned_sorted_BAM (str): A string representing the file path of the aligned_sorted BAM file.
795
1016
  split_dir (str): A string representing the file path to the directory to split the BAMs into.
796
1017
  bam_suffix (str): A suffix to add to the bam file.
797
-
1018
+
798
1019
  Returns:
799
1020
  None
800
1021
  Splits an input BAM file on barcode value and makes a BAM index file.
801
1022
  """
1023
+ logger.debug("Demultiplexing and indexing BAMS based on BC tag using split_and_index_BAM")
802
1024
  aligned_sorted_output = aligned_sorted_BAM + bam_suffix
803
1025
  file_prefix = date_string()
804
1026
  separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix, split_dir)
805
1027
  # Make a BAM index file for the BAMs in that directory
806
- bam_pattern = '*' + bam_suffix
1028
+ bam_pattern = "*" + bam_suffix
807
1029
  bam_files = glob.glob(split_dir / bam_pattern)
808
- bam_files = [str(bam) for bam in bam_files if '.bai' not in str(bam)]
1030
+ bam_files = [str(bam) for bam in bam_files if ".bai" not in str(bam)]
809
1031
  for input_file in bam_files:
810
1032
  pysam.index(input_file)
811
1033
 
812
- return bam_files
1034
+ return bam_files