smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. smftools/__init__.py +43 -13
  2. smftools/_settings.py +6 -6
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +9 -1
  7. smftools/cli/hmm_adata.py +905 -242
  8. smftools/cli/load_adata.py +432 -280
  9. smftools/cli/preprocess_adata.py +287 -171
  10. smftools/cli/spatial_adata.py +141 -53
  11. smftools/cli_entry.py +119 -178
  12. smftools/config/__init__.py +3 -1
  13. smftools/config/conversion.yaml +5 -1
  14. smftools/config/deaminase.yaml +1 -1
  15. smftools/config/default.yaml +26 -18
  16. smftools/config/direct.yaml +8 -3
  17. smftools/config/discover_input_files.py +19 -5
  18. smftools/config/experiment_config.py +511 -276
  19. smftools/constants.py +37 -0
  20. smftools/datasets/__init__.py +4 -8
  21. smftools/datasets/datasets.py +32 -18
  22. smftools/hmm/HMM.py +2133 -1428
  23. smftools/hmm/__init__.py +24 -14
  24. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  25. smftools/hmm/archived/calculate_distances.py +2 -0
  26. smftools/hmm/archived/call_hmm_peaks.py +18 -1
  27. smftools/hmm/archived/train_hmm.py +2 -0
  28. smftools/hmm/call_hmm_peaks.py +176 -193
  29. smftools/hmm/display_hmm.py +23 -7
  30. smftools/hmm/hmm_readwrite.py +20 -6
  31. smftools/hmm/nucleosome_hmm_refinement.py +104 -14
  32. smftools/informatics/__init__.py +55 -13
  33. smftools/informatics/archived/bam_conversion.py +2 -0
  34. smftools/informatics/archived/bam_direct.py +2 -0
  35. smftools/informatics/archived/basecall_pod5s.py +2 -0
  36. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  37. smftools/informatics/archived/conversion_smf.py +2 -0
  38. smftools/informatics/archived/deaminase_smf.py +1 -0
  39. smftools/informatics/archived/direct_smf.py +2 -0
  40. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  41. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  42. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
  43. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  44. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  45. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  46. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  47. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  48. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  49. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  50. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  52. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  53. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  54. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  55. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  56. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  57. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  58. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  59. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  60. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  61. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  62. smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
  63. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  64. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  65. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  66. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  67. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  68. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  69. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  70. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
  71. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  72. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  73. smftools/informatics/archived/print_bam_query_seq.py +9 -1
  74. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  75. smftools/informatics/archived/subsample_pod5.py +2 -0
  76. smftools/informatics/bam_functions.py +1059 -269
  77. smftools/informatics/basecalling.py +53 -9
  78. smftools/informatics/bed_functions.py +357 -114
  79. smftools/informatics/binarize_converted_base_identities.py +21 -7
  80. smftools/informatics/complement_base_list.py +9 -6
  81. smftools/informatics/converted_BAM_to_adata.py +324 -137
  82. smftools/informatics/fasta_functions.py +251 -89
  83. smftools/informatics/h5ad_functions.py +202 -30
  84. smftools/informatics/modkit_extract_to_adata.py +623 -274
  85. smftools/informatics/modkit_functions.py +87 -44
  86. smftools/informatics/ohe.py +46 -21
  87. smftools/informatics/pod5_functions.py +114 -74
  88. smftools/informatics/run_multiqc.py +20 -14
  89. smftools/logging_utils.py +51 -0
  90. smftools/machine_learning/__init__.py +23 -12
  91. smftools/machine_learning/data/__init__.py +2 -0
  92. smftools/machine_learning/data/anndata_data_module.py +157 -50
  93. smftools/machine_learning/data/preprocessing.py +4 -1
  94. smftools/machine_learning/evaluation/__init__.py +3 -1
  95. smftools/machine_learning/evaluation/eval_utils.py +13 -14
  96. smftools/machine_learning/evaluation/evaluators.py +52 -34
  97. smftools/machine_learning/inference/__init__.py +3 -1
  98. smftools/machine_learning/inference/inference_utils.py +9 -4
  99. smftools/machine_learning/inference/lightning_inference.py +14 -13
  100. smftools/machine_learning/inference/sklearn_inference.py +8 -8
  101. smftools/machine_learning/inference/sliding_window_inference.py +37 -25
  102. smftools/machine_learning/models/__init__.py +12 -5
  103. smftools/machine_learning/models/base.py +34 -43
  104. smftools/machine_learning/models/cnn.py +22 -13
  105. smftools/machine_learning/models/lightning_base.py +78 -42
  106. smftools/machine_learning/models/mlp.py +18 -5
  107. smftools/machine_learning/models/positional.py +10 -4
  108. smftools/machine_learning/models/rnn.py +8 -3
  109. smftools/machine_learning/models/sklearn_models.py +46 -24
  110. smftools/machine_learning/models/transformer.py +75 -55
  111. smftools/machine_learning/models/wrappers.py +8 -3
  112. smftools/machine_learning/training/__init__.py +4 -2
  113. smftools/machine_learning/training/train_lightning_model.py +42 -23
  114. smftools/machine_learning/training/train_sklearn_model.py +11 -15
  115. smftools/machine_learning/utils/__init__.py +3 -1
  116. smftools/machine_learning/utils/device.py +12 -5
  117. smftools/machine_learning/utils/grl.py +8 -2
  118. smftools/metadata.py +443 -0
  119. smftools/optional_imports.py +31 -0
  120. smftools/plotting/__init__.py +32 -17
  121. smftools/plotting/autocorrelation_plotting.py +153 -48
  122. smftools/plotting/classifiers.py +175 -73
  123. smftools/plotting/general_plotting.py +350 -168
  124. smftools/plotting/hmm_plotting.py +53 -14
  125. smftools/plotting/position_stats.py +155 -87
  126. smftools/plotting/qc_plotting.py +25 -12
  127. smftools/preprocessing/__init__.py +35 -37
  128. smftools/preprocessing/append_base_context.py +105 -79
  129. smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
  130. smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
  131. smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
  132. smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
  133. smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
  134. smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
  135. smftools/preprocessing/binarize.py +21 -4
  136. smftools/preprocessing/binarize_on_Youden.py +127 -31
  137. smftools/preprocessing/binary_layers_to_ohe.py +18 -11
  138. smftools/preprocessing/calculate_complexity_II.py +89 -59
  139. smftools/preprocessing/calculate_consensus.py +28 -19
  140. smftools/preprocessing/calculate_coverage.py +44 -22
  141. smftools/preprocessing/calculate_pairwise_differences.py +4 -1
  142. smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
  143. smftools/preprocessing/calculate_position_Youden.py +110 -55
  144. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  145. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  146. smftools/preprocessing/clean_NaN.py +38 -28
  147. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  148. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
  149. smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
  150. smftools/preprocessing/flag_duplicate_reads.py +708 -303
  151. smftools/preprocessing/invert_adata.py +26 -11
  152. smftools/preprocessing/load_sample_sheet.py +40 -22
  153. smftools/preprocessing/make_dirs.py +9 -3
  154. smftools/preprocessing/min_non_diagonal.py +4 -1
  155. smftools/preprocessing/recipes.py +58 -23
  156. smftools/preprocessing/reindex_references_adata.py +93 -27
  157. smftools/preprocessing/subsample_adata.py +33 -16
  158. smftools/readwrite.py +264 -109
  159. smftools/schema/__init__.py +11 -0
  160. smftools/schema/anndata_schema_v1.yaml +227 -0
  161. smftools/tools/__init__.py +25 -18
  162. smftools/tools/archived/apply_hmm.py +2 -0
  163. smftools/tools/archived/classifiers.py +165 -0
  164. smftools/tools/archived/classify_methylated_features.py +2 -0
  165. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  166. smftools/tools/archived/subset_adata_v1.py +12 -1
  167. smftools/tools/archived/subset_adata_v2.py +14 -1
  168. smftools/tools/calculate_umap.py +56 -15
  169. smftools/tools/cluster_adata_on_methylation.py +122 -47
  170. smftools/tools/general_tools.py +70 -25
  171. smftools/tools/position_stats.py +220 -99
  172. smftools/tools/read_stats.py +50 -29
  173. smftools/tools/spatial_autocorrelation.py +365 -192
  174. smftools/tools/subset_adata.py +23 -21
  175. smftools-0.3.0.dist-info/METADATA +147 -0
  176. smftools-0.3.0.dist-info/RECORD +182 -0
  177. smftools-0.2.4.dist-info/METADATA +0 -141
  178. smftools-0.2.4.dist-info/RECORD +0 -176
  179. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
  180. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
  181. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,13 +1,23 @@
1
+ from __future__ import annotations
2
+
1
3
  import shutil
2
4
  from pathlib import Path
3
- from typing import Union, Iterable
5
+ from typing import Iterable, Union
6
+
7
+ import numpy as np
8
+
9
+ from smftools.logging_utils import get_logger
4
10
 
5
11
  from .helpers import AdataPaths
6
12
 
13
+ logger = get_logger(__name__)
14
+
15
+
7
16
  def check_executable_exists(cmd: str) -> bool:
8
17
  """Return True if a command-line executable is available in PATH."""
9
18
  return shutil.which(cmd) is not None
10
19
 
20
+
11
21
  def delete_tsvs(
12
22
  tsv_dir: Union[str, Path, Iterable[str], None],
13
23
  *,
@@ -27,48 +37,140 @@ def delete_tsvs(
27
37
  verbose : bool
28
38
  Print progress / warnings.
29
39
  """
40
+
30
41
  # Helper: remove a single file path (Path-like or string)
31
42
  def _maybe_unlink(p: Path):
32
43
  if not p.exists():
33
44
  if verbose:
34
- print(f"[skip] not found: {p}")
45
+ logger.info(f"[skip] not found: {p}")
35
46
  return
36
47
  if not p.is_file():
37
48
  if verbose:
38
- print(f"[skip] not a file: {p}")
49
+ logger.info(f"[skip] not a file: {p}")
39
50
  return
40
51
  if dry_run:
41
- print(f"[dry-run] would remove file: {p}")
52
+ logger.info(f"[dry-run] would remove file: {p}")
42
53
  return
43
54
  try:
44
55
  p.unlink()
45
56
  if verbose:
46
- print(f"Removed file: {p}")
57
+ logger.info(f"Removed file: {p}")
47
58
  except Exception as e:
48
- print(f"[error] failed to remove file {p}: {e}")
59
+ logger.warning(f"Failed to remove file {p}: {e}")
49
60
 
50
61
  # Remove tmp_dir recursively (if provided)
51
62
  if tsv_dir is not None:
52
63
  td = Path(tsv_dir)
53
64
  if not td.exists():
54
65
  if verbose:
55
- print(f"[skip] tsv_dir not found: {td}")
66
+ logger.info(f"[skip] tsv_dir not found: {td}")
56
67
  else:
57
68
  if not td.is_dir():
58
69
  if verbose:
59
- print(f"[skip] tsv_dir is not a directory: {td}")
70
+ logger.info(f"[skip] tsv_dir is not a directory: {td}")
60
71
  else:
61
72
  if dry_run:
62
- print(f"[dry-run] would remove directory tree: {td}")
73
+ logger.info(f"[dry-run] would remove directory tree: {td}")
63
74
  else:
64
75
  try:
65
76
  shutil.rmtree(td)
66
77
  if verbose:
67
- print(f"Removed directory tree: {td}")
78
+ logger.info(f"Removed directory tree: {td}")
68
79
  except Exception as e:
69
- print(f"[error] failed to remove tmp dir {td}: {e}")
80
+ logger.warning(f"[error] failed to remove tmp dir {td}: {e}")
81
+
82
+
83
+ def load_adata(config_path: str):
84
+ """
85
+ CLI-facing wrapper for the load pipeline.
86
+
87
+ - Reads config CSV into ExperimentConfig
88
+ - Computes canonical paths for all downstream AnnData stages
89
+ - Registers those in the summary CSV
90
+ - Applies stage-skipping logic (hmm > spatial > pp_dedup > pp > raw)
91
+ - If needed, calls the core pipeline to actually build the raw AnnData
70
92
 
71
- def load_adata_core(cfg, paths: AdataPaths):
93
+ Returns
94
+ -------
95
+ adata : anndata.AnnData | None
96
+ Newly created AnnData object, or None if we skipped because a later-stage
97
+ AnnData already exists.
98
+ adata_path : pathlib.Path
99
+ Path to the "current" AnnData that should be used downstream.
100
+ cfg : ExperimentConfig
101
+ Config object for downstream steps.
102
+ """
103
+ from datetime import datetime
104
+ from importlib import resources
105
+
106
+ from ..config import ExperimentConfig, LoadExperimentConfig
107
+ from ..readwrite import add_or_update_column_in_csv, make_dirs
108
+ from .helpers import get_adata_paths
109
+
110
+ date_str = datetime.today().strftime("%y%m%d")
111
+
112
+ # -----------------------------
113
+ # 1) Load config into cfg
114
+ # -----------------------------
115
+ loader = LoadExperimentConfig(config_path)
116
+ defaults_dir = resources.files("smftools").joinpath("config")
117
+ cfg, report = ExperimentConfig.from_var_dict(
118
+ loader.var_dict, date_str=date_str, defaults_dir=defaults_dir
119
+ )
120
+
121
+ # Ensure base output dir
122
+ make_dirs([cfg.output_directory])
123
+
124
+ # -----------------------------
125
+ # 2) Compute and register paths
126
+ # -----------------------------
127
+ paths = get_adata_paths(cfg)
128
+
129
+ # experiment-level metadata in summary CSV
130
+ add_or_update_column_in_csv(cfg.summary_file, "experiment_name", cfg.experiment_name)
131
+ add_or_update_column_in_csv(cfg.summary_file, "config_path", config_path)
132
+ add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
133
+ add_or_update_column_in_csv(cfg.summary_file, "input_files", [cfg.input_files])
134
+
135
+ # AnnData stage paths
136
+ add_or_update_column_in_csv(cfg.summary_file, "load_adata", paths.raw)
137
+ add_or_update_column_in_csv(cfg.summary_file, "pp_adata", paths.pp)
138
+ add_or_update_column_in_csv(cfg.summary_file, "pp_dedup_adata", paths.pp_dedup)
139
+ add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", paths.spatial)
140
+ add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", paths.hmm)
141
+
142
+ # -----------------------------
143
+ # 3) Stage skipping logic
144
+ # -----------------------------
145
+ if not getattr(cfg, "force_redo_load_adata", False):
146
+ if paths.hmm.exists():
147
+ logger.debug(f"HMM AnnData already exists: {paths.hmm}\nSkipping smftools load")
148
+ return None, paths.hmm, cfg
149
+ if paths.spatial.exists():
150
+ logger.debug(f"Spatial AnnData already exists: {paths.spatial}\nSkipping smftools load")
151
+ return None, paths.spatial, cfg
152
+ if paths.pp_dedup.exists():
153
+ logger.debug(
154
+ f"Preprocessed deduplicated AnnData already exists: {paths.pp_dedup}\n"
155
+ f"Skipping smftools load"
156
+ )
157
+ return None, paths.pp_dedup, cfg
158
+ if paths.pp.exists():
159
+ logger.debug(f"Preprocessed AnnData already exists: {paths.pp}\nSkipping smftools load")
160
+ return None, paths.pp, cfg
161
+ if paths.raw.exists():
162
+ logger.debug(
163
+ f"Raw AnnData from smftools load already exists: {paths.raw}\nSkipping smftools load"
164
+ )
165
+ return None, paths.raw, cfg
166
+
167
+ # If we get here, we actually want to run the full load pipeline
168
+ adata, adata_path, cfg = load_adata_core(cfg, paths, config_path=config_path)
169
+
170
+ return adata, adata_path, cfg
171
+
172
+
173
+ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
72
174
  """
73
175
  Core load pipeline.
74
176
 
@@ -97,28 +199,31 @@ def load_adata_core(cfg, paths: AdataPaths):
97
199
  cfg : ExperimentConfig
98
200
  (Same object, possibly with some fields updated, e.g. fasta path.)
99
201
  """
100
- import os
101
- from pathlib import Path
102
-
103
- import numpy as np
104
- import pandas as pd
105
- import anndata as ad
106
- import scanpy as sc
107
202
 
108
- from .helpers import write_gz_h5ad
109
-
110
- from ..readwrite import make_dirs, add_or_update_column_in_csv
111
-
112
- from ..informatics.bam_functions import concatenate_fastqs_to_bam, align_and_sort_BAM, demux_and_index_BAM, split_and_index_BAM, bam_qc, extract_read_features_from_bam
203
+ from ..informatics.bam_functions import (
204
+ align_and_sort_BAM,
205
+ bam_qc,
206
+ concatenate_fastqs_to_bam,
207
+ demux_and_index_BAM,
208
+ extract_read_features_from_bam,
209
+ split_and_index_BAM,
210
+ )
211
+ from ..informatics.basecalling import canoncall, modcall
113
212
  from ..informatics.bed_functions import aligned_BAM_to_bed
114
- from ..informatics.pod5_functions import fast5_to_pod5
115
- from ..informatics.fasta_functions import subsample_fasta_from_bed, generate_converted_FASTA, get_chromosome_lengths
116
- from ..informatics.basecalling import modcall, canoncall
117
- from ..informatics.modkit_functions import modQC, make_modbed, extract_mods
118
- from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
119
213
  from ..informatics.converted_BAM_to_adata import converted_BAM_to_adata
214
+ from ..informatics.fasta_functions import (
215
+ generate_converted_FASTA,
216
+ get_chromosome_lengths,
217
+ subsample_fasta_from_bed,
218
+ )
120
219
  from ..informatics.h5ad_functions import add_read_length_and_mapping_qc
220
+ from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
221
+ from ..informatics.modkit_functions import extract_mods, make_modbed, modQC
222
+ from ..informatics.pod5_functions import fast5_to_pod5
121
223
  from ..informatics.run_multiqc import run_multiqc
224
+ from ..metadata import record_smftools_metadata
225
+ from ..readwrite import add_or_update_column_in_csv, make_dirs
226
+ from .helpers import write_gz_h5ad
122
227
 
123
228
  ################################### 1) General params and input organization ###################################
124
229
  output_directory = Path(cfg.output_directory)
@@ -169,19 +274,20 @@ def load_adata_core(cfg, paths: AdataPaths):
169
274
  if cfg.aligner == "minimap2":
170
275
  if not check_executable_exists("minimap2"):
171
276
  raise RuntimeError(
172
- "Error: 'minimap2' is not installed or not in PATH. "
173
- "Install minimap2"
277
+ "Error: 'minimap2' is not installed or not in PATH. Install minimap2"
174
278
  )
175
279
 
176
280
  # # Detect the input filetypes
177
281
  # If the input files are fast5 files, convert the files to a pod5 file before proceeding.
178
282
  if cfg.input_type == "fast5":
179
283
  # take the input directory of fast5 files and write out a single pod5 file into the output directory.
180
- output_pod5 = cfg.output_directory / 'FAST5s_to_POD5.pod5'
284
+ output_pod5 = cfg.output_directory / "FAST5s_to_POD5.pod5"
181
285
  if output_pod5.exists():
182
286
  pass
183
287
  else:
184
- print(f'Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}')
288
+ logger.info(
289
+ f"Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}"
290
+ )
185
291
  fast5_to_pod5(cfg.input_data_path, output_pod5)
186
292
  # Reassign the pod5_dir variable to point to the new pod5 file.
187
293
  cfg.input_data_path = output_pod5
@@ -189,22 +295,25 @@ def load_adata_core(cfg, paths: AdataPaths):
189
295
  # If the input is a fastq or a directory of fastqs, concatenate them into an unaligned BAM and save the barcode
190
296
  elif cfg.input_type == "fastq":
191
297
  # Output file for FASTQ concatenation.
192
- output_bam = cfg.output_directory / 'canonical_basecalls.bam'
298
+ output_bam = cfg.output_directory / "canonical_basecalls.bam"
193
299
  if output_bam.exists():
194
- pass
300
+ logger.debug("Output BAM already exists")
195
301
  else:
302
+ logger.info("Concatenating FASTQ files into a single BAM file")
196
303
  summary = concatenate_fastqs_to_bam(
197
304
  cfg.input_files,
198
305
  output_bam,
199
- barcode_tag='BC',
200
- gzip_suffixes=('.gz','.gzip'),
306
+ barcode_tag="BC",
307
+ gzip_suffixes=(".gz", ".gzip"),
201
308
  barcode_map=cfg.fastq_barcode_map,
202
309
  add_read_group=True,
203
310
  rg_sample_field=None,
204
311
  progress=False,
205
- auto_pair=cfg.fastq_auto_pairing)
206
-
207
- print(f"Found the following barcodes: {summary['barcodes']}")
312
+ auto_pair=cfg.fastq_auto_pairing,
313
+ samtools_backend=cfg.samtools_backend,
314
+ )
315
+
316
+ logger.info(f"Found the following barcodes in FASTQ inputs: {summary['barcodes']}")
208
317
 
209
318
  # Set the input data path to the concatenated BAM.
210
319
  cfg.input_data_path = output_bam
@@ -213,24 +322,24 @@ def load_adata_core(cfg, paths: AdataPaths):
213
322
  pass
214
323
  else:
215
324
  pass
216
-
325
+
217
326
  add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
218
327
 
219
328
  # Determine if the input data needs to be basecalled
220
329
  if cfg.input_type == "pod5":
221
- print(f'Detected pod5 inputs: {cfg.input_files}')
330
+ logger.info(f"Detected pod5 inputs: {cfg.input_files}")
222
331
  basecall = True
223
332
  elif cfg.input_type in ["bam"]:
224
- print(f'Detected bam input: {cfg.input_files}')
333
+ logger.info(f"Detected bam input: {cfg.input_files}")
225
334
  basecall = False
226
335
  else:
227
- print('Error, can not find input bam or pod5')
336
+ logger.info("Error, can not find input bam or pod5")
228
337
 
229
338
  # Generate the base name of the unaligned bam without the .bam suffix
230
339
  if basecall:
231
340
  model_basename = Path(cfg.model).name
232
- model_basename = str(model_basename).replace('.', '_')
233
- if cfg.smf_modality == 'direct':
341
+ model_basename = str(model_basename).replace(".", "_")
342
+ if cfg.smf_modality == "direct":
234
343
  mod_string = "_".join(cfg.mod_list)
235
344
  bam = cfg.output_directory / f"{model_basename}_{mod_string}_calls"
236
345
  else:
@@ -241,7 +350,9 @@ def load_adata_core(cfg, paths: AdataPaths):
241
350
 
242
351
  # Generate path names for the unaligned, aligned, as well as the aligned/sorted bam.
243
352
  unaligned_output = bam.with_suffix(cfg.bam_suffix)
244
- aligned_BAM = cfg.output_directory / (bam.stem + "_aligned") # doing this allows specifying an input bam in a seperate directory as the aligned output bams
353
+ aligned_BAM = (
354
+ cfg.output_directory / (bam.stem + "_aligned")
355
+ ) # doing this allows specifying an input bam in a seperate directory as the aligned output bams
245
356
  aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
246
357
  aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
247
358
  aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
@@ -252,34 +363,40 @@ def load_adata_core(cfg, paths: AdataPaths):
252
363
  ########################################################################################################################
253
364
 
254
365
  ################################### 2) FASTA Handling ###################################
255
- from ..informatics.fasta_functions import generate_converted_FASTA, get_chromosome_lengths
256
366
 
257
367
  try:
258
368
  cfg.fasta = Path(cfg.fasta)
259
- except:
260
- print("Need to provide an input FASTA path to proceed with smftools load")
369
+ except Exception:
370
+ logger.warning("Need to provide an input FASTA path to proceed with smftools load")
261
371
 
262
372
  # If fasta_regions_of_interest bed is passed, subsample the input FASTA on regions of interest and use the subsampled FASTA.
263
- if cfg.fasta_regions_of_interest and '.bed' in cfg.fasta_regions_of_interest:
264
- fasta_basename = cfg.fasta.parent / cfg.fasta.stem
265
- bed_basename_minus_suffix = Path(cfg.fasta_regions_of_interest).stem
266
- output_FASTA = fasta_basename.with_name(fasta_basename.name + '_subsampled_by_' + bed_basename_minus_suffix + '.fasta')
267
- subsample_fasta_from_bed(cfg.fasta, cfg.fasta_regions_of_interest, cfg.output_directory, output_FASTA)
268
- fasta = cfg.output_directory / output_FASTA
373
+ if cfg.fasta_regions_of_interest and ".bed" in cfg.fasta_regions_of_interest:
374
+ fasta_stem = cfg.fasta.stem
375
+ bed_stem = Path(cfg.fasta_regions_of_interest).stem
376
+ output_FASTA = cfg.output_directory / f"{fasta_stem}_subsampled_by_{bed_stem}.fasta"
377
+
378
+ logger.info("Subsampling FASTA records using the provided BED file")
379
+ subsample_fasta_from_bed(
380
+ cfg.fasta, cfg.fasta_regions_of_interest, cfg.output_directory, output_FASTA
381
+ )
382
+ fasta = output_FASTA
269
383
  else:
384
+ logger.info("Using the full FASTA file")
270
385
  fasta = cfg.fasta
271
386
 
272
387
  # For conversion style SMF, make a converted reference FASTA
273
- if cfg.smf_modality == 'conversion':
274
- fasta_basename = fasta.parent / fasta.stem
275
- converted_FASTA_basename = fasta_basename.with_name(fasta_basename.name + '_converted.fasta')
388
+ if cfg.smf_modality == "conversion":
389
+ fasta_stem = fasta.stem
390
+ converted_FASTA_basename = f"{fasta_stem}_converted.fasta"
276
391
  converted_FASTA = cfg.output_directory / converted_FASTA_basename
277
- if 'converted.fa' in fasta.name:
278
- print(f'{fasta} is already converted. Using existing converted FASTA.')
392
+
393
+ if "converted.fa" in fasta.name:
394
+ logger.info(f"{fasta} is already converted. Using existing converted FASTA.")
279
395
  converted_FASTA = fasta
280
396
  elif converted_FASTA.exists():
281
- print(f'{converted_FASTA} already exists. Using existing converted FASTA.')
397
+ logger.info(f"{converted_FASTA} already exists. Using existing converted FASTA.")
282
398
  else:
399
+ logger.info(f"Converting FASTA base sequences")
283
400
  generate_converted_FASTA(fasta, cfg.conversion_types, cfg.strands, converted_FASTA)
284
401
  fasta = converted_FASTA
285
402
 
@@ -290,121 +407,176 @@ def load_adata_core(cfg, paths: AdataPaths):
290
407
  ########################################################################################################################
291
408
 
292
409
  ################################### 3) Basecalling ###################################
293
- from ..informatics.basecalling import modcall, canoncall
410
+
294
411
  # 1) Basecall using dorado
295
- if basecall and cfg.sequencer == 'ont':
412
+ if basecall and cfg.sequencer == "ont":
296
413
  try:
297
414
  cfg.model_dir = Path(cfg.model_dir)
298
- except:
299
- print("Need to provide a valid path to a dorado model directory to use dorado basecalling")
415
+ except Exception:
416
+ logger.warning(
417
+ "Need to provide a valid path to a dorado model directory to use dorado basecalling"
418
+ )
300
419
  if aligned_sorted_output.exists():
301
- print(f'{aligned_sorted_output} already exists. Using existing basecalled, aligned, sorted BAM.')
420
+ logger.info(
421
+ f"{aligned_sorted_output} already exists. Using existing basecalled, aligned, sorted BAM."
422
+ )
302
423
  elif unaligned_output.exists():
303
- print(f'{unaligned_output} already exists. Using existing basecalled BAM.')
304
- elif cfg.smf_modality != 'direct':
305
- canoncall(str(cfg.model_dir), cfg.model, str(cfg.input_data_path), cfg.barcode_kit, str(bam), cfg.bam_suffix, cfg.barcode_both_ends, cfg.trim, cfg.device)
424
+ logger.info(f"{unaligned_output} already exists. Using existing basecalled BAM.")
425
+ elif cfg.smf_modality != "direct":
426
+ logger.info("Running canonical basecalling using dorado")
427
+ canoncall(
428
+ str(cfg.model_dir),
429
+ cfg.model,
430
+ str(cfg.input_data_path),
431
+ cfg.barcode_kit,
432
+ str(bam),
433
+ cfg.bam_suffix,
434
+ cfg.barcode_both_ends,
435
+ cfg.trim,
436
+ cfg.device,
437
+ )
306
438
  else:
307
- modcall(str(cfg.model_dir), cfg.model, str(cfg.input_data_path), cfg.barcode_kit, cfg.mod_list, str(bam), cfg.bam_suffix, cfg.barcode_both_ends, cfg.trim, cfg.device)
439
+ logger.info("Running modified basecalling using dorado")
440
+ modcall(
441
+ str(cfg.model_dir),
442
+ cfg.model,
443
+ str(cfg.input_data_path),
444
+ cfg.barcode_kit,
445
+ cfg.mod_list,
446
+ str(bam),
447
+ cfg.bam_suffix,
448
+ cfg.barcode_both_ends,
449
+ cfg.trim,
450
+ cfg.device,
451
+ )
308
452
  elif basecall:
309
- print(f"Basecalling is currently only supported for ont sequencers and not pacbio.")
453
+ logger.error("Basecalling is currently only supported for ont sequencers and not pacbio.")
310
454
  else:
311
455
  pass
312
456
  ########################################################################################################################
313
457
 
314
458
  ################################### 4) Alignment and sorting #############################################
315
- from ..informatics.bam_functions import align_and_sort_BAM
316
- from ..informatics.bed_functions import aligned_BAM_to_bed
459
+
317
460
  # 3) Align the BAM to the reference FASTA and sort the bam on positional coordinates. Also make an index and a bed file of mapped reads
318
461
  if aligned_sorted_output.exists():
319
- print(f'{aligned_sorted_output} already exists. Using existing aligned/sorted BAM.')
462
+ logger.debug(f"{aligned_sorted_output} already exists. Using existing aligned/sorted BAM.")
320
463
  else:
464
+ logger.info(f"Aligning and sorting reads")
321
465
  align_and_sort_BAM(fasta, unaligned_output, cfg)
322
466
  # Deleted the unsorted aligned output
323
467
  aligned_output.unlink()
324
468
 
325
469
  if cfg.make_beds:
326
470
  # Make beds and provide basic histograms
327
- bed_dir = cfg.output_directory / 'beds'
471
+ bed_dir = cfg.output_directory / "beds"
328
472
  if bed_dir.is_dir():
329
- print(f'{bed_dir} already exists. Skipping BAM -> BED conversion for {aligned_sorted_output}')
473
+ logger.debug(
474
+ f"{bed_dir} already exists. Skipping BAM -> BED conversion for {aligned_sorted_output}"
475
+ )
330
476
  else:
331
- aligned_BAM_to_bed(aligned_sorted_output, cfg.output_directory, fasta, cfg.make_bigwigs, cfg.threads)
477
+ logger.info("Making bed files from the aligned and sorted BAM file")
478
+ aligned_BAM_to_bed(
479
+ aligned_sorted_output,
480
+ cfg.output_directory,
481
+ fasta,
482
+ cfg.make_bigwigs,
483
+ cfg.threads,
484
+ samtools_backend=cfg.samtools_backend,
485
+ bedtools_backend=cfg.bedtools_backend,
486
+ bigwig_backend=cfg.bigwig_backend,
487
+ )
332
488
  ########################################################################################################################
333
489
 
334
490
  ################################### 5) Demultiplexing ######################################################################
335
- from ..informatics.bam_functions import demux_and_index_BAM, split_and_index_BAM
491
+
336
492
  # 3) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory
337
493
  if cfg.input_already_demuxed:
338
494
  if cfg.split_path.is_dir():
339
- print(f"{cfg.split_path} already exists. Using existing demultiplexed BAMs.")
495
+ logger.debug(f"{cfg.split_path} already exists. Using existing demultiplexed BAMs.")
340
496
 
341
497
  all_bam_files = sorted(
342
- p for p in cfg.split_path.iterdir()
343
- if p.is_file()
344
- and p.suffix == cfg.bam_suffix
498
+ p for p in cfg.split_path.iterdir() if p.is_file() and p.suffix == cfg.bam_suffix
345
499
  )
346
500
  unclassified_bams = [p for p in all_bam_files if "unclassified" in p.name]
347
501
  bam_files = [p for p in all_bam_files if "unclassified" not in p.name]
348
502
 
349
503
  else:
350
504
  make_dirs([cfg.split_path])
351
- all_bam_files = split_and_index_BAM(aligned_sorted_BAM,
352
- cfg.split_path,
353
- cfg.bam_suffix)
354
-
505
+ logger.info("Demultiplexing samples into individual aligned/sorted BAM files")
506
+ all_bam_files = split_and_index_BAM(
507
+ aligned_sorted_BAM,
508
+ cfg.split_path,
509
+ cfg.bam_suffix,
510
+ samtools_backend=cfg.samtools_backend,
511
+ )
512
+
355
513
  unclassified_bams = [p for p in all_bam_files if "unclassified" in p.name]
356
514
  bam_files = sorted(p for p in all_bam_files if "unclassified" not in p.name)
357
515
 
358
516
  se_bam_files = bam_files
359
517
  bam_dir = cfg.split_path
360
-
518
+
361
519
  else:
362
520
  if single_barcoded_path.is_dir():
363
- print(f"{single_barcoded_path} already exists. Using existing single ended demultiplexed BAMs.")
521
+ logger.debug(
522
+ f"{single_barcoded_path} already exists. Using existing single ended demultiplexed BAMs."
523
+ )
364
524
 
365
525
  all_se_bam_files = sorted(
366
- p for p in single_barcoded_path.iterdir()
367
- if p.is_file()
368
- and p.suffix == cfg.bam_suffix
369
- )
526
+ p
527
+ for p in single_barcoded_path.iterdir()
528
+ if p.is_file() and p.suffix == cfg.bam_suffix
529
+ )
370
530
  unclassified_se_bams = [p for p in all_se_bam_files if "unclassified" in p.name]
371
531
  se_bam_files = [p for p in all_se_bam_files if "unclassified" not in p.name]
372
532
  else:
373
- make_dirs([cfg.split_path, single_barcoded_path])
374
- all_se_bam_files = demux_and_index_BAM(aligned_sorted_BAM,
375
- single_barcoded_path,
376
- cfg.bam_suffix,
377
- cfg.barcode_kit,
378
- False,
379
- cfg.trim,
380
- cfg.threads)
381
-
533
+ make_dirs([cfg.split_path, single_barcoded_path])
534
+ logger.info(
535
+ "Demultiplexing samples into individual aligned/sorted BAM files based on single end barcode status with Dorado"
536
+ )
537
+ all_se_bam_files = demux_and_index_BAM(
538
+ aligned_sorted_BAM,
539
+ single_barcoded_path,
540
+ cfg.bam_suffix,
541
+ cfg.barcode_kit,
542
+ False,
543
+ cfg.trim,
544
+ cfg.threads,
545
+ )
546
+
382
547
  unclassified_se_bams = [p for p in all_se_bam_files if "unclassified" in p.name]
383
548
  se_bam_files = [p for p in all_se_bam_files if "unclassified" not in p.name]
384
-
549
+
385
550
  if double_barcoded_path.is_dir():
386
- print(f"{double_barcoded_path} already exists. Using existing double ended demultiplexed BAMs.")
551
+ logger.debug(
552
+ f"{double_barcoded_path} already exists. Using existing double ended demultiplexed BAMs."
553
+ )
387
554
 
388
555
  all_de_bam_files = sorted(
389
- p for p in double_barcoded_path.iterdir()
390
- if p.is_file()
391
- and p.suffix == cfg.bam_suffix
392
- )
556
+ p
557
+ for p in double_barcoded_path.iterdir()
558
+ if p.is_file() and p.suffix == cfg.bam_suffix
559
+ )
393
560
  unclassified_de_bams = [p for p in all_de_bam_files if "unclassified" in p.name]
394
561
  de_bam_files = [p for p in all_de_bam_files if "unclassified" not in p.name]
395
- else:
396
- make_dirs([cfg.split_path, double_barcoded_path])
397
- all_de_bam_files = demux_and_index_BAM(aligned_sorted_BAM,
398
- double_barcoded_path,
399
- cfg.bam_suffix,
400
- cfg.barcode_kit,
401
- True,
402
- cfg.trim,
403
- cfg.threads)
404
-
562
+ else:
563
+ make_dirs([cfg.split_path, double_barcoded_path])
564
+ logger.info(
565
+ "Demultiplexing samples into individual aligned/sorted BAM files based on double end barcode status with Dorado"
566
+ )
567
+ all_de_bam_files = demux_and_index_BAM(
568
+ aligned_sorted_BAM,
569
+ double_barcoded_path,
570
+ cfg.bam_suffix,
571
+ cfg.barcode_kit,
572
+ True,
573
+ cfg.trim,
574
+ cfg.threads,
575
+ )
576
+
405
577
  unclassified_de_bams = [p for p in all_de_bam_files if "unclassified" in p.name]
406
578
  de_bam_files = [p for p in all_de_bam_files if "unclassified" not in p.name]
407
-
579
+
408
580
  bam_files = se_bam_files + de_bam_files
409
581
  unclassified_bams = unclassified_se_bams + unclassified_de_bams
410
582
  bam_dir = single_barcoded_path
@@ -413,225 +585,205 @@ def load_adata_core(cfg, paths: AdataPaths):
413
585
 
414
586
  if cfg.make_beds:
415
587
  # Make beds and provide basic histograms
416
- bed_dir = cfg.split_path / 'beds'
588
+ bed_dir = cfg.split_path / "beds"
417
589
  if bed_dir.is_dir():
418
- print(f'{bed_dir} already exists. Skipping BAM -> BED conversion for demultiplexed bams')
590
+ logger.debug(
591
+ f"{bed_dir} already exists. Skipping BAM -> BED conversion for demultiplexed bams"
592
+ )
419
593
  else:
594
+ logger.info("Making BED files from BAM files for each sample")
420
595
  for bam in bam_files:
421
- aligned_BAM_to_bed(bam, cfg.split_path, fasta, cfg.make_bigwigs, cfg.threads)
596
+ aligned_BAM_to_bed(
597
+ bam,
598
+ cfg.split_path,
599
+ fasta,
600
+ cfg.make_bigwigs,
601
+ cfg.threads,
602
+ samtools_backend=cfg.samtools_backend,
603
+ bedtools_backend=cfg.bedtools_backend,
604
+ bigwig_backend=cfg.bigwig_backend,
605
+ )
422
606
  ########################################################################################################################
423
607
 
424
608
  ################################### 6) SAMTools based BAM QC ######################################################################
425
- from ..informatics.bam_functions import bam_qc
609
+
426
610
  # 5) Samtools QC metrics on split BAM files
427
611
  bam_qc_dir = cfg.split_path / "bam_qc"
428
612
  if bam_qc_dir.is_dir():
429
- print( f'{bam_qc_dir} already exists. Using existing BAM QC calculations.')
613
+ logger.debug(f"{bam_qc_dir} already exists. Using existing BAM QC calculations.")
430
614
  else:
431
615
  make_dirs([bam_qc_dir])
432
- bam_qc(bam_files, bam_qc_dir, cfg.threads, modality=cfg.smf_modality)
433
- ########################################################################################################################
616
+ logger.info("Performing BAM QC")
617
+ bam_qc(
618
+ bam_files,
619
+ bam_qc_dir,
620
+ cfg.threads,
621
+ modality=cfg.smf_modality,
622
+ samtools_backend=cfg.samtools_backend,
623
+ )
624
+ ########################################################################################################################
434
625
 
435
626
  ################################### 7) AnnData loading ######################################################################
436
- if cfg.smf_modality != 'direct':
627
+ if cfg.smf_modality != "direct":
437
628
  from ..informatics.converted_BAM_to_adata import converted_BAM_to_adata
629
+
438
630
  # 6) Take the converted BAM and load it into an adata object.
439
- if cfg.smf_modality == 'deaminase':
631
+ if cfg.smf_modality == "deaminase":
440
632
  deaminase_footprinting = True
441
633
  else:
442
634
  deaminase_footprinting = False
443
- raw_adata, raw_adata_path = converted_BAM_to_adata(fasta,
444
- bam_dir,
445
- cfg.output_directory,
446
- cfg.input_already_demuxed,
447
- cfg.mapping_threshold,
448
- cfg.experiment_name,
449
- cfg.conversion_types,
450
- cfg.bam_suffix,
451
- cfg.device,
452
- cfg.threads,
453
- deaminase_footprinting,
454
- delete_intermediates=cfg.delete_intermediate_hdfs,
455
- double_barcoded_path=double_barcoded_path)
635
+
636
+ logger.info(f"Loading Anndata from BAM files for {cfg.smf_modality} footprinting")
637
+ raw_adata, raw_adata_path = converted_BAM_to_adata(
638
+ fasta,
639
+ bam_dir,
640
+ cfg.output_directory,
641
+ cfg.input_already_demuxed,
642
+ cfg.mapping_threshold,
643
+ cfg.experiment_name,
644
+ cfg.conversion_types,
645
+ cfg.bam_suffix,
646
+ cfg.device,
647
+ cfg.threads,
648
+ deaminase_footprinting,
649
+ delete_intermediates=cfg.delete_intermediate_hdfs,
650
+ double_barcoded_path=double_barcoded_path,
651
+ samtools_backend=cfg.samtools_backend,
652
+ )
456
653
  else:
457
654
  if mod_bed_dir.is_dir():
458
- print(f'{mod_bed_dir} already exists, skipping making modbeds')
655
+ logger.debug(f"{mod_bed_dir} already exists, skipping making modbeds")
459
656
  else:
460
- from ..informatics.modkit_functions import modQC, make_modbed
461
- make_dirs([mod_bed_dir])
462
-
463
- modQC(aligned_sorted_output,
464
- cfg.thresholds) # get QC metrics for mod calls
465
-
466
- make_modbed(aligned_sorted_output,
467
- cfg.thresholds,
468
- mod_bed_dir) # Generate bed files of position methylation summaries for every sample
469
-
657
+ from ..informatics.modkit_functions import make_modbed, modQC
658
+
659
+ make_dirs([mod_bed_dir])
660
+
661
+ logger.info("Performing modQC for direct footprinting samples")
662
+
663
+ modQC(aligned_sorted_output, cfg.thresholds) # get QC metrics for mod calls
664
+
665
+ logger.info("Making modified BED files for direct footprinting samples")
666
+
667
+ make_modbed(
668
+ aligned_sorted_output, cfg.thresholds, mod_bed_dir
669
+ ) # Generate bed files of position methylation summaries for every sample
670
+
470
671
  from ..informatics.modkit_functions import extract_mods
672
+
471
673
  make_dirs([mod_tsv_dir])
472
674
 
473
- extract_mods(cfg.thresholds,
474
- mod_tsv_dir,
475
- bam_dir,
476
- cfg.bam_suffix,
477
- skip_unclassified=cfg.skip_unclassified,
478
- modkit_summary=False,
479
- threads=cfg.threads) # Extract methylations calls for split BAM files into split TSV files
480
-
675
+ logger.info(
676
+ "Extracting single read modification states into TSVs for direct footprinting samples"
677
+ )
678
+
679
+ extract_mods(
680
+ cfg.thresholds,
681
+ mod_tsv_dir,
682
+ bam_dir,
683
+ cfg.bam_suffix,
684
+ skip_unclassified=cfg.skip_unclassified,
685
+ modkit_summary=False,
686
+ threads=cfg.threads,
687
+ ) # Extract methylations calls for split BAM files into split TSV files
688
+
481
689
  from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
482
- #6 Load the modification data from TSVs into an adata object
483
- raw_adata, raw_adata_path = modkit_extract_to_adata(fasta,
484
- bam_dir,
485
- cfg.output_directory,
486
- cfg.input_already_demuxed,
487
- cfg.mapping_threshold,
488
- cfg.experiment_name,
489
- mods,
490
- cfg.batch_size,
491
- mod_tsv_dir,
492
- cfg.delete_batch_hdfs,
493
- cfg.threads,
494
- double_barcoded_path)
690
+
691
+ logger.info("Making Anndata for direct modification detection SMF samples")
692
+
693
+ # 6 Load the modification data from TSVs into an adata object
694
+ raw_adata, raw_adata_path = modkit_extract_to_adata(
695
+ fasta,
696
+ bam_dir,
697
+ cfg.output_directory,
698
+ cfg.input_already_demuxed,
699
+ cfg.mapping_threshold,
700
+ cfg.experiment_name,
701
+ mods,
702
+ cfg.batch_size,
703
+ mod_tsv_dir,
704
+ cfg.delete_batch_hdfs,
705
+ cfg.threads,
706
+ double_barcoded_path,
707
+ cfg.samtools_backend,
708
+ )
495
709
  if cfg.delete_intermediate_tsvs:
496
710
  delete_tsvs(mod_tsv_dir)
497
711
 
498
- raw_adata.obs['Experiment_name'] = [cfg.experiment_name] * raw_adata.shape[0]
499
- raw_adata.obs['Experiment_name_and_barcode'] = (raw_adata.obs['Experiment_name'].astype(str) + "_" + raw_adata.obs['Barcode'].astype(str))
712
+ raw_adata.obs["Experiment_name"] = [cfg.experiment_name] * raw_adata.shape[0]
713
+ raw_adata.obs["Experiment_name_and_barcode"] = (
714
+ raw_adata.obs["Experiment_name"].astype(str) + "_" + raw_adata.obs["Barcode"].astype(str)
715
+ )
500
716
 
501
717
  ########################################################################################################################
502
718
 
503
719
  ############################################### Add basic read length, read quality, mapping quality stats ###############################################
504
- from ..informatics.h5ad_functions import add_read_length_and_mapping_qc
505
- from ..informatics.bam_functions import extract_read_features_from_bam
506
- add_read_length_and_mapping_qc(raw_adata, se_bam_files,
507
- extract_read_features_from_bam_callable=extract_read_features_from_bam,
508
- bypass=cfg.bypass_add_read_length_and_mapping_qc,
509
- force_redo=cfg.force_redo_add_read_length_and_mapping_qc)
510
720
 
511
- raw_adata.obs['Raw_modification_signal'] = np.nansum(raw_adata.X, axis=1)
721
+ logger.info("Adding read length, mapping quality, and modification signal to Anndata")
722
+ add_read_length_and_mapping_qc(
723
+ raw_adata,
724
+ se_bam_files,
725
+ extract_read_features_from_bam_callable=extract_read_features_from_bam,
726
+ bypass=cfg.bypass_add_read_length_and_mapping_qc,
727
+ force_redo=cfg.force_redo_add_read_length_and_mapping_qc,
728
+ samtools_backend=cfg.samtools_backend,
729
+ )
730
+
731
+ raw_adata.obs["Raw_modification_signal"] = np.nansum(raw_adata.X, axis=1)
732
+ ########################################################################################################################
733
+
734
+ ############################################### if input data type was pod5, append the pod5 file origin to each read ###############################################
735
+ from ..informatics.h5ad_functions import annotate_pod5_origin
736
+
737
+ if cfg.input_type == "pod5":
738
+ logger.info("Adding the POD5 origin file to each read into Anndata")
739
+ annotate_pod5_origin(
740
+ raw_adata,
741
+ cfg.input_data_path,
742
+ n_jobs=cfg.threads,
743
+ csv_path=output_directory / "read_to_pod5_origin_mapping.csv",
744
+ )
512
745
  ########################################################################################################################
513
746
 
514
747
  ############################################### Save final adata ###############################################
515
- print(f"Saving AnnData to {raw_adata_path}")
748
+ logger.info(f"Saving AnnData to {raw_adata_path}")
749
+ record_smftools_metadata(
750
+ raw_adata,
751
+ step_name="load",
752
+ cfg=cfg,
753
+ config_path=config_path,
754
+ output_path=raw_adata_path,
755
+ )
516
756
  write_gz_h5ad(raw_adata, raw_adata_path)
517
757
  ########################################################################################################################
518
758
 
519
759
  ############################################### MultiQC HTML Report ###############################################
520
- from ..informatics.run_multiqc import run_multiqc
760
+
521
761
  # multiqc ###
522
762
  mqc_dir = cfg.split_path / "multiqc"
523
763
  if mqc_dir.is_dir():
524
- print(f'{mqc_dir} already exists, skipping multiqc')
764
+ logger.info(f"{mqc_dir} already exists, skipping multiqc")
525
765
  else:
766
+ logger.info("Running multiqc")
526
767
  run_multiqc(cfg.split_path, mqc_dir)
527
768
  ########################################################################################################################
528
769
 
529
770
  ############################################### delete intermediate BAM files ###############################################
530
771
  if cfg.delete_intermediate_bams:
772
+ logger.info("Deleting intermediate BAM files")
531
773
  # delete aligned and sorted bam
532
774
  aligned_sorted_output.unlink()
533
- bai = aligned_sorted_output.parent / (aligned_sorted_output.name + '.bai')
775
+ bai = aligned_sorted_output.parent / (aligned_sorted_output.name + ".bai")
534
776
  bai.unlink()
535
777
  # delete the demultiplexed bams. Keep the demultiplexing summary files and directories to faciliate demultiplexing in the future with these files
536
778
  for bam in bam_files:
537
- bai = bam.parent / (bam.name + '.bai')
779
+ bai = bam.parent / (bam.name + ".bai")
538
780
  bam.unlink()
539
781
  bai.unlink()
540
782
  for bam in unclassified_bams:
541
- bai = bam.parent / (bam.name + '.bai')
783
+ bai = bam.parent / (bam.name + ".bai")
542
784
  bam.unlink()
543
- bai.unlink()
785
+ bai.unlink()
786
+ logger.info("Finished deleting intermediate BAM files")
544
787
  ########################################################################################################################
545
788
 
546
789
  return raw_adata, raw_adata_path, cfg
547
-
548
- def load_adata(config_path: str):
549
- """
550
- CLI-facing wrapper for the load pipeline.
551
-
552
- - Reads config CSV into ExperimentConfig
553
- - Computes canonical paths for all downstream AnnData stages
554
- - Registers those in the summary CSV
555
- - Applies stage-skipping logic (hmm > spatial > pp_dedup > pp > raw)
556
- - If needed, calls the core pipeline to actually build the raw AnnData
557
-
558
- Returns
559
- -------
560
- adata : anndata.AnnData | None
561
- Newly created AnnData object, or None if we skipped because a later-stage
562
- AnnData already exists.
563
- adata_path : pathlib.Path
564
- Path to the "current" AnnData that should be used downstream.
565
- cfg : ExperimentConfig
566
- Config object for downstream steps.
567
- """
568
- from importlib import resources
569
- from datetime import datetime
570
- from pathlib import Path
571
-
572
- import pandas as pd # used for summary file reading downstream if needed
573
-
574
- from ..readwrite import make_dirs, add_or_update_column_in_csv
575
- from ..config import LoadExperimentConfig, ExperimentConfig
576
-
577
- from .helpers import get_adata_paths
578
-
579
- date_str = datetime.today().strftime("%y%m%d")
580
-
581
- # -----------------------------
582
- # 1) Load config into cfg
583
- # -----------------------------
584
- loader = LoadExperimentConfig(config_path)
585
- defaults_dir = resources.files("smftools").joinpath("config")
586
- cfg, report = ExperimentConfig.from_var_dict(
587
- loader.var_dict, date_str=date_str, defaults_dir=defaults_dir
588
- )
589
-
590
- # Ensure base output dir
591
- make_dirs([cfg.output_directory])
592
-
593
- # -----------------------------
594
- # 2) Compute and register paths
595
- # -----------------------------
596
- paths = get_adata_paths(cfg)
597
-
598
- # experiment-level metadata in summary CSV
599
- add_or_update_column_in_csv(cfg.summary_file, "experiment_name", cfg.experiment_name)
600
- add_or_update_column_in_csv(cfg.summary_file, "config_path", config_path)
601
- add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
602
- add_or_update_column_in_csv(cfg.summary_file, "input_files", [cfg.input_files])
603
-
604
- # AnnData stage paths
605
- add_or_update_column_in_csv(cfg.summary_file, "load_adata", paths.raw)
606
- add_or_update_column_in_csv(cfg.summary_file, "pp_adata", paths.pp)
607
- add_or_update_column_in_csv(cfg.summary_file, "pp_dedup_adata", paths.pp_dedup)
608
- add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", paths.spatial)
609
- add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", paths.hmm)
610
-
611
- # -----------------------------
612
- # 3) Stage skipping logic
613
- # -----------------------------
614
- if not getattr(cfg, "force_redo_load_adata", False):
615
- if paths.hmm.exists():
616
- print(f"HMM AnnData already exists: {paths.hmm}\nSkipping smftools load")
617
- return None, paths.hmm, cfg
618
- if paths.spatial.exists():
619
- print(f"Spatial AnnData already exists: {paths.spatial}\nSkipping smftools load")
620
- return None, paths.spatial, cfg
621
- if paths.pp_dedup.exists():
622
- print(
623
- f"Preprocessed deduplicated AnnData already exists: {paths.pp_dedup}\n"
624
- f"Skipping smftools load"
625
- )
626
- return None, paths.pp_dedup, cfg
627
- if paths.pp.exists():
628
- print(f"Preprocessed AnnData already exists: {paths.pp}\nSkipping smftools load")
629
- return None, paths.pp, cfg
630
- if paths.raw.exists():
631
- print(f"Raw AnnData from smftools load already exists: {paths.raw}\nSkipping smftools load")
632
- return None, paths.raw, cfg
633
-
634
- # If we get here, we actually want to run the full load pipeline
635
- adata, adata_path, cfg = load_adata_core(cfg, paths)
636
-
637
- return adata, adata_path, cfg