smftools 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164) hide show
  1. smftools/__init__.py +39 -7
  2. smftools/_settings.py +2 -0
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +34 -6
  7. smftools/cli/hmm_adata.py +239 -33
  8. smftools/cli/latent_adata.py +318 -0
  9. smftools/cli/load_adata.py +167 -131
  10. smftools/cli/preprocess_adata.py +180 -53
  11. smftools/cli/spatial_adata.py +152 -100
  12. smftools/cli_entry.py +38 -1
  13. smftools/config/__init__.py +2 -0
  14. smftools/config/conversion.yaml +11 -1
  15. smftools/config/default.yaml +42 -2
  16. smftools/config/experiment_config.py +59 -1
  17. smftools/constants.py +65 -0
  18. smftools/datasets/__init__.py +2 -0
  19. smftools/hmm/HMM.py +97 -3
  20. smftools/hmm/__init__.py +24 -13
  21. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  22. smftools/hmm/archived/calculate_distances.py +2 -0
  23. smftools/hmm/archived/call_hmm_peaks.py +2 -0
  24. smftools/hmm/archived/train_hmm.py +2 -0
  25. smftools/hmm/call_hmm_peaks.py +5 -2
  26. smftools/hmm/display_hmm.py +4 -1
  27. smftools/hmm/hmm_readwrite.py +7 -2
  28. smftools/hmm/nucleosome_hmm_refinement.py +2 -0
  29. smftools/informatics/__init__.py +59 -34
  30. smftools/informatics/archived/bam_conversion.py +2 -0
  31. smftools/informatics/archived/bam_direct.py +2 -0
  32. smftools/informatics/archived/basecall_pod5s.py +2 -0
  33. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  34. smftools/informatics/archived/conversion_smf.py +2 -0
  35. smftools/informatics/archived/deaminase_smf.py +1 -0
  36. smftools/informatics/archived/direct_smf.py +2 -0
  37. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  38. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  39. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +2 -0
  40. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  41. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  42. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  43. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  44. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  45. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  46. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  47. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  48. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  49. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  50. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  52. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  53. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  54. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  55. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  56. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  57. smftools/informatics/archived/helpers/archived/load_adata.py +2 -0
  58. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  59. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  60. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  61. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  62. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  63. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  64. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  65. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +2 -0
  66. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  67. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  68. smftools/informatics/archived/print_bam_query_seq.py +2 -0
  69. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  70. smftools/informatics/archived/subsample_pod5.py +2 -0
  71. smftools/informatics/bam_functions.py +1093 -176
  72. smftools/informatics/basecalling.py +2 -0
  73. smftools/informatics/bed_functions.py +271 -61
  74. smftools/informatics/binarize_converted_base_identities.py +3 -0
  75. smftools/informatics/complement_base_list.py +2 -0
  76. smftools/informatics/converted_BAM_to_adata.py +641 -176
  77. smftools/informatics/fasta_functions.py +94 -10
  78. smftools/informatics/h5ad_functions.py +123 -4
  79. smftools/informatics/modkit_extract_to_adata.py +1019 -431
  80. smftools/informatics/modkit_functions.py +2 -0
  81. smftools/informatics/ohe.py +2 -0
  82. smftools/informatics/pod5_functions.py +3 -2
  83. smftools/informatics/sequence_encoding.py +72 -0
  84. smftools/logging_utils.py +21 -2
  85. smftools/machine_learning/__init__.py +22 -6
  86. smftools/machine_learning/data/__init__.py +2 -0
  87. smftools/machine_learning/data/anndata_data_module.py +18 -4
  88. smftools/machine_learning/data/preprocessing.py +2 -0
  89. smftools/machine_learning/evaluation/__init__.py +2 -0
  90. smftools/machine_learning/evaluation/eval_utils.py +2 -0
  91. smftools/machine_learning/evaluation/evaluators.py +14 -9
  92. smftools/machine_learning/inference/__init__.py +2 -0
  93. smftools/machine_learning/inference/inference_utils.py +2 -0
  94. smftools/machine_learning/inference/lightning_inference.py +6 -1
  95. smftools/machine_learning/inference/sklearn_inference.py +2 -0
  96. smftools/machine_learning/inference/sliding_window_inference.py +2 -0
  97. smftools/machine_learning/models/__init__.py +2 -0
  98. smftools/machine_learning/models/base.py +7 -2
  99. smftools/machine_learning/models/cnn.py +7 -2
  100. smftools/machine_learning/models/lightning_base.py +16 -11
  101. smftools/machine_learning/models/mlp.py +5 -1
  102. smftools/machine_learning/models/positional.py +7 -2
  103. smftools/machine_learning/models/rnn.py +5 -1
  104. smftools/machine_learning/models/sklearn_models.py +14 -9
  105. smftools/machine_learning/models/transformer.py +7 -2
  106. smftools/machine_learning/models/wrappers.py +6 -2
  107. smftools/machine_learning/training/__init__.py +2 -0
  108. smftools/machine_learning/training/train_lightning_model.py +13 -3
  109. smftools/machine_learning/training/train_sklearn_model.py +2 -0
  110. smftools/machine_learning/utils/__init__.py +2 -0
  111. smftools/machine_learning/utils/device.py +5 -1
  112. smftools/machine_learning/utils/grl.py +5 -1
  113. smftools/metadata.py +1 -1
  114. smftools/optional_imports.py +31 -0
  115. smftools/plotting/__init__.py +41 -31
  116. smftools/plotting/autocorrelation_plotting.py +9 -5
  117. smftools/plotting/classifiers.py +16 -4
  118. smftools/plotting/general_plotting.py +2415 -629
  119. smftools/plotting/hmm_plotting.py +97 -9
  120. smftools/plotting/position_stats.py +15 -7
  121. smftools/plotting/qc_plotting.py +6 -1
  122. smftools/preprocessing/__init__.py +36 -37
  123. smftools/preprocessing/append_base_context.py +17 -17
  124. smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
  125. smftools/preprocessing/archived/add_read_length_and_mapping_qc.py +2 -0
  126. smftools/preprocessing/archived/calculate_complexity.py +2 -0
  127. smftools/preprocessing/archived/mark_duplicates.py +2 -0
  128. smftools/preprocessing/archived/preprocessing.py +2 -0
  129. smftools/preprocessing/archived/remove_duplicates.py +2 -0
  130. smftools/preprocessing/binary_layers_to_ohe.py +2 -1
  131. smftools/preprocessing/calculate_complexity_II.py +4 -1
  132. smftools/preprocessing/calculate_consensus.py +1 -1
  133. smftools/preprocessing/calculate_pairwise_differences.py +2 -0
  134. smftools/preprocessing/calculate_pairwise_hamming_distances.py +3 -0
  135. smftools/preprocessing/calculate_position_Youden.py +9 -2
  136. smftools/preprocessing/calculate_read_modification_stats.py +6 -1
  137. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +2 -0
  138. smftools/preprocessing/filter_reads_on_modification_thresholds.py +2 -0
  139. smftools/preprocessing/flag_duplicate_reads.py +42 -54
  140. smftools/preprocessing/make_dirs.py +2 -1
  141. smftools/preprocessing/min_non_diagonal.py +2 -0
  142. smftools/preprocessing/recipes.py +2 -0
  143. smftools/readwrite.py +53 -17
  144. smftools/schema/anndata_schema_v1.yaml +15 -1
  145. smftools/tools/__init__.py +30 -18
  146. smftools/tools/archived/apply_hmm.py +2 -0
  147. smftools/tools/archived/classifiers.py +2 -0
  148. smftools/tools/archived/classify_methylated_features.py +2 -0
  149. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  150. smftools/tools/archived/subset_adata_v1.py +2 -0
  151. smftools/tools/archived/subset_adata_v2.py +2 -0
  152. smftools/tools/calculate_leiden.py +57 -0
  153. smftools/tools/calculate_nmf.py +119 -0
  154. smftools/tools/calculate_umap.py +93 -8
  155. smftools/tools/cluster_adata_on_methylation.py +7 -1
  156. smftools/tools/position_stats.py +17 -27
  157. smftools/tools/rolling_nn_distance.py +235 -0
  158. smftools/tools/tensor_factorization.py +169 -0
  159. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/METADATA +69 -33
  160. smftools-0.3.1.dist-info/RECORD +189 -0
  161. smftools-0.2.5.dist-info/RECORD +0 -181
  162. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
  163. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
  164. {smftools-0.2.5.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,8 +1,14 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
1
4
  import shutil
2
5
  from pathlib import Path
3
6
  from typing import Iterable, Union
4
7
 
5
- from smftools.logging_utils import get_logger
8
+ import numpy as np
9
+
10
+ from smftools.constants import HMM_DIR, LOAD_DIR, LOGGING_DIR, PREPROCESS_DIR, SPATIAL_DIR
11
+ from smftools.logging_utils import get_logger, setup_logging
6
12
 
7
13
  from .helpers import AdataPaths
8
14
 
@@ -76,6 +82,62 @@ def delete_tsvs(
76
82
  logger.warning(f"[error] failed to remove tmp dir {td}: {e}")
77
83
 
78
84
 
85
+ def load_adata(config_path: str):
86
+ """
87
+ CLI-facing wrapper for the load pipeline.
88
+
89
+ - Reads config CSV into ExperimentConfig
90
+ - Computes canonical paths for all downstream AnnData stages
91
+ - Registers those in the summary CSV
92
+ - Applies stage-skipping logic (hmm > spatial > pp_dedup > pp > raw)
93
+ - If needed, calls the core pipeline to actually build the raw AnnData
94
+
95
+ Returns
96
+ -------
97
+ adata : anndata.AnnData | None
98
+ Newly created AnnData object, or None if we skipped because a later-stage
99
+ AnnData already exists.
100
+ adata_path : pathlib.Path
101
+ Path to the "current" AnnData that should be used downstream.
102
+ cfg : ExperimentConfig
103
+ Config object for downstream steps.
104
+ """
105
+ from datetime import datetime
106
+ from importlib import resources
107
+
108
+ from ..readwrite import add_or_update_column_in_csv, make_dirs
109
+ from .helpers import get_adata_paths, load_experiment_config
110
+
111
+ # -----------------------------
112
+ # 1) Load config into cfg
113
+ # -----------------------------
114
+ cfg = load_experiment_config(config_path)
115
+
116
+ # Ensure base output dir
117
+ output_directory = Path(cfg.output_directory)
118
+ make_dirs([output_directory])
119
+
120
+ # -----------------------------
121
+ # 2) Compute and register paths
122
+ # -----------------------------
123
+ paths = get_adata_paths(cfg)
124
+
125
+ # -----------------------------
126
+ # 3) Stage skipping logic
127
+ # -----------------------------
128
+ if not getattr(cfg, "force_redo_load_adata", False):
129
+ if paths.raw.exists():
130
+ logger.info(
131
+ f"Raw AnnData from smftools load already exists: {paths.raw}\nSkipping smftools load"
132
+ )
133
+ return None, paths.raw, cfg
134
+
135
+ # If we get here, we actually want to run the full load pipeline
136
+ adata, adata_path, cfg = load_adata_core(cfg, paths, config_path=config_path)
137
+
138
+ return adata, adata_path, cfg
139
+
140
+
79
141
  def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
80
142
  """
81
143
  Core load pipeline.
@@ -105,9 +167,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
105
167
  cfg : ExperimentConfig
106
168
  (Same object, possibly with some fields updated, e.g. fasta path.)
107
169
  """
108
- from pathlib import Path
109
-
110
- import numpy as np
170
+ from datetime import datetime
111
171
 
112
172
  from ..informatics.bam_functions import (
113
173
  align_and_sort_BAM,
@@ -115,6 +175,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
115
175
  concatenate_fastqs_to_bam,
116
176
  demux_and_index_BAM,
117
177
  extract_read_features_from_bam,
178
+ extract_read_tags_from_bam,
118
179
  split_and_index_BAM,
119
180
  )
120
181
  from ..informatics.basecalling import canoncall, modcall
@@ -125,7 +186,11 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
125
186
  get_chromosome_lengths,
126
187
  subsample_fasta_from_bed,
127
188
  )
128
- from ..informatics.h5ad_functions import add_read_length_and_mapping_qc
189
+ from ..informatics.h5ad_functions import (
190
+ add_read_length_and_mapping_qc,
191
+ add_read_tag_annotations,
192
+ add_secondary_supplementary_alignment_flags,
193
+ )
129
194
  from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
130
195
  from ..informatics.modkit_functions import extract_mods, make_modbed, modQC
131
196
  from ..informatics.pod5_functions import fast5_to_pod5
@@ -135,8 +200,25 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
135
200
  from .helpers import write_gz_h5ad
136
201
 
137
202
  ################################### 1) General params and input organization ###################################
203
+ date_str = datetime.today().strftime("%y%m%d")
204
+ now = datetime.now()
205
+ time_str = now.strftime("%H%M%S")
206
+
207
+ log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)
208
+
138
209
  output_directory = Path(cfg.output_directory)
139
- make_dirs([output_directory])
210
+ load_directory = output_directory / LOAD_DIR
211
+ logging_directory = load_directory / LOGGING_DIR
212
+
213
+ make_dirs([output_directory, load_directory])
214
+
215
+ if cfg.emit_log_file:
216
+ log_file = logging_directory / f"{date_str}_{time_str}_log.log"
217
+ make_dirs([logging_directory])
218
+ else:
219
+ log_file = None
220
+
221
+ setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)
140
222
 
141
223
  raw_adata_path = paths.raw
142
224
  pp_adata_path = paths.pp
@@ -150,11 +232,9 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
150
232
 
151
233
  # Direct methylation detection SMF specific parameters
152
234
  if cfg.smf_modality == "direct":
153
- mod_bed_dir = cfg.output_directory / "mod_beds"
154
- add_or_update_column_in_csv(cfg.summary_file, "mod_bed_dir", mod_bed_dir)
155
- mod_tsv_dir = cfg.output_directory / "mod_tsvs"
156
- add_or_update_column_in_csv(cfg.summary_file, "mod_tsv_dir", mod_tsv_dir)
157
- bam_qc_dir = cfg.output_directory / "bam_qc"
235
+ mod_bed_dir = load_directory / "mod_beds"
236
+ mod_tsv_dir = load_directory / "mod_tsvs"
237
+ bam_qc_dir = load_directory / "bam_qc"
158
238
  mods = [cfg.mod_map[mod] for mod in cfg.mod_list]
159
239
 
160
240
  if not check_executable_exists("dorado"):
@@ -190,7 +270,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
190
270
  # If the input files are fast5 files, convert the files to a pod5 file before proceeding.
191
271
  if cfg.input_type == "fast5":
192
272
  # take the input directory of fast5 files and write out a single pod5 file into the output directory.
193
- output_pod5 = cfg.output_directory / "FAST5s_to_POD5.pod5"
273
+ output_pod5 = load_directory / "FAST5s_to_POD5.pod5"
194
274
  if output_pod5.exists():
195
275
  pass
196
276
  else:
@@ -204,7 +284,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
204
284
  # If the input is a fastq or a directory of fastqs, concatenate them into an unaligned BAM and save the barcode
205
285
  elif cfg.input_type == "fastq":
206
286
  # Output file for FASTQ concatenation.
207
- output_bam = cfg.output_directory / "canonical_basecalls.bam"
287
+ output_bam = load_directory / "canonical_basecalls.bam"
208
288
  if output_bam.exists():
209
289
  logger.debug("Output BAM already exists")
210
290
  else:
@@ -219,6 +299,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
219
299
  rg_sample_field=None,
220
300
  progress=False,
221
301
  auto_pair=cfg.fastq_auto_pairing,
302
+ samtools_backend=cfg.samtools_backend,
222
303
  )
223
304
 
224
305
  logger.info(f"Found the following barcodes in FASTQ inputs: {summary['barcodes']}")
@@ -231,8 +312,6 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
231
312
  else:
232
313
  pass
233
314
 
234
- add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
235
-
236
315
  # Determine if the input data needs to be basecalled
237
316
  if cfg.input_type == "pod5":
238
317
  logger.info(f"Detected pod5 inputs: {cfg.input_files}")
@@ -249,25 +328,24 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
249
328
  model_basename = str(model_basename).replace(".", "_")
250
329
  if cfg.smf_modality == "direct":
251
330
  mod_string = "_".join(cfg.mod_list)
252
- bam = cfg.output_directory / f"{model_basename}_{mod_string}_calls"
331
+ bam = load_directory / f"{model_basename}_{mod_string}_calls"
253
332
  else:
254
- bam = cfg.output_directory / f"{model_basename}_canonical_basecalls"
333
+ bam = load_directory / f"{model_basename}_canonical_basecalls"
255
334
  else:
256
- bam_base = cfg.input_data_path.name
257
- bam = cfg.output_directory / bam_base
335
+ bam_base = cfg.input_data_path.stem
336
+ bam = cfg.input_data_path.parent / bam_base
258
337
 
259
338
  # Generate path names for the unaligned, aligned, as well as the aligned/sorted bam.
260
339
  unaligned_output = bam.with_suffix(cfg.bam_suffix)
340
+
261
341
  aligned_BAM = (
262
- cfg.output_directory / (bam.stem + "_aligned")
342
+ load_directory / (bam.stem + "_aligned")
263
343
  ) # doing this allows specifying an input bam in a separate directory from the aligned output BAMs
344
+
264
345
  aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
265
346
  aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
266
347
  aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
267
348
 
268
- add_or_update_column_in_csv(cfg.summary_file, "basecalled_bam", unaligned_output)
269
- add_or_update_column_in_csv(cfg.summary_file, "aligned_bam", aligned_output)
270
- add_or_update_column_in_csv(cfg.summary_file, "sorted_bam", aligned_sorted_output)
271
349
  ########################################################################################################################
272
350
 
273
351
  ################################### 2) FASTA Handling ###################################
@@ -281,11 +359,11 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
281
359
  if cfg.fasta_regions_of_interest and ".bed" in cfg.fasta_regions_of_interest:
282
360
  fasta_stem = cfg.fasta.stem
283
361
  bed_stem = Path(cfg.fasta_regions_of_interest).stem
284
- output_FASTA = cfg.output_directory / f"{fasta_stem}_subsampled_by_{bed_stem}.fasta"
362
+ output_FASTA = load_directory / f"{fasta_stem}_subsampled_by_{bed_stem}.fasta"
285
363
 
286
364
  logger.info("Subsampling FASTA records using the provided BED file")
287
365
  subsample_fasta_from_bed(
288
- cfg.fasta, cfg.fasta_regions_of_interest, cfg.output_directory, output_FASTA
366
+ cfg.fasta, cfg.fasta_regions_of_interest, load_directory, output_FASTA
289
367
  )
290
368
  fasta = output_FASTA
291
369
  else:
@@ -296,7 +374,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
296
374
  if cfg.smf_modality == "conversion":
297
375
  fasta_stem = fasta.stem
298
376
  converted_FASTA_basename = f"{fasta_stem}_converted.fasta"
299
- converted_FASTA = cfg.output_directory / converted_FASTA_basename
377
+ converted_FASTA = load_directory / converted_FASTA_basename
300
378
 
301
379
  if "converted.fa" in fasta.name:
302
380
  logger.info(f"{fasta} is already converted. Using existing converted FASTA.")
@@ -308,8 +386,6 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
308
386
  generate_converted_FASTA(fasta, cfg.conversion_types, cfg.strands, converted_FASTA)
309
387
  fasta = converted_FASTA
310
388
 
311
- add_or_update_column_in_csv(cfg.summary_file, "fasta", fasta)
312
-
313
389
  # Make a FAI and .chrom.names file for the fasta
314
390
  get_chromosome_lengths(fasta)
315
391
  ########################################################################################################################
@@ -370,13 +446,13 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
370
446
  logger.debug(f"{aligned_sorted_output} already exists. Using existing aligned/sorted BAM.")
371
447
  else:
372
448
  logger.info(f"Aligning and sorting reads")
373
- align_and_sort_BAM(fasta, unaligned_output, cfg)
449
+ align_and_sort_BAM(fasta, unaligned_output, aligned_output, cfg)
374
450
  # Delete the unsorted aligned output
375
451
  aligned_output.unlink()
376
452
 
377
453
  if cfg.make_beds:
378
454
  # Make beds and provide basic histograms
379
- bed_dir = cfg.output_directory / "beds"
455
+ bed_dir = load_directory / "beds"
380
456
  if bed_dir.is_dir():
381
457
  logger.debug(
382
458
  f"{bed_dir} already exists. Skipping BAM -> BED conversion for {aligned_sorted_output}"
@@ -384,7 +460,14 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
384
460
  else:
385
461
  logger.info("Making bed files from the aligned and sorted BAM file")
386
462
  aligned_BAM_to_bed(
387
- aligned_sorted_output, cfg.output_directory, fasta, cfg.make_bigwigs, cfg.threads
463
+ aligned_sorted_output,
464
+ load_directory,
465
+ fasta,
466
+ cfg.make_bigwigs,
467
+ cfg.threads,
468
+ samtools_backend=cfg.samtools_backend,
469
+ bedtools_backend=cfg.bedtools_backend,
470
+ bigwig_backend=cfg.bigwig_backend,
388
471
  )
389
472
  ########################################################################################################################
390
473
 
@@ -404,13 +487,19 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
404
487
  else:
405
488
  make_dirs([cfg.split_path])
406
489
  logger.info("Demultiplexing samples into individual aligned/sorted BAM files")
407
- all_bam_files = split_and_index_BAM(aligned_sorted_BAM, cfg.split_path, cfg.bam_suffix)
490
+ all_bam_files = split_and_index_BAM(
491
+ aligned_sorted_BAM,
492
+ cfg.split_path,
493
+ cfg.bam_suffix,
494
+ samtools_backend=cfg.samtools_backend,
495
+ )
408
496
 
409
497
  unclassified_bams = [p for p in all_bam_files if "unclassified" in p.name]
410
498
  bam_files = sorted(p for p in all_bam_files if "unclassified" not in p.name)
411
499
 
412
500
  se_bam_files = bam_files
413
501
  bam_dir = cfg.split_path
502
+ double_barcoded_path = None
414
503
 
415
504
  else:
416
505
  if single_barcoded_path.is_dir():
@@ -489,19 +578,34 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
489
578
  else:
490
579
  logger.info("Making BED files from BAM files for each sample")
491
580
  for bam in bam_files:
492
- aligned_BAM_to_bed(bam, cfg.split_path, fasta, cfg.make_bigwigs, cfg.threads)
581
+ aligned_BAM_to_bed(
582
+ bam,
583
+ cfg.split_path,
584
+ fasta,
585
+ cfg.make_bigwigs,
586
+ cfg.threads,
587
+ samtools_backend=cfg.samtools_backend,
588
+ bedtools_backend=cfg.bedtools_backend,
589
+ bigwig_backend=cfg.bigwig_backend,
590
+ )
493
591
  ########################################################################################################################
494
592
 
495
593
  ################################### 6) SAMTools based BAM QC ######################################################################
496
594
 
497
595
  # 6) Samtools QC metrics on split BAM files
498
- bam_qc_dir = cfg.split_path / "bam_qc"
596
+ bam_qc_dir = load_directory / "bam_qc"
499
597
  if bam_qc_dir.is_dir():
500
598
  logger.debug(f"{bam_qc_dir} already exists. Using existing BAM QC calculations.")
501
599
  else:
502
600
  make_dirs([bam_qc_dir])
503
601
  logger.info("Performing BAM QC")
504
- bam_qc(bam_files, bam_qc_dir, cfg.threads, modality=cfg.smf_modality)
602
+ bam_qc(
603
+ bam_files,
604
+ bam_qc_dir,
605
+ cfg.threads,
606
+ modality=cfg.smf_modality,
607
+ samtools_backend=cfg.samtools_backend,
608
+ )
505
609
  ########################################################################################################################
506
610
 
507
611
  ################################### 7) AnnData loading ######################################################################
@@ -518,7 +622,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
518
622
  raw_adata, raw_adata_path = converted_BAM_to_adata(
519
623
  fasta,
520
624
  bam_dir,
521
- cfg.output_directory,
625
+ load_directory,
522
626
  cfg.input_already_demuxed,
523
627
  cfg.mapping_threshold,
524
628
  cfg.experiment_name,
@@ -529,6 +633,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
529
633
  deaminase_footprinting,
530
634
  delete_intermediates=cfg.delete_intermediate_hdfs,
531
635
  double_barcoded_path=double_barcoded_path,
636
+ samtools_backend=cfg.samtools_backend,
532
637
  )
533
638
  else:
534
639
  if mod_bed_dir.is_dir():
@@ -574,7 +679,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
574
679
  raw_adata, raw_adata_path = modkit_extract_to_adata(
575
680
  fasta,
576
681
  bam_dir,
577
- cfg.output_directory,
682
+ load_directory,
578
683
  cfg.input_already_demuxed,
579
684
  cfg.mapping_threshold,
580
685
  cfg.experiment_name,
@@ -584,6 +689,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
584
689
  cfg.delete_batch_hdfs,
585
690
  cfg.threads,
586
691
  double_barcoded_path,
692
+ cfg.samtools_backend,
587
693
  )
588
694
  if cfg.delete_intermediate_tsvs:
589
695
  delete_tsvs(mod_tsv_dir)
@@ -604,8 +710,28 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
604
710
  extract_read_features_from_bam_callable=extract_read_features_from_bam,
605
711
  bypass=cfg.bypass_add_read_length_and_mapping_qc,
606
712
  force_redo=cfg.force_redo_add_read_length_and_mapping_qc,
713
+ samtools_backend=cfg.samtools_backend,
714
+ )
715
+
716
+ logger.info("Adding BAM tags and BAM flags to adata.obs")
717
+ add_read_tag_annotations(
718
+ raw_adata,
719
+ se_bam_files,
720
+ tag_names=getattr(cfg, "bam_tag_names", ["NM", "MD", "MM", "ML"]),
721
+ include_flags=True,
722
+ include_cigar=True,
723
+ extract_read_tags_from_bam_callable=extract_read_tags_from_bam,
724
+ samtools_backend=cfg.samtools_backend,
607
725
  )
608
726
 
727
+ if getattr(cfg, "annotate_secondary_supplementary", False):
728
+ logger.info("Annotating secondary/supplementary alignments from aligned BAM")
729
+ add_secondary_supplementary_alignment_flags(
730
+ raw_adata,
731
+ aligned_sorted_output,
732
+ samtools_backend=cfg.samtools_backend,
733
+ )
734
+
609
735
  raw_adata.obs["Raw_modification_signal"] = np.nansum(raw_adata.X, axis=1)
610
736
  ########################################################################################################################
611
737
 
@@ -618,7 +744,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
618
744
  raw_adata,
619
745
  cfg.input_data_path,
620
746
  n_jobs=cfg.threads,
621
- csv_path=output_directory / "read_to_pod5_origin_mapping.csv",
747
+ csv_path=load_directory / "read_to_pod5_origin_mapping.csv",
622
748
  )
623
749
  ########################################################################################################################
624
750
 
@@ -637,12 +763,12 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
637
763
  ############################################### MultiQC HTML Report ###############################################
638
764
 
639
765
  # multiqc ###
640
- mqc_dir = cfg.split_path / "multiqc"
766
+ mqc_dir = load_directory / "multiqc"
641
767
  if mqc_dir.is_dir():
642
- logger.debug(f"{mqc_dir} already exists, skipping multiqc")
768
+ logger.info(f"{mqc_dir} already exists, skipping multiqc")
643
769
  else:
644
770
  logger.info("Running multiqc")
645
- run_multiqc(cfg.split_path, mqc_dir)
771
+ run_multiqc(bam_qc_dir, mqc_dir)
646
772
  ########################################################################################################################
647
773
 
648
774
  ############################################### delete intermediate BAM files ###############################################
@@ -665,93 +791,3 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
665
791
  ########################################################################################################################
666
792
 
667
793
  return raw_adata, raw_adata_path, cfg
668
-
669
-
670
- def load_adata(config_path: str):
671
- """
672
- CLI-facing wrapper for the load pipeline.
673
-
674
- - Reads config CSV into ExperimentConfig
675
- - Computes canonical paths for all downstream AnnData stages
676
- - Registers those in the summary CSV
677
- - Applies stage-skipping logic (hmm > spatial > pp_dedup > pp > raw)
678
- - If needed, calls the core pipeline to actually build the raw AnnData
679
-
680
- Returns
681
- -------
682
- adata : anndata.AnnData | None
683
- Newly created AnnData object, or None if we skipped because a later-stage
684
- AnnData already exists.
685
- adata_path : pathlib.Path
686
- Path to the "current" AnnData that should be used downstream.
687
- cfg : ExperimentConfig
688
- Config object for downstream steps.
689
- """
690
- from datetime import datetime
691
- from importlib import resources
692
-
693
- from ..config import ExperimentConfig, LoadExperimentConfig
694
- from ..readwrite import add_or_update_column_in_csv, make_dirs
695
- from .helpers import get_adata_paths
696
-
697
- date_str = datetime.today().strftime("%y%m%d")
698
-
699
- # -----------------------------
700
- # 1) Load config into cfg
701
- # -----------------------------
702
- loader = LoadExperimentConfig(config_path)
703
- defaults_dir = resources.files("smftools").joinpath("config")
704
- cfg, report = ExperimentConfig.from_var_dict(
705
- loader.var_dict, date_str=date_str, defaults_dir=defaults_dir
706
- )
707
-
708
- # Ensure base output dir
709
- make_dirs([cfg.output_directory])
710
-
711
- # -----------------------------
712
- # 2) Compute and register paths
713
- # -----------------------------
714
- paths = get_adata_paths(cfg)
715
-
716
- # experiment-level metadata in summary CSV
717
- add_or_update_column_in_csv(cfg.summary_file, "experiment_name", cfg.experiment_name)
718
- add_or_update_column_in_csv(cfg.summary_file, "config_path", config_path)
719
- add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
720
- add_or_update_column_in_csv(cfg.summary_file, "input_files", [cfg.input_files])
721
-
722
- # AnnData stage paths
723
- add_or_update_column_in_csv(cfg.summary_file, "load_adata", paths.raw)
724
- add_or_update_column_in_csv(cfg.summary_file, "pp_adata", paths.pp)
725
- add_or_update_column_in_csv(cfg.summary_file, "pp_dedup_adata", paths.pp_dedup)
726
- add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", paths.spatial)
727
- add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", paths.hmm)
728
-
729
- # -----------------------------
730
- # 3) Stage skipping logic
731
- # -----------------------------
732
- if not getattr(cfg, "force_redo_load_adata", False):
733
- if paths.hmm.exists():
734
- logger.debug(f"HMM AnnData already exists: {paths.hmm}\nSkipping smftools load")
735
- return None, paths.hmm, cfg
736
- if paths.spatial.exists():
737
- logger.debug(f"Spatial AnnData already exists: {paths.spatial}\nSkipping smftools load")
738
- return None, paths.spatial, cfg
739
- if paths.pp_dedup.exists():
740
- logger.debug(
741
- f"Preprocessed deduplicated AnnData already exists: {paths.pp_dedup}\n"
742
- f"Skipping smftools load"
743
- )
744
- return None, paths.pp_dedup, cfg
745
- if paths.pp.exists():
746
- logger.debug(f"Preprocessed AnnData already exists: {paths.pp}\nSkipping smftools load")
747
- return None, paths.pp, cfg
748
- if paths.raw.exists():
749
- logger.debug(
750
- f"Raw AnnData from smftools load already exists: {paths.raw}\nSkipping smftools load"
751
- )
752
- return None, paths.raw, cfg
753
-
754
- # If we get here, we actually want to run the full load pipeline
755
- adata, adata_path, cfg = load_adata_core(cfg, paths, config_path=config_path)
756
-
757
- return adata, adata_path, cfg