smftools 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. smftools/_version.py +1 -1
  2. smftools/cli/helpers.py +32 -6
  3. smftools/cli/hmm_adata.py +232 -31
  4. smftools/cli/latent_adata.py +318 -0
  5. smftools/cli/load_adata.py +77 -73
  6. smftools/cli/preprocess_adata.py +178 -53
  7. smftools/cli/spatial_adata.py +149 -101
  8. smftools/cli_entry.py +12 -0
  9. smftools/config/conversion.yaml +11 -1
  10. smftools/config/default.yaml +38 -1
  11. smftools/config/experiment_config.py +53 -1
  12. smftools/constants.py +65 -0
  13. smftools/hmm/HMM.py +88 -0
  14. smftools/informatics/__init__.py +6 -0
  15. smftools/informatics/bam_functions.py +358 -8
  16. smftools/informatics/converted_BAM_to_adata.py +584 -163
  17. smftools/informatics/h5ad_functions.py +115 -2
  18. smftools/informatics/modkit_extract_to_adata.py +1003 -425
  19. smftools/informatics/sequence_encoding.py +72 -0
  20. smftools/logging_utils.py +21 -2
  21. smftools/metadata.py +1 -1
  22. smftools/plotting/__init__.py +9 -0
  23. smftools/plotting/general_plotting.py +2411 -628
  24. smftools/plotting/hmm_plotting.py +85 -7
  25. smftools/preprocessing/__init__.py +1 -0
  26. smftools/preprocessing/append_base_context.py +17 -17
  27. smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
  28. smftools/preprocessing/calculate_consensus.py +1 -1
  29. smftools/preprocessing/calculate_read_modification_stats.py +6 -1
  30. smftools/readwrite.py +53 -17
  31. smftools/schema/anndata_schema_v1.yaml +15 -1
  32. smftools/tools/__init__.py +4 -0
  33. smftools/tools/calculate_leiden.py +57 -0
  34. smftools/tools/calculate_nmf.py +119 -0
  35. smftools/tools/calculate_umap.py +91 -8
  36. smftools/tools/rolling_nn_distance.py +235 -0
  37. smftools/tools/tensor_factorization.py +169 -0
  38. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/METADATA +8 -6
  39. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/RECORD +42 -35
  40. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
  41. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
  42. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,318 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import Optional, Tuple
6
+
7
+ import anndata as ad
8
+
9
+ from smftools.constants import LATENT_DIR, LOGGING_DIR, SEQUENCE_INTEGER_ENCODING
10
+ from smftools.logging_utils import get_logger, setup_logging
11
+
12
+ logger = get_logger(__name__)
13
+
14
+
15
def latent_adata(
    config_path: str,
) -> Tuple[Optional[ad.AnnData], Optional[Path]]:
    """
    CLI-facing wrapper for representation learning.

    Called by: `smftools latent <config_path>`

    Responsibilities:
    - Load the experiment config and resolve the per-stage AnnData paths.
    - Skip entirely when the latent AnnData already exists and
      `cfg.force_redo_latent_analyses` is not set.
    - Otherwise pick the most advanced AnnData present on disk
      (latent > hmm > spatial > pp_dedup > pp) and hand it to
      `latent_adata_core(...)`.

    Parameters
    ----------
    config_path : str
        Path to the experiment config, passed to `load_experiment_config`.

    Returns
    -------
    latent_adata : AnnData | None
        AnnData with latent analyses. None if the stage was skipped because the
        latent AnnData already exists, or if no input AnnData was found.
    latent_adata_path : Path | None
        Path to the latent AnnData. None only when no input AnnData was found.
    """
    from ..readwrite import safe_read_h5ad
    from .helpers import get_adata_paths, load_experiment_config

    # 1) Load config and resolve stage paths
    cfg = load_experiment_config(config_path)
    paths = get_adata_paths(cfg)
    latent_path = paths.latent

    # 2) Stage skipping: an existing latent AnnData means the work is done,
    #    unless a redo was explicitly requested.
    if not getattr(cfg, "force_redo_latent_analyses", False) and latent_path.exists():
        logger.info(f"Latent AnnData found: {latent_path}\nSkipping smftools latent")
        return None, latent_path

    # 3) Starting point: first existing stage, most advanced first. The latent
    #    path can only be selected here when a redo was forced.
    candidates = (latent_path, paths.hmm, paths.spatial, paths.pp_dedup, paths.pp)
    source_path = next((path for path in candidates if path.exists()), None)
    if source_path is None:
        logger.warning(
            "No suitable AnnData found for latent analyses (need at least preprocessed)."
        )
        return None, None

    # safe_read_h5ad returns a pair; only the AnnData is needed here.
    start_adata, _ = safe_read_h5ad(source_path)

    # 4) Run the latent core
    adata_latent, latent_path = latent_adata_core(
        adata=start_adata,
        cfg=cfg,
        paths=paths,
        source_adata_path=source_path,
        config_path=config_path,
    )

    return adata_latent, latent_path
93
+
94
+
95
def latent_adata_core(
    adata: ad.AnnData,
    cfg,
    paths: AdataPaths,
    source_adata_path: Optional[Path] = None,
    config_path: Optional[str] = None,
) -> Tuple[ad.AnnData, Path]:
    """
    Core latent analysis pipeline.

    Assumes:
    - `adata` is (typically) the preprocessed, duplicate-removed AnnData.
    - `cfg` is the ExperimentConfig.

    Does:
    - Configure logging (optionally to a timestamped file in the latent logging directory).
    - Optional sample sheet load.
    - Optional inversion along the positions axis.
    - Call `reindex_references_adata` with the configured offsets.
    - PCA/UMAP/Leiden, NMF, and sequence CP decomposition, each with plots.
      A step is skipped when its output directory already exists, unless
      `cfg.force_redo_latent_analyses` (or, for backward compatibility,
      `cfg.force_redo_spatial_analyses`) is set.
    - Save latent AnnData to `paths.latent` when it does not exist yet or
      `cfg.force_redo_latent_analyses` is set.

    Parameters
    ----------
    adata : AnnData
        Starting AnnData for the latent analyses.
    cfg : ExperimentConfig
        Experiment configuration.
    paths : AdataPaths
        Stage paths; only `paths.latent` is read here.
    source_adata_path : Path | None
        Path `adata` was loaded from; recorded as the input path in the metadata.
    config_path : str | None
        Config path recorded in the metadata.

    Returns
    -------
    adata : AnnData
        Analyzed AnnData.
    adata_path : Path
        Latent AnnData path (`paths.latent`).
    """
    from datetime import datetime

    from ..metadata import record_smftools_metadata
    from ..plotting import (
        plot_cp_sequence_components,
        plot_embedding,
        plot_nmf_components,
        plot_pca,
        plot_umap,
    )
    from ..preprocessing import (
        invert_adata,
        load_sample_sheet,
        reindex_references_adata,
    )
    from ..readwrite import make_dirs
    from ..tools import (
        calculate_leiden,
        calculate_nmf,
        calculate_sequence_cp_decomposition,
        calculate_umap,
    )
    from .helpers import write_gz_h5ad

    # -----------------------------
    # General setup
    # -----------------------------
    # One clock read, so the date and time parts of the log name always agree.
    now = datetime.now()
    date_str = now.strftime("%y%m%d")
    time_str = now.strftime("%H%M%S")
    log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)

    latent_adata_path = paths.latent

    output_directory = Path(cfg.output_directory)
    latent_directory = output_directory / LATENT_DIR
    logging_directory = latent_directory / LOGGING_DIR

    make_dirs([output_directory, latent_directory])

    if cfg.emit_log_file:
        log_file = logging_directory / f"{date_str}_{time_str}_log.log"
        make_dirs([logging_directory])
    else:
        log_file = None

    setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)

    # The latent flag is the documented switch for this stage. The spatial flag
    # is still honoured because earlier code keyed the recompute guards on it.
    force_redo_latent = bool(getattr(cfg, "force_redo_latent_analyses", False))
    force_redo = force_redo_latent or bool(getattr(cfg, "force_redo_spatial_analyses", False))

    smf_modality = cfg.smf_modality

    # -----------------------------
    # Optional sample sheet metadata
    # -----------------------------
    if getattr(cfg, "sample_sheet_path", None):
        load_sample_sheet(
            adata,
            cfg.sample_sheet_path,
            mapping_key_column=cfg.sample_sheet_mapping_column,
            as_category=True,
            force_reload=cfg.force_reload_sample_sheet,
        )

    # -----------------------------
    # Optional inversion along positions axis
    # -----------------------------
    if getattr(cfg, "invert_adata", False):
        adata = invert_adata(adata)

    # -----------------------------
    # Reindexing by reference
    # -----------------------------
    reindex_references_adata(
        adata,
        reference_col=cfg.reference_column,
        offsets=cfg.reindexing_offsets,
        new_col=cfg.reindexed_var_suffix,
    )

    # `.cat` requires the reference column to be categorical.
    references = adata.obs[cfg.reference_column].cat.categories

    # ============================================================
    # Latent representations (PCA/UMAP/Leiden, NMF, sequence CP)
    # ============================================================
    latent_dir_dedup = latent_directory / "deduplicated"
    umap_dir = latent_dir_dedup / "07_umaps"
    nmf_dir = latent_dir_dedup / "07b_nmf"
    nmf_sequence_dir = latent_dir_dedup / "07c_nmf_sequence"

    # "direct" and "conversion" select per-base site masks for every configured
    # target base; any other modality uses the C-site mask only.
    if smf_modality in ("direct", "conversion"):
        var_filters = [
            f"{ref}_{base}_site" for ref in references for base in cfg.mod_target_bases
        ]
    else:
        var_filters = [f"{ref}_C_site" for ref in references]

    def _color_keys() -> list:
        """Return a fresh list of annotations used to colour every embedding plot."""
        return [
            "leiden",
            cfg.sample_name_col_for_plotting,
            "Reference_strand",
            *cfg.umap_layers_to_plot,
        ]

    # UMAP / Leiden
    if umap_dir.is_dir() and not force_redo:
        logger.debug(f"{umap_dir} already exists. Skipping UMAP plotting.")
    else:
        make_dirs([umap_dir])

        adata = calculate_umap(
            adata,
            layer=cfg.layer_for_umap_plotting,
            var_filters=var_filters,
            n_pcs=10,
            knn_neighbors=15,
        )

        calculate_leiden(adata, resolution=0.1)

        umap_layers = _color_keys()
        plot_umap(adata, color=umap_layers, output_dir=umap_dir)
        plot_pca(adata, color=umap_layers, output_dir=umap_dir)

    # NMF
    # NOTE(review): "leiden" is only computed in the UMAP branch above. If that
    # branch is skipped, the plots below presumably rely on the input AnnData
    # already carrying it - confirm.
    if nmf_dir.is_dir() and not force_redo:
        logger.debug(f"{nmf_dir} already exists. Skipping NMF plotting.")
    else:
        make_dirs([nmf_dir])
        adata = calculate_nmf(
            adata,
            layer=cfg.layer_for_umap_plotting,
            var_filters=var_filters,
            n_components=5,
        )
        plot_embedding(adata, basis="nmf", color=_color_keys(), output_dir=nmf_dir)
        plot_nmf_components(adata, output_dir=nmf_dir)

    # CP decomposition using sequence integer encoding (no var filters)
    if nmf_sequence_dir.is_dir() and not force_redo:
        logger.debug(f"{nmf_sequence_dir} already exists. Skipping sequence CP plotting.")
    elif SEQUENCE_INTEGER_ENCODING not in adata.layers:
        logger.warning(
            "Layer %s not found; skipping sequence integer encoding CP.",
            SEQUENCE_INTEGER_ENCODING,
        )
    else:
        make_dirs([nmf_sequence_dir])
        adata = calculate_sequence_cp_decomposition(
            adata,
            layer=SEQUENCE_INTEGER_ENCODING,
            rank=5,
            embedding_key="X_cp_sequence",
            components_key="H_cp_sequence",
            uns_key="cp_sequence",
        )
        plot_embedding(
            adata, basis="cp_sequence", color=_color_keys(), output_dir=nmf_sequence_dir
        )
        plot_cp_sequence_components(
            adata,
            output_dir=nmf_sequence_dir,
            components_key="H_cp_sequence",
            uns_key="cp_sequence",
        )

    # ============================================================
    # Save latent AnnData
    # ============================================================
    if (not latent_adata_path.exists()) or force_redo_latent:
        logger.info("Saving latent analyzed AnnData (post preprocessing and duplicate removal).")
        record_smftools_metadata(
            adata,
            step_name="latent",
            cfg=cfg,
            config_path=config_path,
            input_paths=[source_adata_path] if source_adata_path else None,
            output_path=latent_adata_path,
        )
        write_gz_h5ad(adata, latent_adata_path)

    return adata, latent_adata_path
@@ -1,12 +1,14 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import logging
3
4
  import shutil
4
5
  from pathlib import Path
5
6
  from typing import Iterable, Union
6
7
 
7
8
  import numpy as np
8
9
 
9
- from smftools.logging_utils import get_logger
10
+ from smftools.constants import HMM_DIR, LOAD_DIR, LOGGING_DIR, PREPROCESS_DIR, SPATIAL_DIR
11
+ from smftools.logging_utils import get_logger, setup_logging
10
12
 
11
13
  from .helpers import AdataPaths
12
14
 
@@ -103,63 +105,29 @@ def load_adata(config_path: str):
103
105
  from datetime import datetime
104
106
  from importlib import resources
105
107
 
106
- from ..config import ExperimentConfig, LoadExperimentConfig
107
108
  from ..readwrite import add_or_update_column_in_csv, make_dirs
108
- from .helpers import get_adata_paths
109
-
110
- date_str = datetime.today().strftime("%y%m%d")
109
+ from .helpers import get_adata_paths, load_experiment_config
111
110
 
112
111
  # -----------------------------
113
112
  # 1) Load config into cfg
114
113
  # -----------------------------
115
- loader = LoadExperimentConfig(config_path)
116
- defaults_dir = resources.files("smftools").joinpath("config")
117
- cfg, report = ExperimentConfig.from_var_dict(
118
- loader.var_dict, date_str=date_str, defaults_dir=defaults_dir
119
- )
114
+ cfg = load_experiment_config(config_path)
120
115
 
121
116
  # Ensure base output dir
122
- make_dirs([cfg.output_directory])
117
+ output_directory = Path(cfg.output_directory)
118
+ make_dirs([output_directory])
123
119
 
124
120
  # -----------------------------
125
121
  # 2) Compute and register paths
126
122
  # -----------------------------
127
123
  paths = get_adata_paths(cfg)
128
124
 
129
- # experiment-level metadata in summary CSV
130
- add_or_update_column_in_csv(cfg.summary_file, "experiment_name", cfg.experiment_name)
131
- add_or_update_column_in_csv(cfg.summary_file, "config_path", config_path)
132
- add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
133
- add_or_update_column_in_csv(cfg.summary_file, "input_files", [cfg.input_files])
134
-
135
- # AnnData stage paths
136
- add_or_update_column_in_csv(cfg.summary_file, "load_adata", paths.raw)
137
- add_or_update_column_in_csv(cfg.summary_file, "pp_adata", paths.pp)
138
- add_or_update_column_in_csv(cfg.summary_file, "pp_dedup_adata", paths.pp_dedup)
139
- add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", paths.spatial)
140
- add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", paths.hmm)
141
-
142
125
  # -----------------------------
143
126
  # 3) Stage skipping logic
144
127
  # -----------------------------
145
128
  if not getattr(cfg, "force_redo_load_adata", False):
146
- if paths.hmm.exists():
147
- logger.debug(f"HMM AnnData already exists: {paths.hmm}\nSkipping smftools load")
148
- return None, paths.hmm, cfg
149
- if paths.spatial.exists():
150
- logger.debug(f"Spatial AnnData already exists: {paths.spatial}\nSkipping smftools load")
151
- return None, paths.spatial, cfg
152
- if paths.pp_dedup.exists():
153
- logger.debug(
154
- f"Preprocessed deduplicated AnnData already exists: {paths.pp_dedup}\n"
155
- f"Skipping smftools load"
156
- )
157
- return None, paths.pp_dedup, cfg
158
- if paths.pp.exists():
159
- logger.debug(f"Preprocessed AnnData already exists: {paths.pp}\nSkipping smftools load")
160
- return None, paths.pp, cfg
161
129
  if paths.raw.exists():
162
- logger.debug(
130
+ logger.info(
163
131
  f"Raw AnnData from smftools load already exists: {paths.raw}\nSkipping smftools load"
164
132
  )
165
133
  return None, paths.raw, cfg
@@ -199,6 +167,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
199
167
  cfg : ExperimentConfig
200
168
  (Same object, possibly with some fields updated, e.g. fasta path.)
201
169
  """
170
+ from datetime import datetime
202
171
 
203
172
  from ..informatics.bam_functions import (
204
173
  align_and_sort_BAM,
@@ -206,6 +175,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
206
175
  concatenate_fastqs_to_bam,
207
176
  demux_and_index_BAM,
208
177
  extract_read_features_from_bam,
178
+ extract_read_tags_from_bam,
209
179
  split_and_index_BAM,
210
180
  )
211
181
  from ..informatics.basecalling import canoncall, modcall
@@ -216,7 +186,11 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
216
186
  get_chromosome_lengths,
217
187
  subsample_fasta_from_bed,
218
188
  )
219
- from ..informatics.h5ad_functions import add_read_length_and_mapping_qc
189
+ from ..informatics.h5ad_functions import (
190
+ add_read_length_and_mapping_qc,
191
+ add_read_tag_annotations,
192
+ add_secondary_supplementary_alignment_flags,
193
+ )
220
194
  from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
221
195
  from ..informatics.modkit_functions import extract_mods, make_modbed, modQC
222
196
  from ..informatics.pod5_functions import fast5_to_pod5
@@ -226,8 +200,25 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
226
200
  from .helpers import write_gz_h5ad
227
201
 
228
202
  ################################### 1) General params and input organization ###################################
203
+ date_str = datetime.today().strftime("%y%m%d")
204
+ now = datetime.now()
205
+ time_str = now.strftime("%H%M%S")
206
+
207
+ log_level = getattr(logging, cfg.log_level.upper(), logging.INFO)
208
+
229
209
  output_directory = Path(cfg.output_directory)
230
- make_dirs([output_directory])
210
+ load_directory = output_directory / LOAD_DIR
211
+ logging_directory = load_directory / LOGGING_DIR
212
+
213
+ make_dirs([output_directory, load_directory])
214
+
215
+ if cfg.emit_log_file:
216
+ log_file = logging_directory / f"{date_str}_{time_str}_log.log"
217
+ make_dirs([logging_directory])
218
+ else:
219
+ log_file = None
220
+
221
+ setup_logging(level=log_level, log_file=log_file, reconfigure=log_file is not None)
231
222
 
232
223
  raw_adata_path = paths.raw
233
224
  pp_adata_path = paths.pp
@@ -241,11 +232,9 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
241
232
 
242
233
  # Direct methylation detection SMF specific parameters
243
234
  if cfg.smf_modality == "direct":
244
- mod_bed_dir = cfg.output_directory / "mod_beds"
245
- add_or_update_column_in_csv(cfg.summary_file, "mod_bed_dir", mod_bed_dir)
246
- mod_tsv_dir = cfg.output_directory / "mod_tsvs"
247
- add_or_update_column_in_csv(cfg.summary_file, "mod_tsv_dir", mod_tsv_dir)
248
- bam_qc_dir = cfg.output_directory / "bam_qc"
235
+ mod_bed_dir = load_directory / "mod_beds"
236
+ mod_tsv_dir = load_directory / "mod_tsvs"
237
+ bam_qc_dir = load_directory / "bam_qc"
249
238
  mods = [cfg.mod_map[mod] for mod in cfg.mod_list]
250
239
 
251
240
  if not check_executable_exists("dorado"):
@@ -281,7 +270,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
281
270
  # If the input files are fast5 files, convert the files to a pod5 file before proceeding.
282
271
  if cfg.input_type == "fast5":
283
272
  # take the input directory of fast5 files and write out a single pod5 file into the output directory.
284
- output_pod5 = cfg.output_directory / "FAST5s_to_POD5.pod5"
273
+ output_pod5 = load_directory / "FAST5s_to_POD5.pod5"
285
274
  if output_pod5.exists():
286
275
  pass
287
276
  else:
@@ -295,7 +284,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
295
284
  # If the input is a fastq or a directory of fastqs, concatenate them into an unaligned BAM and save the barcode
296
285
  elif cfg.input_type == "fastq":
297
286
  # Output file for FASTQ concatenation.
298
- output_bam = cfg.output_directory / "canonical_basecalls.bam"
287
+ output_bam = load_directory / "canonical_basecalls.bam"
299
288
  if output_bam.exists():
300
289
  logger.debug("Output BAM already exists")
301
290
  else:
@@ -323,8 +312,6 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
323
312
  else:
324
313
  pass
325
314
 
326
- add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
327
-
328
315
  # Determine if the input data needs to be basecalled
329
316
  if cfg.input_type == "pod5":
330
317
  logger.info(f"Detected pod5 inputs: {cfg.input_files}")
@@ -341,25 +328,24 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
341
328
  model_basename = str(model_basename).replace(".", "_")
342
329
  if cfg.smf_modality == "direct":
343
330
  mod_string = "_".join(cfg.mod_list)
344
- bam = cfg.output_directory / f"{model_basename}_{mod_string}_calls"
331
+ bam = load_directory / f"{model_basename}_{mod_string}_calls"
345
332
  else:
346
- bam = cfg.output_directory / f"{model_basename}_canonical_basecalls"
333
+ bam = load_directory / f"{model_basename}_canonical_basecalls"
347
334
  else:
348
- bam_base = cfg.input_data_path.name
349
- bam = cfg.output_directory / bam_base
335
+ bam_base = cfg.input_data_path.stem
336
+ bam = cfg.input_data_path.parent / bam_base
350
337
 
351
338
  # Generate path names for the unaligned, aligned, as well as the aligned/sorted bam.
352
339
  unaligned_output = bam.with_suffix(cfg.bam_suffix)
340
+
353
341
  aligned_BAM = (
354
- cfg.output_directory / (bam.stem + "_aligned")
342
+ load_directory / (bam.stem + "_aligned")
355
343
  ) # doing this allows specifying an input bam in a seperate directory as the aligned output bams
344
+
356
345
  aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
357
346
  aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
358
347
  aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
359
348
 
360
- add_or_update_column_in_csv(cfg.summary_file, "basecalled_bam", unaligned_output)
361
- add_or_update_column_in_csv(cfg.summary_file, "aligned_bam", aligned_output)
362
- add_or_update_column_in_csv(cfg.summary_file, "sorted_bam", aligned_sorted_output)
363
349
  ########################################################################################################################
364
350
 
365
351
  ################################### 2) FASTA Handling ###################################
@@ -373,11 +359,11 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
373
359
  if cfg.fasta_regions_of_interest and ".bed" in cfg.fasta_regions_of_interest:
374
360
  fasta_stem = cfg.fasta.stem
375
361
  bed_stem = Path(cfg.fasta_regions_of_interest).stem
376
- output_FASTA = cfg.output_directory / f"{fasta_stem}_subsampled_by_{bed_stem}.fasta"
362
+ output_FASTA = load_directory / f"{fasta_stem}_subsampled_by_{bed_stem}.fasta"
377
363
 
378
364
  logger.info("Subsampling FASTA records using the provided BED file")
379
365
  subsample_fasta_from_bed(
380
- cfg.fasta, cfg.fasta_regions_of_interest, cfg.output_directory, output_FASTA
366
+ cfg.fasta, cfg.fasta_regions_of_interest, load_directory, output_FASTA
381
367
  )
382
368
  fasta = output_FASTA
383
369
  else:
@@ -388,7 +374,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
388
374
  if cfg.smf_modality == "conversion":
389
375
  fasta_stem = fasta.stem
390
376
  converted_FASTA_basename = f"{fasta_stem}_converted.fasta"
391
- converted_FASTA = cfg.output_directory / converted_FASTA_basename
377
+ converted_FASTA = load_directory / converted_FASTA_basename
392
378
 
393
379
  if "converted.fa" in fasta.name:
394
380
  logger.info(f"{fasta} is already converted. Using existing converted FASTA.")
@@ -400,8 +386,6 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
400
386
  generate_converted_FASTA(fasta, cfg.conversion_types, cfg.strands, converted_FASTA)
401
387
  fasta = converted_FASTA
402
388
 
403
- add_or_update_column_in_csv(cfg.summary_file, "fasta", fasta)
404
-
405
389
  # Make a FAI and .chrom.names file for the fasta
406
390
  get_chromosome_lengths(fasta)
407
391
  ########################################################################################################################
@@ -462,13 +446,13 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
462
446
  logger.debug(f"{aligned_sorted_output} already exists. Using existing aligned/sorted BAM.")
463
447
  else:
464
448
  logger.info(f"Aligning and sorting reads")
465
- align_and_sort_BAM(fasta, unaligned_output, cfg)
449
+ align_and_sort_BAM(fasta, unaligned_output, aligned_output, cfg)
466
450
  # Deleted the unsorted aligned output
467
451
  aligned_output.unlink()
468
452
 
469
453
  if cfg.make_beds:
470
454
  # Make beds and provide basic histograms
471
- bed_dir = cfg.output_directory / "beds"
455
+ bed_dir = load_directory / "beds"
472
456
  if bed_dir.is_dir():
473
457
  logger.debug(
474
458
  f"{bed_dir} already exists. Skipping BAM -> BED conversion for {aligned_sorted_output}"
@@ -477,7 +461,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
477
461
  logger.info("Making bed files from the aligned and sorted BAM file")
478
462
  aligned_BAM_to_bed(
479
463
  aligned_sorted_output,
480
- cfg.output_directory,
464
+ load_directory,
481
465
  fasta,
482
466
  cfg.make_bigwigs,
483
467
  cfg.threads,
@@ -515,6 +499,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
515
499
 
516
500
  se_bam_files = bam_files
517
501
  bam_dir = cfg.split_path
502
+ double_barcoded_path = None
518
503
 
519
504
  else:
520
505
  if single_barcoded_path.is_dir():
@@ -608,7 +593,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
608
593
  ################################### 6) SAMTools based BAM QC ######################################################################
609
594
 
610
595
  # 5) Samtools QC metrics on split BAM files
611
- bam_qc_dir = cfg.split_path / "bam_qc"
596
+ bam_qc_dir = load_directory / "bam_qc"
612
597
  if bam_qc_dir.is_dir():
613
598
  logger.debug(f"{bam_qc_dir} already exists. Using existing BAM QC calculations.")
614
599
  else:
@@ -637,7 +622,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
637
622
  raw_adata, raw_adata_path = converted_BAM_to_adata(
638
623
  fasta,
639
624
  bam_dir,
640
- cfg.output_directory,
625
+ load_directory,
641
626
  cfg.input_already_demuxed,
642
627
  cfg.mapping_threshold,
643
628
  cfg.experiment_name,
@@ -694,7 +679,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
694
679
  raw_adata, raw_adata_path = modkit_extract_to_adata(
695
680
  fasta,
696
681
  bam_dir,
697
- cfg.output_directory,
682
+ load_directory,
698
683
  cfg.input_already_demuxed,
699
684
  cfg.mapping_threshold,
700
685
  cfg.experiment_name,
@@ -728,6 +713,25 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
728
713
  samtools_backend=cfg.samtools_backend,
729
714
  )
730
715
 
716
+ logger.info("Adding BAM tags and BAM flags to adata.obs")
717
+ add_read_tag_annotations(
718
+ raw_adata,
719
+ se_bam_files,
720
+ tag_names=getattr(cfg, "bam_tag_names", ["NM", "MD", "MM", "ML"]),
721
+ include_flags=True,
722
+ include_cigar=True,
723
+ extract_read_tags_from_bam_callable=extract_read_tags_from_bam,
724
+ samtools_backend=cfg.samtools_backend,
725
+ )
726
+
727
+ if getattr(cfg, "annotate_secondary_supplementary", False):
728
+ logger.info("Annotating secondary/supplementary alignments from aligned BAM")
729
+ add_secondary_supplementary_alignment_flags(
730
+ raw_adata,
731
+ aligned_sorted_output,
732
+ samtools_backend=cfg.samtools_backend,
733
+ )
734
+
731
735
  raw_adata.obs["Raw_modification_signal"] = np.nansum(raw_adata.X, axis=1)
732
736
  ########################################################################################################################
733
737
 
@@ -740,7 +744,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
740
744
  raw_adata,
741
745
  cfg.input_data_path,
742
746
  n_jobs=cfg.threads,
743
- csv_path=output_directory / "read_to_pod5_origin_mapping.csv",
747
+ csv_path=load_directory / "read_to_pod5_origin_mapping.csv",
744
748
  )
745
749
  ########################################################################################################################
746
750
 
@@ -759,12 +763,12 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
759
763
  ############################################### MultiQC HTML Report ###############################################
760
764
 
761
765
  # multiqc ###
762
- mqc_dir = cfg.split_path / "multiqc"
766
+ mqc_dir = load_directory / "multiqc"
763
767
  if mqc_dir.is_dir():
764
768
  logger.info(f"{mqc_dir} already exists, skipping multiqc")
765
769
  else:
766
770
  logger.info("Running multiqc")
767
- run_multiqc(cfg.split_path, mqc_dir)
771
+ run_multiqc(bam_qc_dir, mqc_dir)
768
772
  ########################################################################################################################
769
773
 
770
774
  ############################################### delete intermediate BAM files ###############################################