smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. smftools/__init__.py +6 -8
  2. smftools/_settings.py +4 -6
  3. smftools/_version.py +1 -1
  4. smftools/cli/helpers.py +54 -0
  5. smftools/cli/hmm_adata.py +937 -256
  6. smftools/cli/load_adata.py +448 -268
  7. smftools/cli/preprocess_adata.py +469 -263
  8. smftools/cli/spatial_adata.py +536 -319
  9. smftools/cli_entry.py +97 -182
  10. smftools/config/__init__.py +1 -1
  11. smftools/config/conversion.yaml +17 -6
  12. smftools/config/deaminase.yaml +12 -10
  13. smftools/config/default.yaml +142 -33
  14. smftools/config/direct.yaml +11 -3
  15. smftools/config/discover_input_files.py +19 -5
  16. smftools/config/experiment_config.py +594 -264
  17. smftools/constants.py +37 -0
  18. smftools/datasets/__init__.py +2 -8
  19. smftools/datasets/datasets.py +32 -18
  20. smftools/hmm/HMM.py +2128 -1418
  21. smftools/hmm/__init__.py +2 -9
  22. smftools/hmm/archived/call_hmm_peaks.py +121 -0
  23. smftools/hmm/call_hmm_peaks.py +299 -91
  24. smftools/hmm/display_hmm.py +19 -6
  25. smftools/hmm/hmm_readwrite.py +13 -4
  26. smftools/hmm/nucleosome_hmm_refinement.py +102 -14
  27. smftools/informatics/__init__.py +30 -7
  28. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  30. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  31. smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
  32. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
  33. smftools/informatics/archived/print_bam_query_seq.py +7 -1
  34. smftools/informatics/bam_functions.py +397 -175
  35. smftools/informatics/basecalling.py +51 -9
  36. smftools/informatics/bed_functions.py +90 -57
  37. smftools/informatics/binarize_converted_base_identities.py +18 -7
  38. smftools/informatics/complement_base_list.py +7 -6
  39. smftools/informatics/converted_BAM_to_adata.py +265 -122
  40. smftools/informatics/fasta_functions.py +161 -83
  41. smftools/informatics/h5ad_functions.py +196 -30
  42. smftools/informatics/modkit_extract_to_adata.py +609 -270
  43. smftools/informatics/modkit_functions.py +85 -44
  44. smftools/informatics/ohe.py +44 -21
  45. smftools/informatics/pod5_functions.py +112 -73
  46. smftools/informatics/run_multiqc.py +20 -14
  47. smftools/logging_utils.py +51 -0
  48. smftools/machine_learning/__init__.py +2 -7
  49. smftools/machine_learning/data/anndata_data_module.py +143 -50
  50. smftools/machine_learning/data/preprocessing.py +2 -1
  51. smftools/machine_learning/evaluation/__init__.py +1 -1
  52. smftools/machine_learning/evaluation/eval_utils.py +11 -14
  53. smftools/machine_learning/evaluation/evaluators.py +46 -33
  54. smftools/machine_learning/inference/__init__.py +1 -1
  55. smftools/machine_learning/inference/inference_utils.py +7 -4
  56. smftools/machine_learning/inference/lightning_inference.py +9 -13
  57. smftools/machine_learning/inference/sklearn_inference.py +6 -8
  58. smftools/machine_learning/inference/sliding_window_inference.py +35 -25
  59. smftools/machine_learning/models/__init__.py +10 -5
  60. smftools/machine_learning/models/base.py +28 -42
  61. smftools/machine_learning/models/cnn.py +15 -11
  62. smftools/machine_learning/models/lightning_base.py +71 -40
  63. smftools/machine_learning/models/mlp.py +13 -4
  64. smftools/machine_learning/models/positional.py +3 -2
  65. smftools/machine_learning/models/rnn.py +3 -2
  66. smftools/machine_learning/models/sklearn_models.py +39 -22
  67. smftools/machine_learning/models/transformer.py +68 -53
  68. smftools/machine_learning/models/wrappers.py +2 -1
  69. smftools/machine_learning/training/__init__.py +2 -2
  70. smftools/machine_learning/training/train_lightning_model.py +29 -20
  71. smftools/machine_learning/training/train_sklearn_model.py +9 -15
  72. smftools/machine_learning/utils/__init__.py +1 -1
  73. smftools/machine_learning/utils/device.py +7 -4
  74. smftools/machine_learning/utils/grl.py +3 -1
  75. smftools/metadata.py +443 -0
  76. smftools/plotting/__init__.py +19 -5
  77. smftools/plotting/autocorrelation_plotting.py +145 -44
  78. smftools/plotting/classifiers.py +162 -72
  79. smftools/plotting/general_plotting.py +422 -197
  80. smftools/plotting/hmm_plotting.py +42 -13
  81. smftools/plotting/position_stats.py +147 -87
  82. smftools/plotting/qc_plotting.py +20 -12
  83. smftools/preprocessing/__init__.py +10 -12
  84. smftools/preprocessing/append_base_context.py +115 -80
  85. smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
  86. smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
  87. smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
  88. smftools/preprocessing/binarize.py +21 -4
  89. smftools/preprocessing/binarize_on_Youden.py +129 -31
  90. smftools/preprocessing/binary_layers_to_ohe.py +17 -11
  91. smftools/preprocessing/calculate_complexity_II.py +86 -59
  92. smftools/preprocessing/calculate_consensus.py +28 -19
  93. smftools/preprocessing/calculate_coverage.py +50 -25
  94. smftools/preprocessing/calculate_pairwise_differences.py +2 -1
  95. smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
  96. smftools/preprocessing/calculate_position_Youden.py +118 -54
  97. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  98. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  99. smftools/preprocessing/clean_NaN.py +38 -28
  100. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  101. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
  102. smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
  103. smftools/preprocessing/flag_duplicate_reads.py +689 -272
  104. smftools/preprocessing/invert_adata.py +26 -11
  105. smftools/preprocessing/load_sample_sheet.py +40 -22
  106. smftools/preprocessing/make_dirs.py +8 -3
  107. smftools/preprocessing/min_non_diagonal.py +2 -1
  108. smftools/preprocessing/recipes.py +56 -23
  109. smftools/preprocessing/reindex_references_adata.py +103 -0
  110. smftools/preprocessing/subsample_adata.py +33 -16
  111. smftools/readwrite.py +331 -82
  112. smftools/schema/__init__.py +11 -0
  113. smftools/schema/anndata_schema_v1.yaml +227 -0
  114. smftools/tools/__init__.py +3 -4
  115. smftools/tools/archived/classifiers.py +163 -0
  116. smftools/tools/archived/subset_adata_v1.py +10 -1
  117. smftools/tools/archived/subset_adata_v2.py +12 -1
  118. smftools/tools/calculate_umap.py +54 -15
  119. smftools/tools/cluster_adata_on_methylation.py +115 -46
  120. smftools/tools/general_tools.py +70 -25
  121. smftools/tools/position_stats.py +229 -98
  122. smftools/tools/read_stats.py +50 -29
  123. smftools/tools/spatial_autocorrelation.py +365 -192
  124. smftools/tools/subset_adata.py +23 -21
  125. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
  126. smftools-0.2.5.dist-info/RECORD +181 -0
  127. smftools-0.2.3.dist-info/RECORD +0 -173
  128. /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
  129. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  130. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  131. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  132. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
  133. /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
  134. /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
  135. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
  136. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
  137. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,11 +1,19 @@
1
1
  import shutil
2
2
  from pathlib import Path
3
- from typing import Union, Iterable
3
+ from typing import Iterable, Union
4
+
5
+ from smftools.logging_utils import get_logger
6
+
7
+ from .helpers import AdataPaths
8
+
9
+ logger = get_logger(__name__)
10
+
4
11
 
5
12
  def check_executable_exists(cmd: str) -> bool:
6
13
  """Return True if a command-line executable is available in PATH."""
7
14
  return shutil.which(cmd) is not None
8
15
 
16
+
9
17
  def delete_tsvs(
10
18
  tsv_dir: Union[str, Path, Iterable[str], None],
11
19
  *,
@@ -25,158 +33,130 @@ def delete_tsvs(
25
33
  verbose : bool
26
34
  Print progress / warnings.
27
35
  """
36
+
28
37
  # Helper: remove a single file path (Path-like or string)
29
38
  def _maybe_unlink(p: Path):
30
39
  if not p.exists():
31
40
  if verbose:
32
- print(f"[skip] not found: {p}")
41
+ logger.info(f"[skip] not found: {p}")
33
42
  return
34
43
  if not p.is_file():
35
44
  if verbose:
36
- print(f"[skip] not a file: {p}")
45
+ logger.info(f"[skip] not a file: {p}")
37
46
  return
38
47
  if dry_run:
39
- print(f"[dry-run] would remove file: {p}")
48
+ logger.info(f"[dry-run] would remove file: {p}")
40
49
  return
41
50
  try:
42
51
  p.unlink()
43
52
  if verbose:
44
- print(f"Removed file: {p}")
53
+ logger.info(f"Removed file: {p}")
45
54
  except Exception as e:
46
- print(f"[error] failed to remove file {p}: {e}")
55
+ logger.warning(f"Failed to remove file {p}: {e}")
47
56
 
48
57
  # Remove tsv_dir recursively (if provided)
49
58
  if tsv_dir is not None:
50
59
  td = Path(tsv_dir)
51
60
  if not td.exists():
52
61
  if verbose:
53
- print(f"[skip] tsv_dir not found: {td}")
62
+ logger.info(f"[skip] tsv_dir not found: {td}")
54
63
  else:
55
64
  if not td.is_dir():
56
65
  if verbose:
57
- print(f"[skip] tsv_dir is not a directory: {td}")
66
+ logger.info(f"[skip] tsv_dir is not a directory: {td}")
58
67
  else:
59
68
  if dry_run:
60
- print(f"[dry-run] would remove directory tree: {td}")
69
+ logger.info(f"[dry-run] would remove directory tree: {td}")
61
70
  else:
62
71
  try:
63
72
  shutil.rmtree(td)
64
73
  if verbose:
65
- print(f"Removed directory tree: {td}")
74
+ logger.info(f"Removed directory tree: {td}")
66
75
  except Exception as e:
67
- print(f"[error] failed to remove tmp dir {td}: {e}")
68
-
69
- def load_adata(config_path):
70
- """
71
- High-level function to call for converting raw sequencing data to an adata object.
72
- Command line accesses this through smftools load <config_path>
73
- Works for nanopore pod5, fast5, and unaligned modBAM data types for direct SMF workflows.
74
- Works for nanopore pod5, fast5, unaligned BAM for conversion SMF workflows.
75
- Also works for illumina fastq and unaligned BAM for conversion SMF workflows.
76
+ logger.warning(f"[error] failed to remove tmp dir {td}: {e}")
76
77
 
77
- Parameters:
78
- config_path (str): A string representing the file path to the experiment configuration csv file.
79
78
 
80
- Returns:
81
- adata, adata_path, se_bam_files, cfg
79
+ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
82
80
  """
83
- from ..readwrite import make_dirs, safe_write_h5ad, add_or_update_column_in_csv
84
- from ..config import LoadExperimentConfig, ExperimentConfig
85
- from ..informatics.bam_functions import concatenate_fastqs_to_bam
86
- from ..informatics.pod5_functions import fast5_to_pod5
87
- from ..informatics.fasta_functions import subsample_fasta_from_bed
81
+ Core load pipeline.
82
+
83
+ Assumes:
84
+ - cfg is a fully initialized ExperimentConfig
85
+ - paths is an AdataPaths object describing canonical h5ad stage paths
86
+ - No stage-skipping or early returns based on existing AnnDatas are done here
87
+ (that happens in the wrapper).
88
+
89
+ Does:
90
+ - handle input format (fast5/pod5/fastq/bam/h5ad)
91
+ - basecalling / alignment / demux / BAM QC
92
+ - optional bed + bigwig generation
93
+ - AnnData construction (conversion or direct modality)
94
+ - basic read-level QC annotations
95
+ - write raw AnnData to paths.raw
96
+ - run MultiQC
97
+ - optional deletion of intermediate BAMs
98
+
99
+ Returns
100
+ -------
101
+ raw_adata : anndata.AnnData
102
+ Newly created raw AnnData object.
103
+ raw_adata_path : Path
104
+ Path where the raw AnnData was written (paths.raw).
105
+ cfg : ExperimentConfig
106
+ (Same object, possibly with some fields updated, e.g. fasta path.)
107
+ """
108
+ from pathlib import Path
88
109
 
89
110
  import numpy as np
90
- import pandas as pd
91
- import anndata as ad
92
- import scanpy as sc
93
-
94
- import os
95
- from importlib import resources
96
- from pathlib import Path
97
111
 
98
- from datetime import datetime
99
- date_str = datetime.today().strftime("%y%m%d")
112
+ from ..informatics.bam_functions import (
113
+ align_and_sort_BAM,
114
+ bam_qc,
115
+ concatenate_fastqs_to_bam,
116
+ demux_and_index_BAM,
117
+ extract_read_features_from_bam,
118
+ split_and_index_BAM,
119
+ )
120
+ from ..informatics.basecalling import canoncall, modcall
121
+ from ..informatics.bed_functions import aligned_BAM_to_bed
122
+ from ..informatics.converted_BAM_to_adata import converted_BAM_to_adata
123
+ from ..informatics.fasta_functions import (
124
+ generate_converted_FASTA,
125
+ get_chromosome_lengths,
126
+ subsample_fasta_from_bed,
127
+ )
128
+ from ..informatics.h5ad_functions import add_read_length_and_mapping_qc
129
+ from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
130
+ from ..informatics.modkit_functions import extract_mods, make_modbed, modQC
131
+ from ..informatics.pod5_functions import fast5_to_pod5
132
+ from ..informatics.run_multiqc import run_multiqc
133
+ from ..metadata import record_smftools_metadata
134
+ from ..readwrite import add_or_update_column_in_csv, make_dirs
135
+ from .helpers import write_gz_h5ad
100
136
 
101
137
  ################################### 1) General params and input organization ###################################
138
+ output_directory = Path(cfg.output_directory)
139
+ make_dirs([output_directory])
102
140
 
103
- # Load experiment config parameters into global variables
104
- loader = LoadExperimentConfig(config_path)
105
- defaults_dir = resources.files("smftools").joinpath("config")
106
- cfg, report = ExperimentConfig.from_var_dict(loader.var_dict, date_str=date_str, defaults_dir=defaults_dir)
107
-
108
- # Make initial output directory
109
- make_dirs([cfg.output_directory])
110
-
111
- # Make a csv that contains experiment summary file paths
112
- add_or_update_column_in_csv(cfg.summary_file, "experiment_name", cfg.experiment_name)
113
- add_or_update_column_in_csv(cfg.summary_file, "config_path", config_path)
114
- add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
115
- add_or_update_column_in_csv(cfg.summary_file, "input_files", [cfg.input_files])
116
-
117
- # Initial h5ad file naming
118
- h5_dir = cfg.output_directory / 'h5ads'
119
- raw_adata_path = h5_dir / f'{cfg.experiment_name}.h5ad.gz'
120
-
121
- # Preprocessed adata path info
122
- pp_adata_basename = raw_adata_path.name.split(".")[0] + '_preprocessed.h5ad.gz'
123
- pp_adata_path = raw_adata_path.parent / pp_adata_basename
124
-
125
- # Preprocessed duplicate removed adata path info
126
- if cfg.smf_modality == 'direct':
127
- # For direct SMF, link the duplicate removed version just to the preprocessed version, since there is not a duplicate removal step for direct workflow
128
- pp_dup_rem_adata_path = pp_adata_path
129
- else:
130
- pp_dup_rem_adata_basename = pp_adata_path.name.split(".")[0] + '_duplicates_removed.h5ad.gz'
131
- pp_dup_rem_adata_path = pp_adata_path.parent / pp_dup_rem_adata_basename
132
-
133
- # Preprocessed duplicate removed adata with basic analyses appended path info
134
- spatial_adata_basename = pp_dup_rem_adata_path.name.split(".")[0] + '_spatial.h5ad.gz'
135
- spatial_adata_path = pp_dup_rem_adata_path.parent / spatial_adata_basename
136
-
137
- # hmm adata
138
- hmm_adata_basename = spatial_adata_path.name.split(".")[0] + '_hmm.h5ad.gz'
139
- hmm_adata_path = spatial_adata_path.parent / hmm_adata_basename
140
-
141
- add_or_update_column_in_csv(cfg.summary_file, "load_adata", raw_adata_path)
142
- add_or_update_column_in_csv(cfg.summary_file, "pp_adata", pp_adata_path)
143
- add_or_update_column_in_csv(cfg.summary_file, "pp_dedup_adata", pp_dup_rem_adata_path)
144
- add_or_update_column_in_csv(cfg.summary_file, "spatial_adata", spatial_adata_path)
145
- add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", hmm_adata_path)
146
-
147
- if cfg.force_redo_load_adata:
148
- pass
149
- elif hmm_adata_path.exists():
150
- print(f"HMM AnnData already exists: {hmm_adata_path}\n Skipping smftools load")
151
- return None, hmm_adata_path, cfg
152
- elif spatial_adata_path.exists():
153
- print(f"Spatial AnnData already exists: {spatial_adata_path}\n Skipping smftools load")
154
- return None, spatial_adata_path, cfg
155
- elif pp_dup_rem_adata_path.exists():
156
- print(f"Preprocessed deduplicated AnnData already exists: {pp_dup_rem_adata_path}\n Skipping smftools load")
157
- return None, pp_dup_rem_adata_path, cfg
158
- elif pp_adata_path.exists():
159
- print(f"Preprocessed Anndata already exists: {pp_adata_path}\n Skipping smftools load")
160
- return None, pp_adata_path, cfg
161
- elif raw_adata_path.exists():
162
- print(f"Anndata from smftools load already exists: {raw_adata_path}\n Skipping smftools load")
163
- return None, raw_adata_path, cfg
164
- else:
165
- pass
141
+ raw_adata_path = paths.raw
142
+ pp_adata_path = paths.pp
143
+ pp_dup_rem_adata_path = paths.pp_dedup
144
+ spatial_adata_path = paths.spatial
145
+ hmm_adata_path = paths.hmm
166
146
 
167
147
  # Naming of the demultiplexed output directory
168
148
  double_barcoded_path = cfg.split_path / "both_ends_barcoded"
169
149
  single_barcoded_path = cfg.split_path / "at_least_one_end_barcoded"
170
150
 
171
151
  # Direct methylation detection SMF specific parameters
172
- if cfg.smf_modality == 'direct':
152
+ if cfg.smf_modality == "direct":
173
153
  mod_bed_dir = cfg.output_directory / "mod_beds"
174
154
  add_or_update_column_in_csv(cfg.summary_file, "mod_bed_dir", mod_bed_dir)
175
155
  mod_tsv_dir = cfg.output_directory / "mod_tsvs"
176
156
  add_or_update_column_in_csv(cfg.summary_file, "mod_tsv_dir", mod_tsv_dir)
177
157
  bam_qc_dir = cfg.output_directory / "bam_qc"
178
- mod_map = {'6mA': '6mA', '5mC_5hmC': '5mC'}
179
- mods = [mod_map[mod] for mod in cfg.mod_list]
158
+ mods = [cfg.mod_map[mod] for mod in cfg.mod_list]
159
+
180
160
  if not check_executable_exists("dorado"):
181
161
  raise RuntimeError(
182
162
  "Error: 'dorado' is not installed or not in PATH. "
@@ -188,9 +168,12 @@ def load_adata(config_path):
188
168
  "Install from https://github.com/nanoporetech/modkit"
189
169
  )
190
170
  else:
191
- pass
192
-
193
- if not cfg.input_already_demuxed or cfg.aligner == "dorado":
171
+ mod_bed_dir = None
172
+ mod_tsv_dir = None
173
+ mods = None
174
+
175
+ # demux / aligner executables
176
+ if (not cfg.input_already_demuxed) or cfg.aligner == "dorado":
194
177
  if not check_executable_exists("dorado"):
195
178
  raise RuntimeError(
196
179
  "Error: 'dorado' is not installed or not in PATH. "
@@ -200,42 +183,45 @@ def load_adata(config_path):
200
183
  if cfg.aligner == "minimap2":
201
184
  if not check_executable_exists("minimap2"):
202
185
  raise RuntimeError(
203
- "Error: 'minimap2' is not installed or not in PATH. "
204
- "Install minimap2"
186
+ "Error: 'minimap2' is not installed or not in PATH. Install minimap2"
205
187
  )
206
188
 
207
189
  # # Detect the input filetypes
208
190
  # If the input files are fast5 files, convert the files to a pod5 file before proceeding.
209
191
  if cfg.input_type == "fast5":
210
192
  # take the input directory of fast5 files and write out a single pod5 file into the output directory.
211
- output_pod5 = cfg.output_directory / 'FAST5s_to_POD5.pod5'
193
+ output_pod5 = cfg.output_directory / "FAST5s_to_POD5.pod5"
212
194
  if output_pod5.exists():
213
195
  pass
214
196
  else:
215
- print(f'Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}')
197
+ logger.info(
198
+ f"Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}"
199
+ )
216
200
  fast5_to_pod5(cfg.input_data_path, output_pod5)
217
201
  # Point cfg.input_data_path at the newly written pod5 file.
218
202
  cfg.input_data_path = output_pod5
219
- cfg.input_type == "pod5"
203
+ cfg.input_type = "pod5"
220
204
  # If the input is a fastq or a directory of fastqs, concatenate them into an unaligned BAM and save the barcode
221
205
  elif cfg.input_type == "fastq":
222
206
  # Output file for FASTQ concatenation.
223
- output_bam = cfg.output_directory / 'canonical_basecalls.bam'
207
+ output_bam = cfg.output_directory / "canonical_basecalls.bam"
224
208
  if output_bam.exists():
225
- pass
209
+ logger.debug("Output BAM already exists")
226
210
  else:
211
+ logger.info("Concatenating FASTQ files into a single BAM file")
227
212
  summary = concatenate_fastqs_to_bam(
228
213
  cfg.input_files,
229
214
  output_bam,
230
- barcode_tag='BC',
231
- gzip_suffixes=('.gz','.gzip'),
215
+ barcode_tag="BC",
216
+ gzip_suffixes=(".gz", ".gzip"),
232
217
  barcode_map=cfg.fastq_barcode_map,
233
218
  add_read_group=True,
234
219
  rg_sample_field=None,
235
220
  progress=False,
236
- auto_pair=cfg.fastq_auto_pairing)
237
-
238
- print(f"Found the following barcodes: {summary['barcodes']}")
221
+ auto_pair=cfg.fastq_auto_pairing,
222
+ )
223
+
224
+ logger.info(f"Found the following barcodes in FASTQ inputs: {summary['barcodes']}")
239
225
 
240
226
  # Set the input data path to the concatenated BAM.
241
227
  cfg.input_data_path = output_bam
@@ -244,24 +230,24 @@ def load_adata(config_path):
244
230
  pass
245
231
  else:
246
232
  pass
247
-
233
+
248
234
  add_or_update_column_in_csv(cfg.summary_file, "input_data_path", cfg.input_data_path)
249
235
 
250
236
  # Determine if the input data needs to be basecalled
251
237
  if cfg.input_type == "pod5":
252
- print(f'Detected pod5 inputs: {cfg.input_files}')
238
+ logger.info(f"Detected pod5 inputs: {cfg.input_files}")
253
239
  basecall = True
254
240
  elif cfg.input_type in ["bam"]:
255
- print(f'Detected bam input: {cfg.input_files}')
241
+ logger.info(f"Detected bam input: {cfg.input_files}")
256
242
  basecall = False
257
243
  else:
258
- print('Error, can not find input bam or pod5')
244
+ logger.info("Error, can not find input bam or pod5")
259
245
 
260
246
  # Generate the base name of the unaligned bam without the .bam suffix
261
247
  if basecall:
262
248
  model_basename = Path(cfg.model).name
263
- model_basename = str(model_basename).replace('.', '_')
264
- if cfg.smf_modality == 'direct':
249
+ model_basename = str(model_basename).replace(".", "_")
250
+ if cfg.smf_modality == "direct":
265
251
  mod_string = "_".join(cfg.mod_list)
266
252
  bam = cfg.output_directory / f"{model_basename}_{mod_string}_calls"
267
253
  else:
@@ -272,7 +258,9 @@ def load_adata(config_path):
272
258
 
273
259
  # Generate path names for the unaligned, aligned, as well as the aligned/sorted bam.
274
260
  unaligned_output = bam.with_suffix(cfg.bam_suffix)
275
- aligned_BAM = cfg.output_directory / (bam.stem + "_aligned") # doing this allows specifying an input bam in a seperate directory as the aligned output bams
261
+ aligned_BAM = (
262
+ cfg.output_directory / (bam.stem + "_aligned")
263
+ ) # doing this allows specifying an input bam in a separate directory as the aligned output bams
276
264
  aligned_output = aligned_BAM.with_suffix(cfg.bam_suffix)
277
265
  aligned_sorted_BAM = aligned_BAM.with_name(aligned_BAM.stem + "_sorted")
278
266
  aligned_sorted_output = aligned_sorted_BAM.with_suffix(cfg.bam_suffix)
@@ -283,34 +271,40 @@ def load_adata(config_path):
283
271
  ########################################################################################################################
284
272
 
285
273
  ################################### 2) FASTA Handling ###################################
286
- from ..informatics.fasta_functions import generate_converted_FASTA, get_chromosome_lengths
287
274
 
288
275
  try:
289
276
  cfg.fasta = Path(cfg.fasta)
290
- except:
291
- print("Need to provide an input FASTA path to proceed with smftools load")
277
+ except Exception:
278
+ logger.warning("Need to provide an input FASTA path to proceed with smftools load")
292
279
 
293
280
  # If fasta_regions_of_interest bed is passed, subsample the input FASTA on regions of interest and use the subsampled FASTA.
294
- if cfg.fasta_regions_of_interest and '.bed' in cfg.fasta_regions_of_interest:
295
- fasta_basename = cfg.fasta.parent / cfg.fasta.stem
296
- bed_basename_minus_suffix = Path(cfg.fasta_regions_of_interest).stem
297
- output_FASTA = fasta_basename.with_name(fasta_basename.name + '_subsampled_by_' + bed_basename_minus_suffix + '.fasta')
298
- subsample_fasta_from_bed(cfg.fasta, cfg.fasta_regions_of_interest, cfg.output_directory, output_FASTA)
299
- fasta = cfg.output_directory / output_FASTA
281
+ if cfg.fasta_regions_of_interest and ".bed" in cfg.fasta_regions_of_interest:
282
+ fasta_stem = cfg.fasta.stem
283
+ bed_stem = Path(cfg.fasta_regions_of_interest).stem
284
+ output_FASTA = cfg.output_directory / f"{fasta_stem}_subsampled_by_{bed_stem}.fasta"
285
+
286
+ logger.info("Subsampling FASTA records using the provided BED file")
287
+ subsample_fasta_from_bed(
288
+ cfg.fasta, cfg.fasta_regions_of_interest, cfg.output_directory, output_FASTA
289
+ )
290
+ fasta = output_FASTA
300
291
  else:
292
+ logger.info("Using the full FASTA file")
301
293
  fasta = cfg.fasta
302
294
 
303
295
  # For conversion style SMF, make a converted reference FASTA
304
- if cfg.smf_modality == 'conversion':
305
- fasta_basename = fasta.parent / fasta.stem
306
- converted_FASTA_basename = fasta_basename.with_name(fasta_basename.name + '_converted.fasta')
296
+ if cfg.smf_modality == "conversion":
297
+ fasta_stem = fasta.stem
298
+ converted_FASTA_basename = f"{fasta_stem}_converted.fasta"
307
299
  converted_FASTA = cfg.output_directory / converted_FASTA_basename
308
- if 'converted.fa' in fasta.name:
309
- print(f'{fasta} is already converted. Using existing converted FASTA.')
300
+
301
+ if "converted.fa" in fasta.name:
302
+ logger.info(f"{fasta} is already converted. Using existing converted FASTA.")
310
303
  converted_FASTA = fasta
311
304
  elif converted_FASTA.exists():
312
- print(f'{converted_FASTA} already exists. Using existing converted FASTA.')
305
+ logger.info(f"{converted_FASTA} already exists. Using existing converted FASTA.")
313
306
  else:
307
+ logger.info(f"Converting FASTA base sequences")
314
308
  generate_converted_FASTA(fasta, cfg.conversion_types, cfg.strands, converted_FASTA)
315
309
  fasta = converted_FASTA
316
310
 
@@ -321,121 +315,164 @@ def load_adata(config_path):
321
315
  ########################################################################################################################
322
316
 
323
317
  ################################### 3) Basecalling ###################################
324
- from ..informatics.basecalling import modcall, canoncall
318
+
325
319
  # 1) Basecall using dorado
326
- if basecall and cfg.sequencer == 'ont':
320
+ if basecall and cfg.sequencer == "ont":
327
321
  try:
328
322
  cfg.model_dir = Path(cfg.model_dir)
329
- except:
330
- print("Need to provide a valid path to a dorado model directory to use dorado basecalling")
323
+ except Exception:
324
+ logger.warning(
325
+ "Need to provide a valid path to a dorado model directory to use dorado basecalling"
326
+ )
331
327
  if aligned_sorted_output.exists():
332
- print(f'{aligned_sorted_output} already exists. Using existing basecalled, aligned, sorted BAM.')
328
+ logger.info(
329
+ f"{aligned_sorted_output} already exists. Using existing basecalled, aligned, sorted BAM."
330
+ )
333
331
  elif unaligned_output.exists():
334
- print(f'{unaligned_output} already exists. Using existing basecalled BAM.')
335
- elif cfg.smf_modality != 'direct':
336
- canoncall(str(cfg.model_dir), cfg.model, str(cfg.input_data_path), cfg.barcode_kit, str(bam), cfg.bam_suffix, cfg.barcode_both_ends, cfg.trim, cfg.device)
332
+ logger.info(f"{unaligned_output} already exists. Using existing basecalled BAM.")
333
+ elif cfg.smf_modality != "direct":
334
+ logger.info("Running canonical basecalling using dorado")
335
+ canoncall(
336
+ str(cfg.model_dir),
337
+ cfg.model,
338
+ str(cfg.input_data_path),
339
+ cfg.barcode_kit,
340
+ str(bam),
341
+ cfg.bam_suffix,
342
+ cfg.barcode_both_ends,
343
+ cfg.trim,
344
+ cfg.device,
345
+ )
337
346
  else:
338
- modcall(str(cfg.model_dir), cfg.model, str(cfg.input_data_path), cfg.barcode_kit, cfg.mod_list, str(bam), cfg.bam_suffix, cfg.barcode_both_ends, cfg.trim, cfg.device)
347
+ logger.info("Running modified basecalling using dorado")
348
+ modcall(
349
+ str(cfg.model_dir),
350
+ cfg.model,
351
+ str(cfg.input_data_path),
352
+ cfg.barcode_kit,
353
+ cfg.mod_list,
354
+ str(bam),
355
+ cfg.bam_suffix,
356
+ cfg.barcode_both_ends,
357
+ cfg.trim,
358
+ cfg.device,
359
+ )
339
360
  elif basecall:
340
- print(f"Basecalling is currently only supported for ont sequencers and not pacbio.")
361
+ logger.error("Basecalling is currently only supported for ont sequencers and not pacbio.")
341
362
  else:
342
363
  pass
343
364
  ########################################################################################################################
344
365
 
345
366
  ################################### 4) Alignment and sorting #############################################
346
- from ..informatics.bam_functions import align_and_sort_BAM
347
- from ..informatics.bed_functions import aligned_BAM_to_bed
367
+
348
368
  # 3) Align the BAM to the reference FASTA and sort the bam on positional coordinates. Also make an index and a bed file of mapped reads
349
369
  if aligned_sorted_output.exists():
350
- print(f'{aligned_sorted_output} already exists. Using existing aligned/sorted BAM.')
370
+ logger.debug(f"{aligned_sorted_output} already exists. Using existing aligned/sorted BAM.")
351
371
  else:
352
- align_and_sort_BAM(fasta, unaligned_output, cfg.bam_suffix, cfg.output_directory, cfg.make_bigwigs, cfg.threads, cfg.aligner, cfg.aligner_args)
372
+ logger.info(f"Aligning and sorting reads")
373
+ align_and_sort_BAM(fasta, unaligned_output, cfg)
353
374
  # Deleted the unsorted aligned output
354
375
  aligned_output.unlink()
355
376
 
356
377
  if cfg.make_beds:
357
378
  # Make beds and provide basic histograms
358
- bed_dir = cfg.output_directory / 'beds'
379
+ bed_dir = cfg.output_directory / "beds"
359
380
  if bed_dir.is_dir():
360
- print(f'{bed_dir} already exists. Skipping BAM -> BED conversion for {aligned_sorted_output}')
381
+ logger.debug(
382
+ f"{bed_dir} already exists. Skipping BAM -> BED conversion for {aligned_sorted_output}"
383
+ )
361
384
  else:
362
- aligned_BAM_to_bed(aligned_sorted_output, cfg.output_directory, fasta, cfg.make_bigwigs, cfg.threads)
385
+ logger.info("Making bed files from the aligned and sorted BAM file")
386
+ aligned_BAM_to_bed(
387
+ aligned_sorted_output, cfg.output_directory, fasta, cfg.make_bigwigs, cfg.threads
388
+ )
363
389
  ########################################################################################################################
364
390
 
365
391
  ################################### 5) Demultiplexing ######################################################################
366
- from ..informatics.bam_functions import demux_and_index_BAM, split_and_index_BAM
392
+
367
393
  # 3) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory
368
394
  if cfg.input_already_demuxed:
369
395
  if cfg.split_path.is_dir():
370
- print(f"{cfg.split_path} already exists. Using existing demultiplexed BAMs.")
396
+ logger.debug(f"{cfg.split_path} already exists. Using existing demultiplexed BAMs.")
371
397
 
372
398
  all_bam_files = sorted(
373
- p for p in cfg.split_path.iterdir()
374
- if p.is_file()
375
- and p.suffix == cfg.bam_suffix
399
+ p for p in cfg.split_path.iterdir() if p.is_file() and p.suffix == cfg.bam_suffix
376
400
  )
377
401
  unclassified_bams = [p for p in all_bam_files if "unclassified" in p.name]
378
402
  bam_files = [p for p in all_bam_files if "unclassified" not in p.name]
379
403
 
380
404
  else:
381
405
  make_dirs([cfg.split_path])
382
- all_bam_files = split_and_index_BAM(aligned_sorted_BAM,
383
- cfg.split_path,
384
- cfg.bam_suffix)
385
-
406
+ logger.info("Demultiplexing samples into individual aligned/sorted BAM files")
407
+ all_bam_files = split_and_index_BAM(aligned_sorted_BAM, cfg.split_path, cfg.bam_suffix)
408
+
386
409
  unclassified_bams = [p for p in all_bam_files if "unclassified" in p.name]
387
410
  bam_files = sorted(p for p in all_bam_files if "unclassified" not in p.name)
388
411
 
389
412
  se_bam_files = bam_files
390
413
  bam_dir = cfg.split_path
391
-
414
+
392
415
  else:
393
416
  if single_barcoded_path.is_dir():
394
- print(f"{single_barcoded_path} already exists. Using existing single ended demultiplexed BAMs.")
417
+ logger.debug(
418
+ f"{single_barcoded_path} already exists. Using existing single ended demultiplexed BAMs."
419
+ )
395
420
 
396
421
  all_se_bam_files = sorted(
397
- p for p in single_barcoded_path.iterdir()
398
- if p.is_file()
399
- and p.suffix == cfg.bam_suffix
400
- )
422
+ p
423
+ for p in single_barcoded_path.iterdir()
424
+ if p.is_file() and p.suffix == cfg.bam_suffix
425
+ )
401
426
  unclassified_se_bams = [p for p in all_se_bam_files if "unclassified" in p.name]
402
427
  se_bam_files = [p for p in all_se_bam_files if "unclassified" not in p.name]
403
428
  else:
404
- make_dirs([cfg.split_path, single_barcoded_path])
405
- all_se_bam_files = demux_and_index_BAM(aligned_sorted_BAM,
406
- single_barcoded_path,
407
- cfg.bam_suffix,
408
- cfg.barcode_kit,
409
- False,
410
- cfg.trim,
411
- cfg.threads)
412
-
429
+ make_dirs([cfg.split_path, single_barcoded_path])
430
+ logger.info(
431
+ "Demultiplexing samples into individual aligned/sorted BAM files based on single end barcode status with Dorado"
432
+ )
433
+ all_se_bam_files = demux_and_index_BAM(
434
+ aligned_sorted_BAM,
435
+ single_barcoded_path,
436
+ cfg.bam_suffix,
437
+ cfg.barcode_kit,
438
+ False,
439
+ cfg.trim,
440
+ cfg.threads,
441
+ )
442
+
413
443
  unclassified_se_bams = [p for p in all_se_bam_files if "unclassified" in p.name]
414
444
  se_bam_files = [p for p in all_se_bam_files if "unclassified" not in p.name]
415
-
445
+
416
446
  if double_barcoded_path.is_dir():
417
- print(f"{double_barcoded_path} already exists. Using existing double ended demultiplexed BAMs.")
447
+ logger.debug(
448
+ f"{double_barcoded_path} already exists. Using existing double ended demultiplexed BAMs."
449
+ )
418
450
 
419
451
  all_de_bam_files = sorted(
420
- p for p in double_barcoded_path.iterdir()
421
- if p.is_file()
422
- and p.suffix == cfg.bam_suffix
423
- )
452
+ p
453
+ for p in double_barcoded_path.iterdir()
454
+ if p.is_file() and p.suffix == cfg.bam_suffix
455
+ )
424
456
  unclassified_de_bams = [p for p in all_de_bam_files if "unclassified" in p.name]
425
457
  de_bam_files = [p for p in all_de_bam_files if "unclassified" not in p.name]
426
- else:
427
- make_dirs([cfg.split_path, double_barcoded_path])
428
- all_de_bam_files = demux_and_index_BAM(aligned_sorted_BAM,
429
- double_barcoded_path,
430
- cfg.bam_suffix,
431
- cfg.barcode_kit,
432
- True,
433
- cfg.trim,
434
- cfg.threads)
435
-
458
+ else:
459
+ make_dirs([cfg.split_path, double_barcoded_path])
460
+ logger.info(
461
+ "Demultiplexing samples into individual aligned/sorted BAM files based on double end barcode status with Dorado"
462
+ )
463
+ all_de_bam_files = demux_and_index_BAM(
464
+ aligned_sorted_BAM,
465
+ double_barcoded_path,
466
+ cfg.bam_suffix,
467
+ cfg.barcode_kit,
468
+ True,
469
+ cfg.trim,
470
+ cfg.threads,
471
+ )
472
+
436
473
  unclassified_de_bams = [p for p in all_de_bam_files if "unclassified" in p.name]
437
474
  de_bam_files = [p for p in all_de_bam_files if "unclassified" not in p.name]
438
-
475
+
439
476
  bam_files = se_bam_files + de_bam_files
440
477
  unclassified_bams = unclassified_se_bams + unclassified_de_bams
441
478
  bam_dir = single_barcoded_path
@@ -444,134 +481,277 @@ def load_adata(config_path):
444
481
 
445
482
  if cfg.make_beds:
446
483
  # Make beds and provide basic histograms
447
- bed_dir = cfg.split_path / 'beds'
484
+ bed_dir = cfg.split_path / "beds"
448
485
  if bed_dir.is_dir():
449
- print(f'{bed_dir} already exists. Skipping BAM -> BED conversion for demultiplexed bams')
486
+ logger.debug(
487
+ f"{bed_dir} already exists. Skipping BAM -> BED conversion for demultiplexed bams"
488
+ )
450
489
  else:
490
+ logger.info("Making BED files from BAM files for each sample")
451
491
  for bam in bam_files:
452
492
  aligned_BAM_to_bed(bam, cfg.split_path, fasta, cfg.make_bigwigs, cfg.threads)
453
493
  ########################################################################################################################
454
494
 
455
495
  ################################### 6) SAMTools based BAM QC ######################################################################
456
- from ..informatics.bam_functions import bam_qc
496
+
457
497
  # 5) Samtools QC metrics on split BAM files
458
498
  bam_qc_dir = cfg.split_path / "bam_qc"
459
499
  if bam_qc_dir.is_dir():
460
- print( f'{bam_qc_dir} already exists. Using existing BAM QC calculations.')
500
+ logger.debug(f"{bam_qc_dir} already exists. Using existing BAM QC calculations.")
461
501
  else:
462
502
  make_dirs([bam_qc_dir])
503
+ logger.info("Performing BAM QC")
463
504
  bam_qc(bam_files, bam_qc_dir, cfg.threads, modality=cfg.smf_modality)
464
- ########################################################################################################################
505
+ ########################################################################################################################
465
506
 
466
507
  ################################### 7) AnnData loading ######################################################################
467
- if cfg.smf_modality != 'direct':
508
+ if cfg.smf_modality != "direct":
468
509
  from ..informatics.converted_BAM_to_adata import converted_BAM_to_adata
510
+
469
511
  # 6) Take the converted BAM and load it into an adata object.
470
- if cfg.smf_modality == 'deaminase':
512
+ if cfg.smf_modality == "deaminase":
471
513
  deaminase_footprinting = True
472
514
  else:
473
515
  deaminase_footprinting = False
474
- raw_adata, raw_adata_path = converted_BAM_to_adata(fasta,
475
- bam_dir,
476
- cfg.output_directory,
477
- cfg.input_already_demuxed,
478
- cfg.mapping_threshold,
479
- cfg.experiment_name,
480
- cfg.conversion_types,
481
- cfg.bam_suffix,
482
- cfg.device,
483
- cfg.threads,
484
- deaminase_footprinting,
485
- delete_intermediates=cfg.delete_intermediate_hdfs,
486
- double_barcoded_path=double_barcoded_path)
516
+
517
+ logger.info(f"Loading Anndata from BAM files for {cfg.smf_modality} footprinting")
518
+ raw_adata, raw_adata_path = converted_BAM_to_adata(
519
+ fasta,
520
+ bam_dir,
521
+ cfg.output_directory,
522
+ cfg.input_already_demuxed,
523
+ cfg.mapping_threshold,
524
+ cfg.experiment_name,
525
+ cfg.conversion_types,
526
+ cfg.bam_suffix,
527
+ cfg.device,
528
+ cfg.threads,
529
+ deaminase_footprinting,
530
+ delete_intermediates=cfg.delete_intermediate_hdfs,
531
+ double_barcoded_path=double_barcoded_path,
532
+ )
487
533
  else:
488
534
  if mod_bed_dir.is_dir():
489
- print(f'{mod_bed_dir} already exists, skipping making modbeds')
535
+ logger.debug(f"{mod_bed_dir} already exists, skipping making modbeds")
490
536
  else:
491
- from ..informatics.modkit_functions import modQC, make_modbed
492
- make_dirs([mod_bed_dir])
493
-
494
- modQC(aligned_sorted_output,
495
- cfg.thresholds) # get QC metrics for mod calls
496
-
497
- make_modbed(aligned_sorted_output,
498
- cfg.thresholds,
499
- mod_bed_dir) # Generate bed files of position methylation summaries for every sample
500
-
537
+ from ..informatics.modkit_functions import make_modbed, modQC
538
+
539
+ make_dirs([mod_bed_dir])
540
+
541
+ logger.info("Performing modQC for direct footprinting samples")
542
+
543
+ modQC(aligned_sorted_output, cfg.thresholds) # get QC metrics for mod calls
544
+
545
+ logger.info("Making modified BED files for direct footprinting samples")
546
+
547
+ make_modbed(
548
+ aligned_sorted_output, cfg.thresholds, mod_bed_dir
549
+ ) # Generate bed files of position methylation summaries for every sample
550
+
501
551
  from ..informatics.modkit_functions import extract_mods
552
+
502
553
  make_dirs([mod_tsv_dir])
503
554
 
504
- extract_mods(cfg.thresholds,
505
- mod_tsv_dir,
506
- bam_dir,
507
- cfg.bam_suffix,
508
- skip_unclassified=cfg.skip_unclassified,
509
- modkit_summary=False,
510
- threads=cfg.threads) # Extract methylations calls for split BAM files into split TSV files
511
-
555
+ logger.info(
556
+ "Extracting single read modification states into TSVs for direct footprinting samples"
557
+ )
558
+
559
+ extract_mods(
560
+ cfg.thresholds,
561
+ mod_tsv_dir,
562
+ bam_dir,
563
+ cfg.bam_suffix,
564
+ skip_unclassified=cfg.skip_unclassified,
565
+ modkit_summary=False,
566
+ threads=cfg.threads,
567
+ ) # Extract methylations calls for split BAM files into split TSV files
568
+
512
569
  from ..informatics.modkit_extract_to_adata import modkit_extract_to_adata
513
- #6 Load the modification data from TSVs into an adata object
514
- raw_adata, raw_adata_path = modkit_extract_to_adata(fasta,
515
- bam_dir,
516
- cfg.output_directory,
517
- cfg.input_already_demuxed,
518
- cfg.mapping_threshold,
519
- cfg.experiment_name,
520
- mods,
521
- cfg.batch_size,
522
- mod_tsv_dir,
523
- cfg.delete_batch_hdfs,
524
- cfg.threads,
525
- double_barcoded_path)
570
+
571
+ logger.info("Making Anndata for direct modification detection SMF samples")
572
+
573
+ # 6 Load the modification data from TSVs into an adata object
574
+ raw_adata, raw_adata_path = modkit_extract_to_adata(
575
+ fasta,
576
+ bam_dir,
577
+ cfg.output_directory,
578
+ cfg.input_already_demuxed,
579
+ cfg.mapping_threshold,
580
+ cfg.experiment_name,
581
+ mods,
582
+ cfg.batch_size,
583
+ mod_tsv_dir,
584
+ cfg.delete_batch_hdfs,
585
+ cfg.threads,
586
+ double_barcoded_path,
587
+ )
526
588
  if cfg.delete_intermediate_tsvs:
527
589
  delete_tsvs(mod_tsv_dir)
528
590
 
529
- raw_adata.obs['Experiment_name'] = [cfg.experiment_name] * raw_adata.shape[0]
530
- raw_adata.obs['Experiment_name_and_barcode'] = (raw_adata.obs['Experiment_name'].astype(str) + "_" + raw_adata.obs['Barcode'].astype(str))
591
+ raw_adata.obs["Experiment_name"] = [cfg.experiment_name] * raw_adata.shape[0]
592
+ raw_adata.obs["Experiment_name_and_barcode"] = (
593
+ raw_adata.obs["Experiment_name"].astype(str) + "_" + raw_adata.obs["Barcode"].astype(str)
594
+ )
531
595
 
532
596
  ########################################################################################################################
533
597
 
534
598
  ############################################### Add basic read length, read quality, mapping quality stats ###############################################
535
- from ..informatics.h5ad_functions import add_read_length_and_mapping_qc
536
- from ..informatics.bam_functions import extract_read_features_from_bam
537
- add_read_length_and_mapping_qc(raw_adata, se_bam_files,
538
- extract_read_features_from_bam_callable=extract_read_features_from_bam,
539
- bypass=cfg.bypass_add_read_length_and_mapping_qc,
540
- force_redo=cfg.force_redo_add_read_length_and_mapping_qc)
541
599
 
542
- raw_adata.obs['Raw_modification_signal'] = np.nansum(raw_adata.X, axis=1)
600
+ logger.info("Adding read length, mapping quality, and modification signal to Anndata")
601
+ add_read_length_and_mapping_qc(
602
+ raw_adata,
603
+ se_bam_files,
604
+ extract_read_features_from_bam_callable=extract_read_features_from_bam,
605
+ bypass=cfg.bypass_add_read_length_and_mapping_qc,
606
+ force_redo=cfg.force_redo_add_read_length_and_mapping_qc,
607
+ )
608
+
609
+ raw_adata.obs["Raw_modification_signal"] = np.nansum(raw_adata.X, axis=1)
610
+ ########################################################################################################################
611
+
612
+ ############################################### if input data type was pod5, append the pod5 file origin to each read ###############################################
613
+ from ..informatics.h5ad_functions import annotate_pod5_origin
614
+
615
+ if cfg.input_type == "pod5":
616
+ logger.info("Adding the POD5 origin file to each read into Anndata")
617
+ annotate_pod5_origin(
618
+ raw_adata,
619
+ cfg.input_data_path,
620
+ n_jobs=cfg.threads,
621
+ csv_path=output_directory / "read_to_pod5_origin_mapping.csv",
622
+ )
543
623
  ########################################################################################################################
544
624
 
545
625
  ############################################### Save final adata ###############################################
546
- print(f"Saving AnnData to {raw_adata_path}")
547
- safe_write_h5ad(raw_adata, raw_adata_path, compression='gzip', backup=True)
626
+ logger.info(f"Saving AnnData to {raw_adata_path}")
627
+ record_smftools_metadata(
628
+ raw_adata,
629
+ step_name="load",
630
+ cfg=cfg,
631
+ config_path=config_path,
632
+ output_path=raw_adata_path,
633
+ )
634
+ write_gz_h5ad(raw_adata, raw_adata_path)
548
635
  ########################################################################################################################
549
636
 
550
637
  ############################################### MultiQC HTML Report ###############################################
551
- from ..informatics.run_multiqc import run_multiqc
638
+
552
639
  # multiqc ###
553
640
  mqc_dir = cfg.split_path / "multiqc"
554
641
  if mqc_dir.is_dir():
555
- print(f'{mqc_dir} already exists, skipping multiqc')
642
+ logger.debug(f"{mqc_dir} already exists, skipping multiqc")
556
643
  else:
644
+ logger.info("Running multiqc")
557
645
  run_multiqc(cfg.split_path, mqc_dir)
558
646
  ########################################################################################################################
559
647
 
560
648
  ############################################### delete intermediate BAM files ###############################################
561
649
  if cfg.delete_intermediate_bams:
650
+ logger.info("Deleting intermediate BAM files")
562
651
  # delete aligned and sorted bam
563
652
  aligned_sorted_output.unlink()
564
- bai = aligned_sorted_output.parent / (aligned_sorted_output.name + '.bai')
653
+ bai = aligned_sorted_output.parent / (aligned_sorted_output.name + ".bai")
565
654
  bai.unlink()
566
655
  # delete the demultiplexed bams. Keep the demultiplexing summary files and directories to faciliate demultiplexing in the future with these files
567
656
  for bam in bam_files:
568
- bai = bam.parent / (bam.name + '.bai')
657
+ bai = bam.parent / (bam.name + ".bai")
569
658
  bam.unlink()
570
659
  bai.unlink()
571
660
  for bam in unclassified_bams:
572
- bai = bam.parent / (bam.name + '.bai')
661
+ bai = bam.parent / (bam.name + ".bai")
573
662
  bam.unlink()
574
- bai.unlink()
663
+ bai.unlink()
664
+ logger.info("Finished deleting intermediate BAM files")
575
665
  ########################################################################################################################
576
666
 
577
- return raw_adata, raw_adata_path, cfg
667
+ return raw_adata, raw_adata_path, cfg
668
+
669
+
670
def load_adata(config_path: str):
    """
    Entry point used by the CLI to run (or skip) the load stage.

    The function builds an ExperimentConfig from ``config_path``, makes sure
    the output directory exists, records experiment metadata and the per-stage
    AnnData paths in the summary CSV, and then decides whether the load
    pipeline has to run at all. Unless ``cfg.force_redo_load_adata`` is truthy,
    the first existing AnnData found in the order
    hmm > spatial > pp_dedup > pp > raw short-circuits the call.

    Parameters
    ----------
    config_path : str
        Path to the experiment configuration handed to LoadExperimentConfig.

    Returns
    -------
    adata : anndata.AnnData | None
        The AnnData produced by ``load_adata_core``, or None when an existing
        AnnData was found and the load was skipped.
    adata_path : pathlib.Path
        Path of the AnnData that downstream steps should pick up.
    cfg : ExperimentConfig
        The configuration object to pass on to downstream steps.
    """
    from datetime import datetime
    from importlib import resources

    from ..config import ExperimentConfig, LoadExperimentConfig
    from ..readwrite import add_or_update_column_in_csv, make_dirs
    from .helpers import get_adata_paths

    # -----------------------------
    # 1) Build the experiment config
    # -----------------------------
    today_stamp = datetime.today().strftime("%y%m%d")
    config_loader = LoadExperimentConfig(config_path)
    packaged_defaults = resources.files("smftools").joinpath("config")
    # The second element of the returned pair is not used by this wrapper.
    cfg, _ = ExperimentConfig.from_var_dict(
        config_loader.var_dict, date_str=today_stamp, defaults_dir=packaged_defaults
    )

    # The base output directory must exist before anything is written to it.
    make_dirs([cfg.output_directory])

    # -----------------------------
    # 2) Compute and register paths
    # -----------------------------
    paths = get_adata_paths(cfg)

    def _register(column, value):
        # One summary-CSV column per call, always against cfg.summary_file.
        add_or_update_column_in_csv(cfg.summary_file, column, value)

    # Experiment-level metadata.
    _register("experiment_name", cfg.experiment_name)
    _register("config_path", config_path)
    _register("input_data_path", cfg.input_data_path)
    _register("input_files", [cfg.input_files])

    # Where each AnnData stage lives (or will live).
    _register("load_adata", paths.raw)
    _register("pp_adata", paths.pp)
    _register("pp_dedup_adata", paths.pp_dedup)
    _register("spatial_adata", paths.spatial)
    _register("hmm_adata", paths.hmm)

    # -----------------------------
    # 3) Stage skipping logic
    # -----------------------------
    force_redo = getattr(cfg, "force_redo_load_adata", False)
    if not force_redo:
        # Ordered from the latest stage to the earliest: the first AnnData that
        # already exists on disk is the one downstream steps should use.
        stage_candidates = (
            ("HMM AnnData", paths.hmm),
            ("Spatial AnnData", paths.spatial),
            ("Preprocessed deduplicated AnnData", paths.pp_dedup),
            ("Preprocessed AnnData", paths.pp),
            ("Raw AnnData from smftools load", paths.raw),
        )
        for stage_label, stage_path in stage_candidates:
            if stage_path.exists():
                logger.debug(f"{stage_label} already exists: {stage_path}\nSkipping smftools load")
                return None, stage_path, cfg

    # Nothing reusable was found (or a redo was forced): run the full pipeline.
    loaded_adata, loaded_path, cfg = load_adata_core(cfg, paths, config_path=config_path)

    return loaded_adata, loaded_path, cfg